-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Open
Description
def parse_pdf(pdf_path: str, extract_image: bool = False) -> List[dict]:
    """Parse a PDF into a list of per-page content dictionaries.

    Pages are walked with pdfminer's ``extract_pages``. Tables for a page are
    pulled through pdfplumber the first time a rectangle element is seen on
    that page, and are then consumed one per rectangle, in order.

    Args:
        pdf_path: Path of the PDF file to parse.
        extract_image: When true, encountering an image element raises
            ``ValueError`` because image extraction is not implemented.

    Returns:
        One dict per page with the keys ``page_num`` (the pdfminer page id)
        and ``content`` (the collected items after
        ``postprocess_page_content``). Before post-processing, each item is
        either ``{'table': ..., 'obj': element}`` or
        ``{'text': ..., 'obj': element}`` with an optional ``'font-size'``.

    Raises:
        ValueError: If ``extract_image`` is true and the page contains an
            ``LTImage`` element.
    """
    # Todo: header and footer handling, OCR for images.
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTImage, LTRect, LTTextContainer

    import pdfplumber

    doc = []
    # The context manager closes the pdfplumber handle even when parsing
    # raises (e.g. the unsupported-image ValueError below).
    with pdfplumber.open(pdf_path) as pdf:
        for i, page_layout in enumerate(extract_pages(pdf_path)):
            page = {'page_num': page_layout.pageid, 'content': []}

            # ``None`` means "not extracted yet". This keeps a page with
            # rectangles but no tables from re-running the extraction for
            # every single rectangle.
            tables = None
            table_num = 0

            for element in page_layout:
                if isinstance(element, LTRect):
                    if tables is None:
                        tables = extract_tables(pdf, i)
                    if table_num < len(tables):
                        table_string = table_converter(tables[table_num])
                        table_num += 1
                        if table_string:
                            page['content'].append({'table': table_string, 'obj': element})
                elif isinstance(element, LTTextContainer):
                    text = element.get_text()
                    # Todo: Further analysis using font
                    font = get_font(element)
                    # Whitespace-only containers carry no content; skip them.
                    if text.strip():
                        new_content_item = {'text': text, 'obj': element}
                        if font:
                            new_content_item['font-size'] = round(font[1])
                        page['content'].append(new_content_item)
                elif extract_image and isinstance(element, LTImage):
                    # Todo: ocr
                    raise ValueError('Currently, extracting images is not supported!')

            # Merge the raw per-element items into the final page content.
            page['content'] = postprocess_page_content(page['content'])
            doc.append(page)

    return doc
对于专利 PDF,pdfminer 的 layout 分析器(LTTextContainer)无法正确聚合该 PDF 中的文本字符,导致正文被拆成大量孤立、无语义的碎片(如单个字母、数字、空格),使得后续的合并、去重、结构识别全部失效。
Metadata
Metadata
Assignees
Labels
No labels