Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion services/docreader/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ textract
antiword
openai
ollama
pdfplumber
pymupdf
pymupdf4llm

--extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
paddlepaddle>=3.0.0,<4.0.0
Expand Down
131 changes: 43 additions & 88 deletions services/docreader/src/parser/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,113 +1,68 @@
import logging
import os
import io
from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union

import pdfplumber
from typing import Any, Tuple, Dict, Union
import re
import pymupdf4llm
import tempfile
from .base_parser import BaseParser

from PIL import Image
logger = logging.getLogger(__name__)

class PDFParser(BaseParser):
    """PDF document parser.

    Converts PDF documents to Markdown with pymupdf4llm and replaces the
    images referenced by that Markdown with uploaded URLs.
    Scanned PDFs are not supported.
    """

    # Markdown image link split into three groups: "![alt](", the target, ")".
    _IMAGE_LINK_PATTERN = re.compile(r'(!\[.*?\]\()([^)\s]+)(\))')

    def _convert_table_to_markdown(self, table_data: list) -> str:
        """Render a table as a Markdown table.

        Args:
            table_data: Rows of cells; the first row is used as the header.
                Cells may be ``None`` and are converted with ``str``.

        Returns:
            The Markdown table, one line per row and terminated by a newline.
            An empty string if ``table_data`` or its first row is empty, or if
            the conversion raises.
        """
        if not table_data or not table_data[0]:
            return ""

        def clean_cell(cell):
            if cell is None:
                return ""
            # A literal pipe would be read as a column separator and a raw
            # newline would end the row, so neutralise both.
            return str(cell).replace("|", "\\|").replace("\n", " <br> ")

        try:
            header = [clean_cell(cell) for cell in table_data[0]]
            lines = [
                "| " + " | ".join(header) + " |",
                "| " + " | ".join(["---"] * len(header)) + " |",
            ]
            for row in table_data[1:]:
                if not row:
                    continue
                body_row = [clean_cell(cell) for cell in row]
                # Rows whose width differs from the header cannot be rendered
                # as part of the same table; they are logged and dropped.
                if len(body_row) != len(header):
                    logger.warning(f"Skipping malformed table row: {body_row}")
                    continue
                lines.append("| " + " | ".join(body_row) + " |")
            return "\n".join(lines) + "\n"
        except Exception as e:
            # Best effort: a table that cannot be rendered is omitted.
            logger.error(f"Error converting table to markdown: {e}")
            return ""

    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """Convert PDF bytes to Markdown and upload the images it references.

        The bytes are written to a temporary ``.pdf`` file and converted with
        ``pymupdf4llm.to_markdown``, which writes extracted images into a
        temporary directory. Each local image link in the Markdown is then
        rewritten to the value returned by ``self.upload_file`` (not defined
        in this class; presumably inherited from ``BaseParser``).

        Args:
            content: Raw bytes of the PDF document.

        Returns:
            ``(markdown_text, images)`` on success, where ``images`` maps each
            uploaded image URL to that image converted to RGBA. An empty
            string if anything fails.
        """
        logger.info(f"Parsing PDF with pymupdf4llm, content size: {len(content)} bytes")

        images: Dict[str, Any] = {}

        def replace_img(match):
            """Swap one local image link for its uploaded URL."""
            img_path = match.group(2)
            # Links that already point at a remote resource are kept as-is.
            if img_path.startswith(('http://', 'https://')):
                return match.group(0)

            if not os.path.exists(img_path):
                logger.warning(f"警告:图片不存在,跳过: {img_path}")
                # Nothing to upload or open: leave the link untouched instead
                # of failing the whole document on one missing file.
                return match.group(0)

            image_url = self.upload_file(img_path)
            # convert() returns a fully loaded copy, so the source file can be
            # closed here, before the temporary directory is removed.
            with Image.open(img_path) as img:
                images[image_url] = img.convert("RGBA")
            return f"{match.group(1)}{image_url}{match.group(3)}"

        temp_pdf_path = None
        try:
            # delete=False keeps the file on disk after the handle is closed so
            # that it can be reopened by path; it is removed in ``finally``.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                temp_pdf_path = temp_pdf.name
                temp_pdf.write(content)
            logger.info(f"PDF content written to temporary file: {temp_pdf_path}")

            with tempfile.TemporaryDirectory() as temp_dir:
                md_text = pymupdf4llm.to_markdown(
                    doc=temp_pdf_path,
                    write_images=True,
                    table_strategy="lines_strict",
                    ignore_code=False,
                    image_path=temp_dir,
                    show_progress=True,
                )
                logger.info("Successfully extracted image for tempfile")
                # The substitution must run while temp_dir still exists,
                # because replace_img reads the extracted image files.
                text = self._IMAGE_LINK_PATTERN.sub(replace_img, md_text)
                logger.info("PDF parsing complete.")
                return text, images

        except Exception as e:
            # Top-level boundary for this parser: report and return "".
            logger.error(f"Failed to parse PDF document: {str(e)}")
            logger.error("Parsing PDF with pymupdf4llm failed")
            return ""
        finally:
            # Runs on success and on failure so the temporary PDF never leaks.
            if temp_pdf_path and os.path.exists(temp_pdf_path):
                try:
                    os.remove(temp_pdf_path)
                    logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
                except OSError as e:
                    logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")