Tencent · xiaohuangpin · Oct 22, 2025
diff --git a/services/docreader/requirements.txt b/services/docreader/requirements.txt
@@ -23,7 +23,8 @@ textract
 antiword
 openai
 ollama
-pdfplumber
+pymupdf
+pymupdf4llm
 
 --extra-index-url https://www.paddlepaddle.org.cn/packages/stable/cpu/
 paddlepaddle>=3.0.0,<4.0.0

diff --git a/services/docreader/src/parser/pdf_parser.py b/services/docreader/src/parser/pdf_parser.py
@@ -1,113 +1,68 @@
 import logging
 import os
-import io
-from typing import Any, List, Iterator, Optional, Mapping, Tuple, Dict, Union
-
-import pdfplumber
+from typing import Any, Tuple, Dict, Union
+import re
+import pymupdf4llm
 import tempfile
 from .base_parser import BaseParser
-
+from PIL import Image
 logger = logging.getLogger(__name__)
 
 class PDFParser(BaseParser):
     """
     PDF Document Parser
-
-    This parser handles PDF documents by extracting text content.
-    It uses the pypdf library for simple text extraction.
+    This parser handles PDF documents using pymupdf4llm.
+    It converts PDF documents to Markdown, but it does not handle scanned (image-only) PDFs.
     """
-    def _convert_table_to_markdown(self, table_data: list) -> str:
-
-        if not table_data or not table_data[0]: return ""
-        def clean_cell(cell):
-            if cell is None: return ""
-            return str(cell).replace("\n", " <br> ")
-        try:
-            markdown = ""
-            header = [clean_cell(cell) for cell in table_data[0]]
-            markdown += "| " + " | ".join(header) + " |\n"
-            markdown += "| " + " | ".join(["---"] * len(header)) + " |\n"
-            for row in table_data[1:]:
-                if not row: continue
-                body_row = [clean_cell(cell) for cell in row]
-                if len(body_row) != len(header):
-                    logger.warning(f"Skipping malformed table row: {body_row}")
-                    continue
-                markdown += "| " + " | ".join(body_row) + " |\n"
-            return markdown
-        except Exception as e:
-            logger.error(f"Error converting table to markdown: {e}")
-            return ""
-    def parse_into_text(self, content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
-
-        logger.info(f"Parsing PDF with pdfplumber, content size: {len(content)} bytes")
-
-        all_page_content = []
-
-
+    def parse_into_text(self,content: bytes) -> Union[str, Tuple[str, Dict[str, Any]]]:
+
+        logger.info(f"Parsing PDF with pymupdf4llm, content size: {len(content)} bytes")
         temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
         temp_pdf_path = temp_pdf.name
-
+        ima_part = {}
+        def replace_img(match):
+            # Rewrite one Markdown image tag so it points at the uploaded copy of the image.
+            prefix, img_path, suffix = match.groups()
+            if img_path.startswith(('http://', 'https://')):
+                return match.group(0)  # already a remote URL: nothing to upload
+            if not os.path.exists(img_path):
+                logger.warning(f"警告：图片不存在，跳过: {img_path}")
+                return match.group(0)  # really skip; uploading/opening a missing file would abort the whole parse
+            image_url = self.upload_file(img_path)
+            with Image.open(img_path) as img:  # close the file handle; convert() returns an independent image
+                ima_part[image_url] = img.convert("RGBA")
+            return f"{prefix}{image_url}{suffix}"
         try:
             temp_pdf.write(content)
             temp_pdf.close()
             logger.info(f"PDF content written to temporary file: {temp_pdf_path}")
-
-            with pdfplumber.open(temp_pdf_path) as pdf:
-                logger.info(f"PDF has {len(pdf.pages)} pages")
-
-                for page_num, page in enumerate(pdf.pages):
-                    page_content_parts = []
-
-                    # Try-fallback strategy for table detection
-                    default_settings = { "vertical_strategy": "lines", "horizontal_strategy": "lines" }
-                    found_tables = page.find_tables(default_settings)
-                    if not found_tables:
-                        logger.info(f"Page {page_num+1}: Default strategy found no tables. Trying fallback strategy.")
-                        fallback_settings = { "vertical_strategy": "text", "horizontal_strategy": "lines" }
-                        found_tables = page.find_tables(fallback_settings)
-
-                    table_bboxes = [table.bbox for table in found_tables]
-                    # Define a filter function that keeps objects NOT inside any table bbox.
-                    def not_within_bboxes(obj):
-                        """Check if an object is outside all table bounding boxes."""
-                        for bbox in table_bboxes:
-                            # Check if the object's vertical center is within a bbox
-                            if bbox[1] <= (obj["top"] + obj["bottom"]) / 2 <= bbox[3]:
-                                return False # It's inside a table, so we DON'T keep it
-                        return True # It's outside all tables, so we DO keep it
-
-                    # that contains only the non-table text.
-                    non_table_page = page.filter(not_within_bboxes)
-
-                    # Now, extract text from this filtered page view.
-                    text = non_table_page.extract_text(x_tolerance=2)
-                    if text:
-                        page_content_parts.append(text)
-
-                    # Process and append the structured Markdown tables
-                    if found_tables:
-                        logger.info(f"Found {len(found_tables)} tables on page {page_num + 1}")
-                        for table in found_tables:
-                            markdown_table = self._convert_table_to_markdown(table.extract())
-                            page_content_parts.append(f"\n\n{markdown_table}\n\n")
-
-
-                    all_page_content.append("".join(page_content_parts))
+            with tempfile.TemporaryDirectory() as temp_dir:  # this directory (passed as image_path below) is deleted when the block exits
+                md_text = pymupdf4llm.to_markdown(
+                    doc=temp_pdf_path,
+                    write_images=True,
+                    table_strategy="lines_strict",
+                    ignore_code=False,
+                    image_path=temp_dir,
+                    show_progress= True
+                )
+                logger.info(
+                    f"Successfully extracted image for tempfile")
+                img_pattern = r'(!\[.*?\]\()([^)\s]+)(\))'  # groups: "![alt](", image path or URL, ")"
+                text = re.sub(img_pattern,replace_img,md_text)  # kept inside the with-block: replace_img reads the image files from disk
+            logger.info(f"PDF parsing complete.")
+            return text,ima_part  # (Markdown text, {uploaded image URL: RGBA PIL image})
 
-            final_text = "\n\n--- Page Break ---\n\n".join(all_page_content)
-            logger.info(f"PDF parsing complete. Extracted {len(final_text)} text chars.")
-
-            return final_text
-
         except Exception as e:
-            logger.error(f"Failed to parse PDF document: {str(e)}")
+            logger.error(f"Failed to parse PDF with pymupdf4llm: {e}")
             return ""
         finally:
             # This block is GUARANTEED to execute, preventing resource leaks.
             if os.path.exists(temp_pdf_path):
                 try:
                     os.remove(temp_pdf_path)
                     logger.info(f"Temporary file cleaned up: {temp_pdf_path}")
                 except OSError as e:
                     logger.error(f"Error removing temporary file {temp_pdf_path}: {e}")
+
+
+