1212from dedoc .readers .pdf_reader .data_classes .line_with_location import LineWithLocation
1313from dedoc .readers .pdf_reader .data_classes .pdf_image_attachment import PdfImageAttachment
1414from dedoc .readers .pdf_reader .data_classes .tables .scantable import ScanTable
15-
15+ from dedoc . readers . pdf_reader . utils . header_footers_analysis import HeaderFooterDetector
1616
1717ParametersForParseDoc = namedtuple ("ParametersForParseDoc" , [
1818 "is_one_column_document" ,
@@ -54,6 +54,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
5454 self .linker = LineObjectLinker (config = self .config )
5555 self .paragraph_extractor = ScanParagraphClassifierExtractor (config = self .config )
5656 self .gost_frame_recognizer = GOSTFrameRecognizer (config = self .config )
57+ self .header_footer_detector = HeaderFooterDetector ()
5758
5859 def read (self , file_path : str , parameters : Optional [dict ] = None ) -> UnstructuredDocument :
5960 """
@@ -94,12 +95,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
9495 result = UnstructuredDocument (lines = lines , tables = scan_tables , attachments = attachments , warnings = warnings , metadata = metadata )
9596 return self ._postprocess (result )
9697
97- def _parse_document (self , path : str , parameters : ParametersForParseDoc ) -> (
98- Tuple ) [List [LineWithMeta ], List [ScanTable ], List [PdfImageAttachment ], List [str ], Optional [dict ]]:
98+ def _parse_document (self , path : str , parameters : ParametersForParseDoc ) \
99+ -> Tuple [List [LineWithMeta ], List [ScanTable ], List [PdfImageAttachment ], List [str ], Optional [dict ]]:
99100 import math
100101 from joblib import Parallel , delayed
101102 from dedoc .data_structures .hierarchy_level import HierarchyLevel
102- from dedoc .readers .pdf_reader .utils .header_footers_analysis import footer_header_analysis
103103 from dedoc .utils .pdf_utils import get_pdf_page_count
104104 from dedoc .readers .pdf_reader .pdf_image_reader .pdf_image_reader import PdfImageReader
105105 from dedoc .readers .pdf_reader .pdf_txtlayer_reader .pdf_txtlayer_reader import PdfTxtlayerReader
@@ -131,12 +131,15 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
131131 all_lines , unref_tables , attachments , page_angles = [], [], [], []
132132 else :
133133 all_lines , unref_tables , attachments , page_angles = map (list , map (flatten , zip (* result )))
134+
134135 if parameters .need_header_footers_analysis :
135136 lines = [lines for lines , _ , _ , _ in result ]
136- lines , headers , footers = footer_header_analysis (lines )
137+ lines , headers , footers = self . header_footer_detector . detect (lines )
137138 all_lines = list (flatten (lines ))
139+
138140 if parameters .need_gost_frame_analysis and isinstance (self , PdfImageReader ):
139- self ._shift_all_contents (lines = all_lines , unref_tables = unref_tables , attachments = attachments , gost_analyzed_images = gost_analyzed_images )
141+ self ._shift_all_contents (lines = all_lines , onepage_tables = unref_tables , attachments = attachments , gost_analyzed_images = gost_analyzed_images )
142+
140143 mp_tables = self .table_recognizer .convert_to_multipages_tables (unref_tables , lines_with_meta = all_lines )
141144 all_lines_with_links = self .linker .link_objects (lines = all_lines , tables = mp_tables , images = attachments )
142145
@@ -156,27 +159,35 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
156159 gost_analyzed_images = Parallel (n_jobs = self .config ["n_jobs" ])(delayed (self .gost_frame_recognizer .rec_and_clean_frame )(image ) for image in images )
157160 page_range = range (first_page , first_page + len (gost_analyzed_images ))
158161 gost_analyzed_images = dict (zip (page_range , gost_analyzed_images ))
162+
159163 if isinstance (self , PdfTxtlayerReader ):
160164 self .gost_frame_boxes = dict (zip (page_range , [(item [1 ], item [2 ]) for item in gost_analyzed_images .values ()]))
165+
161166 result = Parallel (n_jobs = self .config ["n_jobs" ])(
162167 delayed (self ._process_one_page )(image , parameters , page_number , path ) for page_number , (image , box , original_image_shape ) in
163168 gost_analyzed_images .items ()
164169 )
165170 return result , gost_analyzed_images
166171
167- def _shift_all_contents (self , lines : List [LineWithMeta ], unref_tables : List [ScanTable ], attachments : List [PdfImageAttachment ],
172+ def _shift_all_contents (self , lines : List [LineWithMeta ], onepage_tables : List [ScanTable ], attachments : List [PdfImageAttachment ],
168173 gost_analyzed_images : Dict [int , Tuple [ndarray , BBox , Tuple [int , ...]]]) -> None :
174+ """
175+ Shift all recognized content relative to the original source image
176+ """
169177 # shift unref_tables
170- for scan_table in unref_tables :
178+ for scan_table in onepage_tables :
171179 for location in scan_table .locations :
172- table_page_number = location .page_number
173- location .shift (shift_x = gost_analyzed_images [table_page_number ][1 ].x_top_left , shift_y = gost_analyzed_images [table_page_number ][1 ].y_top_left )
180+ page_number = location .page_number
181+ location .shift (shift_x = gost_analyzed_images [page_number ][1 ].x_top_left , shift_y = gost_analyzed_images [page_number ][1 ].y_top_left )
182+ location .page_width , location .page_height = gost_analyzed_images [page_number ][2 ][1 ], gost_analyzed_images [page_number ][2 ][0 ]
183+
174184 page_number = scan_table .locations [0 ].page_number
175185 for row in scan_table .cells :
176186 for cell in row :
177- image_width , image_height = gost_analyzed_images [page_number ][2 ][1 ], gost_analyzed_images [page_number ][2 ][0 ]
178- shift_x , shift_y = (gost_analyzed_images [page_number ][1 ].x_top_left , gost_analyzed_images [page_number ][1 ].y_top_left )
179- cell .shift (shift_x = shift_x , shift_y = shift_y , image_width = image_width , image_height = image_height )
187+ orig_image_width , orig_image_height = gost_analyzed_images [page_number ][2 ][1 ], gost_analyzed_images [page_number ][2 ][0 ]
188+ gost_frame_bbox = gost_analyzed_images [page_number ][1 ]
189+ shift_x , shift_y = gost_frame_bbox .x_top_left , gost_frame_bbox .y_top_left
190+ cell .shift (shift_x = shift_x , shift_y = shift_y , image_width = orig_image_width , image_height = orig_image_height )
180191
181192 # shift attachments
182193 for attachment in attachments :
0 commit comments