ispras
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/test_labeling.yaml b/.github/workflows/test_labeling.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/Dockerfile b/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/VERSION b/VERSION
Lines changed: 1 addition & 1 deletion
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
Lines changed: 9 additions & 2 deletions
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Lines changed: 2 additions & 2 deletions
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
Lines changed: 24 additions & 13 deletions
@@ -15,7 +15,7 @@ jobs:
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
-        python-version: '3.9'
+        python-version: '3.10'
 
     - name: Install dependencies
       run: |
 
@@ -15,7 +15,7 @@ jobs:
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
         with:
-          python-version: '3.9'
+          python-version: '3.10'
 
       - name: Check version correctness
         run: |
 
@@ -28,7 +28,7 @@ jobs:
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
-        python-version: '3.9'
+        python-version: '3.10'
     - name: Run tests for labeling
       run: |
         test="true" docker compose -f labeling/docker-compose.yml up --build --exit-code-from test
@@ -28,7 +28,7 @@ jobs:
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
-        python-version: '3.9'
+        python-version: '3.10'
     - name: Run lint
       run: |
         python3 -m pip install --upgrade pip
 
@@ -1,5 +1,5 @@
 ARG REPOSITORY="docker.io"
-FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
+FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11
 ARG LANGUAGES=""
 RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done
 
 
@@ -162,7 +162,7 @@ If you need to change some application settings, you may update `config.py` acco
 If you don't want to use docker for running the application, it's possible to run dedoc locally.
 However, it isn't suitable for any operating system (`Ubuntu 20+` is recommended) and
 there may be not enough machine resources for its work.
-You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.
+You should have `python` (`python3.9`, `python3.10` are recommended) and `pip` installed.
 Installation instructions via pip are available [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-dedoc-using-pip).
 
 ## Install and run dedoc from sources
 
@@ -1 +1 @@
-2.5
+2.6
@@ -1,13 +1,15 @@
 from functools import total_ordering
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 from dedocutils.data_structures import BBox
 
 
 @total_ordering
 class Location:
-    def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
+    def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0, page_width: int = None, page_height: int = None) -> None:
         self.page_number = page_number
+        self.page_width = page_width
+        self.page_height = page_height
         self.bbox = bbox
         self.name = name
         # TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
@@ -16,6 +18,11 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
     def shift(self, shift_x: int, shift_y: int) -> None:
         self.bbox.shift(shift_x, shift_y)
 
+    def to_relative_bbox_dict(self) -> Optional[Dict]:
+        if not self.page_height or not self.page_width:
+            return None
+        return self.bbox.to_relative_dict(self.page_width, self.page_height)
+
     def to_dict(self) -> Dict[str, Any]:
         from collections import OrderedDict
 
 
@@ -14,11 +14,11 @@ class ScanTable(Table):
     Utility class for storing recognized tables from document images. The class
     :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
     """
-    def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
+    def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1, page_width: int = None, page_height: int = None) -> None:
 
         super().__init__(cells, TableMetadata(page_id=page_number))
         self.order = order
-        self.locations = [Location(page_number, bbox)]
+        self.locations = [Location(page_number, bbox, page_width=page_width, page_height=page_height)]
 
     def extended(self, table: "ScanTable") -> None:
         # extend locations
 
@@ -12,7 +12,7 @@
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
-
+from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
 
 ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
     "is_one_column_document",
@@ -54,6 +54,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
         self.linker = LineObjectLinker(config=self.config)
         self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
         self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
+        self.header_footer_detector = HeaderFooterDetector()
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -94,12 +95,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
         result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata)
         return self._postprocess(result)
 
-    def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
-            Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
+    def _parse_document(self, path: str, parameters: ParametersForParseDoc) \
+            -> Tuple[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
         import math
         from joblib import Parallel, delayed
         from dedoc.data_structures.hierarchy_level import HierarchyLevel
-        from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
         from dedoc.utils.pdf_utils import get_pdf_page_count
         from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
         from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
@@ -131,12 +131,15 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
             all_lines, unref_tables, attachments, page_angles = [], [], [], []
         else:
             all_lines, unref_tables, attachments, page_angles = map(list, map(flatten, zip(*result)))
+
         if parameters.need_header_footers_analysis:
             lines = [lines for lines, _, _, _ in result]
-            lines, headers, footers = footer_header_analysis(lines)
+            lines, headers, footers = self.header_footer_detector.detect(lines)
             all_lines = list(flatten(lines))
+
         if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
-            self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
+            self._shift_all_contents(lines=all_lines, onepage_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
+
         mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
         all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)
 
@@ -156,27 +159,35 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
         gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
         page_range = range(first_page, first_page + len(gost_analyzed_images))
         gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
+
         if isinstance(self, PdfTxtlayerReader):
             self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
+
         result = Parallel(n_jobs=self.config["n_jobs"])(
             delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
             gost_analyzed_images.items()
         )
         return result, gost_analyzed_images
 
-    def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
+    def _shift_all_contents(self, lines: List[LineWithMeta], onepage_tables: List[ScanTable], attachments: List[PdfImageAttachment],
                             gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
+        """
+            Shift all recognized content relative to the original source image
+        """
         # shift unref_tables
-        for scan_table in unref_tables:
+        for scan_table in onepage_tables:
             for location in scan_table.locations:
-                table_page_number = location.page_number
-                location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
+                page_number = location.page_number
+                location.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, shift_y=gost_analyzed_images[page_number][1].y_top_left)
+                location.page_width, location.page_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
+
             page_number = scan_table.locations[0].page_number
             for row in scan_table.cells:
                 for cell in row:
-                    image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
-                    shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
-                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
+                    orig_image_width, orig_image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
+                    gost_frame_bbox = gost_analyzed_images[page_number][1]
+                    shift_x, shift_y = gost_frame_bbox.x_top_left, gost_frame_bbox.y_top_left
+                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=orig_image_width, image_height=orig_image_height)
 
         # shift attachments
         for attachment in attachments: