Skip to content

Commit 67b0d42

Browse files
authored
new version 2.6 (#545)
1 parent 982afbb commit 67b0d42

File tree

30 files changed

+485
-225
lines changed

30 files changed

+485
-225
lines changed

.github/workflows/docs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
- name: Set up Python ${{ matrix.python-version }}
1616
uses: actions/setup-python@v2
1717
with:
18-
python-version: '3.9'
18+
python-version: '3.10'
1919

2020
- name: Install dependencies
2121
run: |

.github/workflows/release.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
- name: Set up Python ${{ matrix.python-version }}
1616
uses: actions/setup-python@v2
1717
with:
18-
python-version: '3.9'
18+
python-version: '3.10'
1919

2020
- name: Check version correctness
2121
run: |

.github/workflows/test_labeling.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
- name: Set up Python ${{ matrix.python-version }}
2929
uses: actions/setup-python@v2
3030
with:
31-
python-version: '3.9'
31+
python-version: '3.10'
3232
- name: Run tests for labeling
3333
run: |
3434
test="true" docker compose -f labeling/docker-compose.yml up --build --exit-code-from test

.github/workflows/test_on_push.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
- name: Set up Python ${{ matrix.python-version }}
2929
uses: actions/setup-python@v2
3030
with:
31-
python-version: '3.9'
31+
python-version: '3.10'
3232
- name: Run lint
3333
run: |
3434
python3 -m pip install --upgrade pip

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ARG REPOSITORY="docker.io"
2-
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
2+
FROM dedocproject/dedoc_jammy_p3.10_base:version_2025_09_11
33
ARG LANGUAGES=""
44
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done
55

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ If you need to change some application settings, you may update `config.py` acco
162162
If you don't want to use docker for running the application, it's possible to run dedoc locally.
163163
However, it isn't suitable for every operating system (`Ubuntu 20+` is recommended), and
164164
there may not be enough machine resources for it to run.
165-
You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.
165+
You should have `python` (`python3.9`, `python3.10` are recommended) and `pip` installed.
166166
Installation instructions via pip are available [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-dedoc-using-pip).
167167

168168
## Install and run dedoc from sources

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.5
1+
2.6

dedoc/readers/pdf_reader/data_classes/tables/location.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
from functools import total_ordering
2-
from typing import Any, Dict
2+
from typing import Any, Dict, Optional
33

44
from dedocutils.data_structures import BBox
55

66

77
@total_ordering
88
class Location:
9-
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
9+
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0, page_width: int = None, page_height: int = None) -> None:
1010
self.page_number = page_number
11+
self.page_width = page_width
12+
self.page_height = page_height
1113
self.bbox = bbox
1214
self.name = name
1315
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
@@ -16,6 +18,11 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
1618
def shift(self, shift_x: int, shift_y: int) -> None:
1719
self.bbox.shift(shift_x, shift_y)
1820

21+
def to_relative_bbox_dict(self) -> Optional[Dict]:
22+
if not self.page_height or not self.page_width:
23+
return None
24+
return self.bbox.to_relative_dict(self.page_width, self.page_height)
25+
1926
def to_dict(self) -> Dict[str, Any]:
2027
from collections import OrderedDict
2128

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@ class ScanTable(Table):
1414
Utility class for storing recognized tables from document images. The class
1515
:class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
1616
"""
17-
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
17+
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1, page_width: int = None, page_height: int = None) -> None:
1818

1919
super().__init__(cells, TableMetadata(page_id=page_number))
2020
self.order = order
21-
self.locations = [Location(page_number, bbox)]
21+
self.locations = [Location(page_number, bbox, page_width=page_width, page_height=page_height)]
2222

2323
def extended(self, table: "ScanTable") -> None:
2424
# extend locations

dedoc/readers/pdf_reader/pdf_base_reader.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
1313
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
1414
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
15-
15+
from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
1616

1717
ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
1818
"is_one_column_document",
@@ -54,6 +54,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
5454
self.linker = LineObjectLinker(config=self.config)
5555
self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
5656
self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
57+
self.header_footer_detector = HeaderFooterDetector()
5758

5859
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
5960
"""
@@ -94,12 +95,11 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
9495
result = UnstructuredDocument(lines=lines, tables=scan_tables, attachments=attachments, warnings=warnings, metadata=metadata)
9596
return self._postprocess(result)
9697

97-
def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
98-
Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
98+
def _parse_document(self, path: str, parameters: ParametersForParseDoc) \
99+
-> Tuple[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
99100
import math
100101
from joblib import Parallel, delayed
101102
from dedoc.data_structures.hierarchy_level import HierarchyLevel
102-
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
103103
from dedoc.utils.pdf_utils import get_pdf_page_count
104104
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
105105
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
@@ -131,12 +131,15 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
131131
all_lines, unref_tables, attachments, page_angles = [], [], [], []
132132
else:
133133
all_lines, unref_tables, attachments, page_angles = map(list, map(flatten, zip(*result)))
134+
134135
if parameters.need_header_footers_analysis:
135136
lines = [lines for lines, _, _, _ in result]
136-
lines, headers, footers = footer_header_analysis(lines)
137+
lines, headers, footers = self.header_footer_detector.detect(lines)
137138
all_lines = list(flatten(lines))
139+
138140
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
139-
self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
141+
self._shift_all_contents(lines=all_lines, onepage_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
142+
140143
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
141144
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)
142145

@@ -156,27 +159,35 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
156159
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
157160
page_range = range(first_page, first_page + len(gost_analyzed_images))
158161
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
162+
159163
if isinstance(self, PdfTxtlayerReader):
160164
self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
165+
161166
result = Parallel(n_jobs=self.config["n_jobs"])(
162167
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
163168
gost_analyzed_images.items()
164169
)
165170
return result, gost_analyzed_images
166171

167-
def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
172+
def _shift_all_contents(self, lines: List[LineWithMeta], onepage_tables: List[ScanTable], attachments: List[PdfImageAttachment],
168173
gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
174+
"""
175+
Shift all recognized content relative to the original source image
176+
"""
169177
# shift one-page tables (onepage_tables)
170-
for scan_table in unref_tables:
178+
for scan_table in onepage_tables:
171179
for location in scan_table.locations:
172-
table_page_number = location.page_number
173-
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
180+
page_number = location.page_number
181+
location.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, shift_y=gost_analyzed_images[page_number][1].y_top_left)
182+
location.page_width, location.page_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
183+
174184
page_number = scan_table.locations[0].page_number
175185
for row in scan_table.cells:
176186
for cell in row:
177-
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
178-
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
179-
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
187+
orig_image_width, orig_image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
188+
gost_frame_bbox = gost_analyzed_images[page_number][1]
189+
shift_x, shift_y = gost_frame_bbox.x_top_left, gost_frame_bbox.y_top_left
190+
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=orig_image_width, image_height=orig_image_height)
180191

181192
# shift attachments
182193
for attachment in attachments:

0 commit comments

Comments
 (0)