Skip to content

Commit 982afbb

Browse files
authored
new version 2.5 (#539)
* Added a simple multilingual textual layer correctness classifier based on the percentage of letters in the text (`textual_layer_classifier=letter`). * Added a new parameter `textual_layer_classifier = [simple, ml (default), letter]`. * Removed the parameter `fast_textual_layer_detection`; its behaviour is now available as `textual_layer_classifier=simple`. * Fixed a bug with `table_type=wo_external_bounds` (fixed the `cv2.boundingRect` call). * Refactored `TableRecognition`. * Added the `table_type` parameter and `TableRecognition` information to the documentation.
1 parent be737a7 commit 982afbb

File tree

30 files changed

+288
-76
lines changed

30 files changed

+288
-76
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.4
1+
2.5

dedoc/api/api_args.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ class QueryParameters:
2626
# pdf handling
2727
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby", "bad_encoding"],
2828
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
29-
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
30-
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
29+
textual_layer_classifier: str = Form("ml", enum=["ml", "simple", "letter"], description="Type of classifier for PDF textual layer detection")
3130
each_page_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Detect textual layer on each page. Slower but more accurate.")
3231
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
3332
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')

dedoc/api/web/index.html

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
details > summary {font-style: italic; cursor: pointer; display: list-item;}
1414
.child.max {padding-left: 5px; flex: 1}
1515
.parent {display: flex}
16+
details { padding-left: 24px;}
1617
</style>
1718
</head>
1819

@@ -100,7 +101,7 @@ <h4>Attachments handling</h4>
100101

101102
<div class="parameters">
102103
<h4>PDF handling</h4>
103-
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
104+
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, textual_layer_classifier, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
104105
<br>
105106
<p>
106107
<label>
@@ -116,7 +117,13 @@ <h4>PDF handling</h4>
116117
</p>
117118

118119
<p>
119-
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
120+
<label>
121+
<select name="textual_layer_classifier">
122+
<option value="ml">ml</option>
123+
<option value="simple">simple</option>
124+
<option value="letter">letter</option>
125+
</select> textual_layer_classifier
126+
</label>
120127
</p>
121128

122129
<p>
@@ -136,15 +143,18 @@ <h4>PDF handling</h4>
136143
</label>
137144
</p>
138145

139-
<details><summary>need_pdf_table_analysis</summary>
146+
<details><summary>need_pdf_table_analysis, table_type</summary>
140147
<br>
141148
<p>
142149
<label>
143150
<input type="hidden" name="need_pdf_table_analysis" value="false">
144151
<input type="checkbox" name="need_pdf_table_analysis" value="true" checked> need_pdf_table_analysis</label>
145152
</p>
153+
<p>
154+
<label>table_type <input name="table_type" type="text" size="20" value=""></label>
155+
</p>
146156
</details>
147-
157+
<br>
148158
<p>
149159
<label>pages <input name="pages" type="text" size="8" value=":"></label>
150160
</p>

dedoc/readers/pdf_reader/data_classes/tables/scantable.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010

1111

1212
class ScanTable(Table):
13+
"""
14+
Utility class for storing recognized tables from document images. The class
15+
:class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
16+
"""
1317
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
1418

1519
super().__init__(cells, TableMetadata(page_id=page_number))

dedoc/readers/pdf_reader/data_classes/tables/table_tree.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from collections import namedtuple
22
from typing import List, Optional
33

4+
import numpy as np
45
from dedocutils.data_structures import BBox
56
from numpy import ndarray
67

@@ -79,7 +80,7 @@ def parse_contours_to_tree(contours: List, hierarchy: List, *, config: dict) ->
7980
if len(contours) == 0:
8081
return table_tree
8182

82-
bbox = [cv2.boundingRect(c) for c in contours[0]][0] # [x_begin, y_begin, width, height]
83+
bbox = cv2.boundingRect(contours[0].astype(np.int32)) # [x_begin, y_begin, width, height]
8384
table_tree.cell_box = BBox(x_top_left=bbox[0], y_top_left=bbox[1], width=bbox[2], height=bbox[3])
8485

8586
table_tree = table_tree.__build_childs(table_tree, hierarchy, contours)
@@ -101,7 +102,7 @@ def __build_childs(self, cur: "TableTree", hierarchy: List, contours: List) -> "
101102
list_childs = []
102103
for i, h in enumerate(hierarchy[0]):
103104
if h[3] == cur.id_contours:
104-
bbox = cv2.boundingRect(contours[i]) # [x_begin, y_begin, width, height]
105+
bbox = cv2.boundingRect(contours[i].astype(np.int32)) # [x_begin, y_begin, width, height]
105106
# Эвристика №1 на ячейку
106107
if bbox[2] < self.min_w_cell or bbox[3] < self.min_h_cell:
107108
if self.config.get("debug_mode", False):

dedoc/readers/pdf_reader/data_classes/tables/table_type.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,61 @@
11
class TableTypeAdditionalOptions:
2+
"""
3+
Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
4+
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.
5+
6+
* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;
7+
8+
Example of a table of type `wo_external_bounds`::
9+
10+
text | text | text
11+
--------+------+------
12+
text | text | text
13+
--------+------+------
14+
text | text | text
15+
--------+------+------
16+
text | text | text
17+
18+
19+
* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;
20+
21+
Example of a page with a table of type `one_cell_table`::
22+
23+
_________________________
24+
Header of document
25+
text text text +------+
26+
text | text | <--- it is a table
27+
+------+
28+
________________________
29+
30+
* Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table;
31+
32+
Example of a table of type `split_last_column`::
33+
34+
+--------+------+-------+
35+
| text | text | text1 |
36+
+--------+------+ |
37+
| text0 | text | text2 |
38+
| | -----| |
39+
| | text | text3 |
40+
+--------+------+ |
41+
| text | text | text4 |
42+
+--------+------+-------+
43+
|
44+
Recognition
45+
|
46+
V
47+
+--------+------+-------+
48+
| text | text | text1 |
49+
+--------+------+-------|
50+
| text0 | text | text2 |
51+
|--------+ -----+------ |
52+
| text0 | text | text3 |
53+
+--------+------+------ |
54+
| text | text | text4 |
55+
+--------+------+-------+
56+
57+
"""
58+
259
def __init__(self) -> None:
360
self.table_wo_external_bounds = "wo_external_bounds"
461
self.detect_one_cell_table = "one_cell_table"
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from typing import Dict
2+
3+
from .abstract_txtlayer_classifier import AbstractTxtlayerClassifier
4+
from .letter_txtlayer_classifier import LetterTxtlayerClassifier
5+
from .ml_txtlayer_classifier import MlTxtlayerClassifier
6+
from .simple_txtlayer_classifier import SimpleTxtlayerClassifier
7+
8+
9+
def get_classifiers(config: dict) -> Dict[str, AbstractTxtlayerClassifier]:
    """
    Build one instance of every available textual layer classifier.

    :param config: configuration dictionary passed unchanged to each classifier constructor
    :return: mapping from classifier name ("ml", "simple", "letter") to the classifier instance
    """
    # The names are the values accepted by the `textual_layer_classifier` parameter.
    classifier_types = (
        ("ml", MlTxtlayerClassifier),
        ("simple", SimpleTxtlayerClassifier),
        ("letter", LetterTxtlayerClassifier),
    )
    return {name: classifier_type(config=config) for name, classifier_type in classifier_types}
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from typing import List
2+
3+
import numpy as np
4+
5+
from dedoc.data_structures.line_with_meta import LineWithMeta
6+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier
7+
8+
9+
class LetterTxtlayerClassifier(AbstractTxtlayerClassifier):
    """
    Simple multilingual classifier of textual layer correctness.

    A textual layer is treated as correct when letters make up more than 50% of its text.
    """

    def __init__(self, *, config: dict) -> None:
        super().__init__(config=config)
        # Minimal share of alphabetic symbols (exclusive) for the text to be accepted.
        self.__symbol_threshold = 0.5

    def predict(self, lines: List[List[LineWithMeta]]) -> np.ndarray:
        """
        Classify each group of lines as having a correct or an incorrect textual layer.

        :param lines: list of line groups (presumably one group per page - verify against the caller); the text of every group is the concatenation of its lines
        :return: array with one value per group, True if the textual layer of the group is considered correct
        """
        page_texts = np.array(["".join(item.line for item in group) for group in lines])

        # Groups that contain only whitespace are rejected straight away.
        is_correct = np.array([bool(page_text.strip()) for page_text in page_texts])

        for group_id in np.flatnonzero(is_correct):
            # Dots and ellipsis characters are excluded from both the letter count and the total length.
            cleaned = page_texts[group_id].replace(".", "").replace("…", "")
            alpha_count = sum(symbol.isalpha() for symbol in cleaned)
            letter_share = alpha_count / max(len(cleaned), 1)
            is_correct[group_id] = letter_share > self.__symbol_threshold

        return is_correct

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@
77
import numpy as np
88

99
from dedoc.data_structures.unstructured_document import UnstructuredDocument
10+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import get_classifiers
1011
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier
11-
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.ml_txtlayer_classifier import MlTxtlayerClassifier
12-
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.simple_txtlayer_classifier import SimpleTxtlayerClassifier
1312
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_result import TxtLayerResult
1413
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
1514
from dedoc.utils.parameter_utils import get_bool_parameter, get_param_page_slice
@@ -22,8 +21,7 @@ def __init__(self, pdf_reader: PdfTabbyReader, *, config: dict) -> None:
2221
self.config = config
2322
self.logger = config.get("logger", logging.getLogger())
2423

25-
self.ml_txtlayer_classifier = MlTxtlayerClassifier(config=config)
26-
self.simple_txtlayer_classifier = SimpleTxtlayerClassifier(config=config)
24+
self.classifiers = get_classifiers(config=config)
2725
self.pdf_reader = pdf_reader
2826

2927
def detect_txtlayer(self, path: str, parameters: dict) -> List[TxtLayerResult]:
@@ -34,10 +32,11 @@ def detect_txtlayer(self, path: str, parameters: dict) -> List[TxtLayerResult]:
3432
:param parameters: parameters for the txtlayer classifier
3533
:return: information about a textual layer in the PDF document
3634
"""
37-
if get_bool_parameter(parameters, "fast_textual_layer_detection", False):
38-
txtlayer_classifier = self.simple_txtlayer_classifier
39-
else:
40-
txtlayer_classifier = self.ml_txtlayer_classifier
35+
classifier_name = str(parameters.get("textual_layer_classifier", "ml")).lower()
36+
txtlayer_classifier = self.classifiers.get(classifier_name)
37+
if txtlayer_classifier is None:
38+
raise ValueError(f"Unknown textual layer classifier `{classifier_name}`")
39+
4140
classify_each_page = get_bool_parameter(parameters, "each_page_textual_layer_detection", False)
4241
detect_function = self.__classify_each_page if classify_each_page else self.__classify_all_pages
4342
try:
@@ -110,7 +109,10 @@ def __classify_each_page(self, path: str, parameters: dict, txtlayer_classifier:
110109
prev_idx = 0
111110
for transition_idx in transitions:
112111
chunk_lines = list(chain.from_iterable(lines_for_predict[prev_idx:transition_idx]))
113-
chunk_document = UnstructuredDocument(lines=chunk_lines, tables=document.tables, attachments=document.attachments)
112+
if is_correct:
113+
chunk_document = UnstructuredDocument(lines=chunk_lines, tables=document.tables, attachments=document.attachments)
114+
else:
115+
chunk_document = None
114116
chunk_result = TxtLayerResult(start=prev_idx + fisrt_page_id + 1, end=transition_idx + fisrt_page_id, correct=is_correct, document=chunk_document)
115117
result.append(chunk_result)
116118
is_correct = not is_correct

dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
5353
config=self.config)
5454
self.binarizer = AdaptiveBinarizer()
5555
self.ocr = OCRLineExtractor(config=self.config)
56+
self.page_number = None
5657

5758
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
5859
return super().read(file_path, parameters)
@@ -68,6 +69,7 @@ def _process_one_page(self,
6869
from dedoc.utils.parameter_utils import get_path_param
6970

7071
# --- Step 1: correct orientation and detect column count ---
72+
self.page_number = page_number
7173
rotated_image, is_one_column_document, angle = self._detect_column_count_and_orientation(image, parameters)
7274
if self.config.get("debug_mode", False):
7375
self.logger.info(f"Angle page rotation = {angle}")
@@ -105,7 +107,6 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
105107
Return: rotated_image and indicator if the page is one-column
106108
"""
107109
import os
108-
from datetime import datetime
109110
import cv2
110111
from dedoc.utils.parameter_utils import get_path_param
111112

@@ -124,7 +125,7 @@ def _detect_column_count_and_orientation(self, image: ndarray, parameters: Param
124125

125126
if self.config.get("debug_mode", False):
126127
debug_dir = get_path_param(self.config, "path_debug")
127-
img_path = os.path.join(debug_dir, f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg")
128+
img_path = os.path.join(debug_dir, f"page-{self.page_number}_result_orientation.jpg")
128129
self.logger.info(f"Save image to {img_path}")
129130
cv2.imwrite(img_path, rotated_image)
130131

0 commit comments

Comments
 (0)