Skip to content

Commit 5750d57

Browse files
NastyBoget, alexander1999-hub, Alexander Golodkov, dronperminov, oksidgy
authored
update master (#465)
Co-authored-by: Alexander Golodkov <[email protected]> Co-authored-by: Alexander Golodkov <[email protected]> Co-authored-by: Andrew Perminov <[email protected]> Co-authored-by: Oksana Belyaeva <[email protected]> Co-authored-by: RichardScottOZ <[email protected]> Co-authored-by: Andrey Mikhailov <[email protected]>
1 parent 370f6ef commit 5750d57

File tree

175 files changed

+2391
-1800
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

175 files changed

+2391
-1800
lines changed

.flake8

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,19 @@ inline-quotes = "
77
application-import-names = dedoc, tests, scripts, train_dataset
88
import-order-style = pycharm
99
10+
extend-immutable-calls = File, Depends
11+
12+
banned-modules =
13+
dedoc = Use full path
14+
dedoc.data_structures = Use full path
15+
dedoc.attachments_extractors = Use full path
16+
dedoc.attachments_handler = Use full path
17+
dedoc.converters = Use full path
18+
dedoc.metadata_extractors = Use full path
19+
dedoc.readers = Use full path
20+
dedoc.structure_constructors = Use full path
21+
dedoc.structure_extractors = Use full path
22+
1023
exclude =
1124
.git,
1225
__pycache__,
@@ -28,9 +41,11 @@ exclude =
2841
# ANN202 - Missing return type annotation for protected function
2942
# ANN204 - Missing return type annotation for special method
3043
# N802 - function name should be lowercase
44+
# I251 - Banned import (Use full path)
3145
ignore =
3246
ANN101
3347
per-file-ignores =
3448
scripts/*:T201
3549
scripts/benchmark_pdf_performance*:JS101
3650
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
51+
docs/source/_static/code_examples/*:I251

.github/check_version.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
2323
args = parser.parse_args()
2424

2525
print(f"Old version: {args.old_version}, new version: {args.new_version}, "
26-
f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}") # noqa
26+
f"branch: {args.branch}, tag: {args.tag}, pre_release: {args.pre_release}")
2727

2828
master_version_pattern = re.compile(r"^\d+\.\d+(\.\d+)?$")
2929
develop_version_pattern = re.compile(r"^\d+\.\d+\.\d+rc\d+$")
@@ -43,4 +43,4 @@ def is_correct_version(version: str, tag: str, old_version: str, regexp: Pattern
4343
is_correct_version(args.new_version, args.tag, args.old_version, master_version_pattern)
4444
assert args.pre_release != "true", "Pre-releases are not allowed on master"
4545

46-
print("Version is correct") # noqa
46+
print("Version is correct")

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ repos:
1515
flake8-import-order==0.18.2,
1616
flake8-multiline-containers==0.0.19,
1717
flake8-print==5.0.0,
18+
flake8-tidy-imports==4.10.0,
1819
flake8-quotes==3.3.2,
1920
flake8-use-fstring==1.4,
2021
pycodestyle==2.9.0,

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
ARG REPOSITORY="docker.io"
22
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
3+
ARG LANGUAGES=""
4+
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
35

46
ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
57
ENV RESOURCES_PATH "/dedoc_root/resources"

README.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
# Dedoc
22

3+
[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
34
[![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc)
5+
[![PyPI downloads](https://pepy.tech/badge/dedoc)](https://pepy.tech/project/dedoc)
6+
[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
47
[![License](http://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html)
5-
[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
6-
[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
78
[![Demo dedoc-readme.hf.space](https://img.shields.io/website-up-down-green-red/https/huggingface.co/spaces/dedoc/README.svg)](https://dedoc-readme.hf.space)
8-
[![Docker Hub](https://img.shields.io/docker/pulls/dedocproject/dedoc.svg)](https://hub.docker.com/r/dedocproject/dedoc/ "Docker Pulls")
9+
[![Documentation Status](https://readthedocs.org/projects/dedoc/badge/?version=latest)](https://dedoc.readthedocs.io/en/latest/?badge=latest)
910
[![CI tests](https://github.com/ispras/dedoc/workflows/CI/badge.svg)](https://github.com/ispras/dedoc/actions)
1011

1112
![Dedoc](https://github.com/ispras/dedoc/raw/master/dedoc_logo.png)
1213

1314
Dedoc is an open universal system for converting documents to a unified output format.
14-
It extracts a document’s logical structure and content, its tables, text formatting and metadata.
15+
It extracts a document’s logical structure and content: tables, text formatting and metadata.
1516
The document’s content is represented as a tree storing headings and lists of any level.
1617
Dedoc can be integrated in a document contents and structure analysis system as a separate module.
1718

@@ -22,14 +23,14 @@ Dedoc can be integrated in a document contents and structure analysis system as
2223
Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)
2324

2425
## Features and advantages
25-
Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and none-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats.
26+
Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and unstructured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats.
2627
Document structure extraction is fully automatic regardless of input data type.
2728
Metadata and text formatting are also extracted automatically.
2829

2930
In 2022, the system won a grant to support the development of promising AI projects from the [Innovation Assistance Foundation (Фонд содействия инновациям)](https://fasie.ru/).
3031

3132
## Dedoc provides:
32-
* Extensibility due to a flexible addition of new document formats and to an easy change of an output data format.
33+
* Extensibility due to flexible addition of new document formats and easy change of an output data format.
3334
* Support for extracting document structure out of nested documents having different formats.
3435
* Extracting various text formatting features (indentation, font type, size, style etc.).
3536
* Working with documents of various origin (statements of work, legal documents, technical reports, scientific papers) allowing flexible tuning for new domains.
@@ -68,7 +69,7 @@ The system processes different document formats. The main formats are listed bel
6869

6970

7071
## Impact
71-
This project may be useful as a first step of automatic document analysis pipeline (e.g. before the NLP part).
72+
This project may be useful as a first step of an automatic document analysis pipeline (e.g. before the NLP part).
7273
Dedoc is in demand for information analytic systems, information leak monitoring systems, as well as for natural language processing systems.
7374
The library is intended for application use by developers of systems for automatic analysis and structuring of electronic documents, including for further search in electronic documents.
7475

@@ -92,7 +93,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
9293

9394
# Installation instructions
9495

95-
This project has REST Api and you can run it in Docker container.
96+
This project has a REST API and you can run it in a Docker container.
9697
Also, dedoc can be installed as a library via `pip`.
9798
There are two ways to install and run dedoc, as a web application or as a library; both are described below.
9899

@@ -149,7 +150,7 @@ If you need to change some application settings, you may update `config.py` acco
149150

150151
If you don't want to use docker for running the application, it's possible to run dedoc locally.
151152
However, it isn't suitable for every operating system (`Ubuntu 20+` is recommended) and
152-
there may be not enough machine's resources for its work.
153+
there may not be enough machine resources for its work.
153154
You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.
154155
Installation instructions via pip are available [here](https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-dedoc-using-pip).
155156

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.3
1+
2.2.4

dedoc/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from .dedoc_manager import DedocManager # noqa
2-
from .version import __version__ # noqa
1+
from .dedoc_manager import DedocManager
2+
from .version import __version__

dedoc/api/api_args.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ class QueryParameters:
2828
# pdf handling
2929
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
3030
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
31-
language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language")
31+
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
3232
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
3333
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
3434
description='One or multiple column document, "auto" - predict number of page columns automatically')

dedoc/api/api_utils.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,20 @@ def json2html(text: str,
120120
attachments: Optional[List[ParsedDocument]],
121121
tabs: int = 0,
122122
table2id: Dict[str, int] = None,
123-
attach2id: Dict[str, int] = None) -> str:
123+
attach2id: Dict[str, int] = None,
124+
prev_page_id: Optional[List[int]] = None) -> str:
125+
if prev_page_id is None:
126+
prev_page_id = [0]
127+
124128
tables = [] if tables is None else tables
125129
attachments = [] if attachments is None else attachments
126130
table2id = {table.metadata.uid: table_id for table_id, table in enumerate(tables)} if table2id is None else table2id
127131
attach2id = {attachment.metadata.uid: attachment_id for attachment_id, attachment in enumerate(attachments)} if attach2id is None else attach2id
128132

133+
if paragraph.metadata.page_id != prev_page_id[0]:
134+
text += f"<center><small><b>Page {prev_page_id[0] + 1}</b></small></center><hr>"
135+
prev_page_id[0] = paragraph.metadata.page_id
136+
129137
ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs)
130138

131139
if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
@@ -141,7 +149,8 @@ def json2html(text: str,
141149
text += ptext
142150

143151
for subparagraph in paragraph.subparagraphs:
144-
text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id)
152+
text = json2html(text=text, paragraph=subparagraph, tables=None, attachments=None, tabs=tabs + 4, table2id=table2id, attach2id=attach2id,
153+
prev_page_id=prev_page_id)
145154

146155
if tables is not None and len(tables) > 0:
147156
text += "<h3> Tables: </h3>"

dedoc/api/dedoc_api.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,12 @@
77
import traceback
88
from typing import Optional
99

10-
import uvicorn
1110
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
1211
from fastapi.responses import ORJSONResponse, UJSONResponse
1312
from fastapi.staticfiles import StaticFiles
1413
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse
1514

16-
import dedoc
15+
import dedoc.version
1716
from dedoc.api.api_args import QueryParameters
1817
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
1918
from dedoc.api.schema.parsed_document import ParsedDocument
@@ -53,7 +52,7 @@ def get_static_file(request: Request) -> Response:
5352

5453
@app.get("/version")
5554
def get_version() -> Response:
56-
return PlainTextResponse(dedoc.__version__)
55+
return PlainTextResponse(dedoc.version.__version__)
5756

5857

5958
def _get_static_file_path(request: Request) -> str:
@@ -70,10 +69,10 @@ def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_
7069

7170

7271
@app.post("/upload", response_model=ParsedDocument)
73-
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
72+
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
7473
parameters = dataclasses.asdict(query_params)
7574
if not file or file.filename == "":
76-
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
75+
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)
7776

7877
return_format = str(parameters.get("return_format", "json")).lower()
7978

@@ -152,4 +151,5 @@ def get_api() -> FastAPI:
152151

153152

154153
def run_api(app: FastAPI) -> None:
154+
import uvicorn
155155
uvicorn.run(app=app, host="0.0.0.0", port=int(PORT))

0 commit comments

Comments
 (0)