ispras
diff --git a/‎.flake8‎
Lines changed: 0 additions & 1 deletion b/‎.flake8‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/docs.yaml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/docs.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.github/workflows/test_on_push.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/test_on_push.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 5 additions & 2 deletions b/‎Dockerfile‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 22 additions & 22 deletions b/‎README.md‎
Lines changed: 22 additions & 22 deletions
diff --git a/‎VERSION‎
Lines changed: 1 addition & 1 deletion b/‎VERSION‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎dedoc/api/api_args.py‎
Lines changed: 54 additions & 103 deletions b/‎dedoc/api/api_args.py‎
Lines changed: 54 additions & 103 deletions
@@ -16,7 +16,6 @@ exclude =
     resources,
     dedoc/scripts,
     examples,
-    docs,
     venv,
     build,
     dedoc.egg-info
 
@@ -19,7 +19,7 @@ jobs:
 
     - name: Install dependencies
       run: |
-        sudo apt-get install -y libreoffice
+        sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
         python -m pip install --upgrade --no-cache-dir pip setuptools
         python -m pip install --exists-action=w --no-cache-dir -r requirements.txt
         python -m pip install --upgrade --upgrade-strategy eager --no-cache-dir .[torch,docs]
@@ -30,3 +30,4 @@ jobs:
         python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build
         cd docs/source/_static/code_examples
         python dedoc_usage_tutorial.py
+        python dedoc_add_new_doc_type_tutorial.py
@@ -28,7 +28,7 @@ jobs:
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v2
       with:
-        python-version: '3.8'
+        python-version: '3.9'
     - name: Run lint
       run: |
         python3 -m pip install --upgrade pip
 
@@ -11,7 +11,9 @@ repos:
             flake8-annotations==2.9.1,
             flake8-bugbear==23.3.12,
             flake8-builtins==2.1.0,
+            flake8-fill-one-line>=0.4.0,
             flake8-import-order==0.18.2,
+            flake8-multiline-containers==0.0.19,
             flake8-print==5.0.0,
             flake8-quotes==3.3.2,
             flake8-use-fstring==1.4,
 
@@ -8,11 +8,14 @@ ADD requirements.txt .
 RUN pip3 install --no-cache-dir -r requirements.txt
 
 RUN mkdir /dedoc_root
+RUN mkdir /dedoc_root/dedoc
+ADD dedoc/config.py /dedoc_root/dedoc/config.py
+ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py
+RUN python3 /dedoc_root/dedoc/download_models.py
+
 ADD dedoc /dedoc_root/dedoc
 ADD VERSION /dedoc_root
-
 RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py
-RUN python3 /dedoc_root/dedoc/download_models.py
 
 ADD tests /dedoc_root/tests
 ADD resources /dedoc_root/resources
 
@@ -47,19 +47,19 @@ There are two ways to install and run dedoc as a web application or a library th
 
 ## Install and run dedoc using docker 
 
-You should have [`git`] (https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method.
+You should have [`git`](https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method.
 This method is more flexible because it doesn't depend on the operating system and other user's limitations,
 still, the docker application should be installed and configured properly.
 
 If you don't need to change the application configuration, you may use the built docker image as well.
 
 ### 1. Pull the image
-```bash
+```shell
 docker pull dedocproject/dedoc
 ```
 
 ### 2. Run the container
-```bash
+```shell
 docker run -p 1231:1231 --rm dedocproject/dedoc python3 /dedoc_root/dedoc/main.py
 ```
 
@@ -69,22 +69,22 @@ If you need to change some application settings, you may update `config.py` acco
 You can build and run image:
 
 ### 1. Clone the repository
-```bash
+```shell
 git clone https://github.com/ispras/dedoc
 ```
 
 ### 2. Go to the `dedoc` directory
-```bash
+```shell
 cd dedoc
 ```
 
 ### 3. Build the image and run the application
-```bash
+```shell
 docker-compose up --build
 ```
 
 ### 4. Run container with tests
-```bash
+```shell
 test="true" docker-compose up --build
 ```
 
@@ -99,7 +99,7 @@ there may be not enough machine's resources for its work.
 You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.
 
 ### 1. Install necessary packages:
-```bash
+```shell
 sudo apt-get install -y libreoffice djvulibre-bin unzip unrar
 ```
 
@@ -112,14 +112,14 @@ You can try any tutorial for this purpose or look [`here`](https://github.com/is
 to get the example of Tesseract installing for dedoc container or use next commands for building Tesseract OCR 5 from sources:
 
 #### 2.1. Install compilers and libraries required by the Tesseract OCR:
-```bash
+```shell
 sudo apt-get update
 sudo apt-get install -y automake binutils-dev build-essential ca-certificates clang g++ g++-multilib gcc-multilib libcairo2 libffi-dev \
 libgdk-pixbuf2.0-0 libglib2.0-dev libjpeg-dev libleptonica-dev libpango-1.0-0 libpango1.0-dev libpangocairo-1.0-0 libpng-dev libsm6 \
 libtesseract-dev libtool libxext6 make pkg-config poppler-utils pstotext shared-mime-info software-properties-common swig zlib1g-dev
 ```
 #### 2.2. Build Tesseract from sources:
-```bash
+```shell
 sudo add-apt-repository -y ppa:alex-p/tesseract-ocr-devel
 sudo apt-get update --allow-releaseinfo-change
 sudo apt-get install -y tesseract-ocr tesseract-ocr-rus
@@ -130,24 +130,24 @@ export TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
 
 ## Install the dedoc library via pip.
 
-You need torch~=1.11.0 and torchvision~=0.12.0 installed. If you already have torch and torchvision in your environment:
+You need `torch~=1.11.0` and `torchvision~=0.12.0` installed. If you already have torch and torchvision in your environment:
 
-```bash
+```shell
 pip install dedoc
 ```
 
 Or you can install dedoc with torch and torchvision included:
 
-```bash
+```shell
 pip install "dedoc[torch]"
 ```
 
 ## Install and run dedoc from sources
 
-If you want to run dedoc as a service from sources. it's possible to run dedoc locally.
-However, it isn't suitable for any operating system (Ubuntu 20+ is recommended) and
+If you want to run dedoc as a service from sources, it's possible to run dedoc locally.
+However, it is not suitable for all operating systems (`Ubuntu 20+` is recommended) and
 there may be not enough machine's resources for its work.
-You should have `python` (python3.8, python3.9 are recommended) and `pip` installed.
+You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed.
 
 ### 1. Install necessary packages: according to instructions [install necessary packages](#1-Install-necessary-packages)
 
@@ -157,7 +157,7 @@ You should have `python` (python3.8, python3.9 are recommended) and `pip` instal
 
 Below are the instructions for installing the package `virtualenvwrapper`:
 
-```bash
+```shell
 sudo pip3 install virtualenv virtualenvwrapper
 mkdir ~/.virtualenvs
 export WORKON_HOME=~/.virtualenvs
@@ -169,7 +169,7 @@ mkvirtualenv dedoc_env
 
 ### 4. Install python's requirements and launch dedoc service on default port `1231`:
 
-```bash
+```shell
 # clone dedoc project
 git clone https://github.com/ispras/dedoc.git
 cd dedoc
@@ -183,14 +183,14 @@ python dedoc/main.py -c ./dedoc/config.py
 Now you can go to the `localhost:1231` and look at the docs and examples.
 
 ## Option: You can change the port of service:
-you need to change environment DOCREADER_PORT
+You need to change the environment variable `DOCREADER_PORT`.
 
-1. For local service launching on your_port (1166 example). [Install instruction from sources](#Install-and-run-dedoc-from-sources) and launch with environment: 
-```bash
+1. To launch the service locally on `your_port` (e.g. `1166`), install dedoc ([installation instructions](#Install-and-run-dedoc-from-sources)) and launch it with the environment variable set:
+```shell
 DOCREADER_PORT=1166 python dedoc/main.py -c ./dedoc/config.py
 ```
 
-2. For service launching in docker-container you need to change port value in DOCREADER_PORT env and field 'ports' in docker-compose.yml file:
+2. To launch the service in a docker container, you need to change the port value in the `DOCREADER_PORT` environment variable and in the `ports` field of the `docker-compose.yml` file:
 ```yaml
     ...
     dedoc:
 
@@ -1 +1 @@
-0.11.2
+1.0
@@ -1,103 +1,54 @@
-from typing import Any, Optional
-
-from fastapi import Body
-from pydantic import BaseModel
-
-
-class QueryParameters(BaseModel):
-    document_type: Optional[str]
-    structure_type: Optional[str]
-    return_format: Optional[str]
-
-    with_attachments: Optional[str]
-    need_content_analysis: Optional[str]
-    recursion_deep_attachments: Optional[str]
-    return_base64: Optional[str]
-    attachments_dir: Optional[str]
-
-    insert_table: Optional[str]
-    need_pdf_table_analysis: Optional[str]
-    table_type: Optional[str]
-    orient_analysis_cells: Optional[str]
-    orient_cell_angle: Optional[str]
-
-    pdf_with_text_layer: Optional[str]
-    language: Optional[str]
-    pages: Optional[str]
-    is_one_column_document: Optional[str]
-    document_orientation: Optional[str]
-    need_header_footer_analysis: Optional[str]
-    need_binarization: Optional[str]
-
-    delimiter: Optional[str]
-    encoding: Optional[str]
-    html_fields: Optional[str]
-    handle_invisible_table: Optional[str]
-
-    def __init__(self,
-                 # type of document structure parsing
-                 document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),  # noqa
-                 structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),  # noqa
-                 return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),  # noqa
-
-                 # attachments handling
-                 with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),  # noqa
-                 need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),  # noqa
-                 recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),  # noqa
-                 return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),  # noqa
-                 attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),  # noqa
-
-                 # tables handling
-                 insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),  # noqa
-                 need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),  # noqa
-                 table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),  # noqa
-                 orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),  # noqa
-                 orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),  # noqa
-
-                 # pdf handling
-                 pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),  # noqa
-                 language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),  # noqa
-                 pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),  # noqa
-                 is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),  # noqa
-                 document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),  # noqa
-                 need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),  # noqa
-                 need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),  # noqa
-
-                 # other formats handling
-                 delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),  # noqa
-                 encoding: Optional[str] = Body(description="a document encoding", default=None),  # noqa
-                 html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),  # noqa
-                 handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),  # noqa
-
-
-                 **data: Any) -> None:  # noqa
-
-        super().__init__(**data)
-        self.document_type: str = document_type or ""
-        self.structure_type: str = structure_type or "tree"
-        self.return_format: str = return_format or "json"
-
-        self.with_attachments: str = with_attachments or "false"
-        self.need_content_analysis: str = need_content_analysis or "false"
-        self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
-        self.return_base64: str = return_base64 or "false"
-        self.attachments_dir: str = attachments_dir
-
-        self.insert_table: str = insert_table or "false"
-        self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
-        self.table_type: str = table_type or ""
-        self.orient_analysis_cells: str = orient_analysis_cells or "false"
-        self.orient_cell_angle: str = orient_cell_angle or "90"
-
-        self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
-        self.language: str = language or "rus+eng"
-        self.pages: str = pages or ":"
-        self.is_one_column_document: str = is_one_column_document or "auto"
-        self.document_orientation: str = document_orientation or "auto"
-        self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
-        self.need_binarization: str = need_binarization or "false"
-
-        self.delimiter: str = delimiter
-        self.encoding: str = encoding
-        self.html_fields: str = html_fields or ""
-        self.handle_invisible_table: str = handle_invisible_table or "false"
+from dataclasses import asdict, dataclass
+from typing import Optional
+
+from fastapi import Form
+
+
+@dataclass
+class QueryParameters:
+    # type of document structure parsing
+    document_type: str = Form("", enum=["", "law", "tz", "diploma"], description="Document domain")
+    structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
+    return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
+                              description="Response representation, most types (except json) are used for debug purposes only")
+
+    # attachments handling
+    with_attachments: str = Form("false", enum=["true", "false"], description="Enable attached files extraction")
+    need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
+    recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
+    return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
+    attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")
+
+    # tables handling
+    need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")
+    table_type: str = Form("", description="Pipeline mode for table recognition")
+    orient_analysis_cells: str = Form("false", enum=["true", "false"], description="Enable analysis of rotated cells in table headers")
+    orient_cell_angle: str = Form("90", enum=["90", "270"],
+                                  description='Set cells orientation in table headers, "90" means 90 degrees counterclockwise cells rotation')
+
+    # pdf handling
+    pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
+                                    description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
+    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng"], description="Recognition language")
+    pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
+    is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
+                                       description='One or multiple column document, "auto" - predict number of page columns automatically')
+    document_orientation: str = Form("auto", enum=["auto", "no_change"],
+                                     description='Orientation of the document pages, "auto" - predict orientation (0, 90, 180, 270 degrees), '
+                                                 '"no_change" - set vertical orientation of the document without using an orientation classifier')
+    need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
+    need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
+
+    # other formats handling
+    delimiter: Optional[str] = Form(None, description="Column separator for CSV files")
+    encoding: Optional[str] = Form(None, description="Document encoding")
+    html_fields: str = Form("", description="List of fields for JSON documents to be parsed as HTML documents")
+    handle_invisible_table: str = Form("false", enum=["true", "false"], description="Handle tables without visible borders as tables in HTML")
+
+    def to_dict(self) -> dict:
+        parameters = {}
+
+        for parameter_name, parameter_value in asdict(self).items():
+            parameters[parameter_name] = getattr(parameter_value, "default", parameter_value)
+
+        return parameters