Skip to content

Commit a2e8feb

Browse files
authored
update master (#288)
* Add version checking for release pipeline * TLDR-260 fix docx bug: change attachment annotation value (#286) * TLDR-260 fix docx bug: change attachment annotation value from filename to file uid * TLDR-260 review fixes * new version 0.9.1 (#287)
1 parent 5237390 commit a2e8feb

File tree

8 files changed

+66
-49
lines changed

8 files changed

+66
-49
lines changed

.github/workflows/release.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ jobs:
1717
with:
1818
python-version: '3.9'
1919

20-
# - name: Check version correctness
21-
# run: |
22-
# python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
23-
# --new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
24-
# --pre_release ${{ github.event.release.prerelease }}
20+
- name: Check version correctness
21+
run: |
22+
python3 .github/check_version.py --branch ${{ github.event.release.target_commitish }} --tag $GITHUB_REF_NAME \
23+
--new_version $(< VERSION) --old_version $(git cat-file -p $(git rev-parse "$GITHUB_SHA"^1):VERSION) \
24+
--pre_release ${{ github.event.release.prerelease }}
2525
2626
- name: Install dependencies
2727
run: |

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.9
1+
0.9.1

dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
import tempfile
55
import zipfile
66
from typing import List, Optional
7+
78
from bs4 import BeautifulSoup, Tag
89

910
from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
11+
from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
1012
from dedoc.data_structures.attached_file import AttachedFile
1113
from dedoc.extensions import recognized_extensions, recognized_mimes
12-
from dedoc.utils.utils import splitext_
1314

1415

1516
class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
@@ -30,17 +31,16 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
3031
the methods' parameters.
3132
"""
3233
result = []
33-
name, ext = splitext_(filename)
34-
35-
if ext.lower() != '.docx':
36-
return []
34+
try:
35+
with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
36+
diagram_attachments = self.__extract_diagrams(zfile)
37+
need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
38+
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
3739

38-
with zipfile.ZipFile(os.path.join(tmpdir, filename), 'r') as zfile:
39-
diagram_attachments = self.__extract_diagrams(zfile)
40-
need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
41-
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
40+
result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
4241

43-
result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
42+
except zipfile.BadZipFile:
43+
raise BadFileFormatException("Bad docx file:\n file_name = {}. Seems docx is broken".format(filename))
4444
return result
4545

4646
def __extract_diagrams(self, document: zipfile.ZipFile) -> List[tuple]:

dedoc/readers/docx_reader/data_structures/docx_document.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from bs4 import BeautifulSoup, Tag
1010

1111
from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
12+
from dedoc.data_structures.attached_file import AttachedFile
1213
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
1314
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
1415
from dedoc.data_structures.line_with_meta import LineWithMeta
@@ -23,10 +24,11 @@
2324

2425

2526
class DocxDocument:
26-
def __init__(self, path: str, logger: logging.Logger) -> None:
27+
def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.Logger) -> None:
2728
self.logger = logger
2829
self.path = path
2930
self.path_hash = calculate_file_hash(path=path)
31+
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
3032

3133
self.document_bs_tree = self.__get_bs_tree('word/document.xml')
3234
if self.document_bs_tree is None:
@@ -68,7 +70,7 @@ def __get_lines(self, logger: logging.Logger) -> List[LineWithMeta]:
6870
continue
6971

7072
if paragraph_xml.pict: # diagrams are saved using docx_attachments_extractor
71-
self.__handle_diagrams_xml(paragraph_xml, diagram_refs, uids_set, cnt)
73+
self.__handle_diagram_xml(paragraph_xml, diagram_refs, uids_set, cnt)
7274
continue
7375

7476
if paragraph_xml.name != 'p':
@@ -179,11 +181,22 @@ def __handle_images_xml(self, xmls: List[Tag], image_refs: dict, uids_set: set,
179181

180182
for image_xml in xmls:
181183
blips = image_xml.find_all("a:blip")
182-
image_uid = images_rels[blips[0]["r:embed"]]
184+
image_name = images_rels[blips[0]["r:embed"]]
185+
186+
if image_name in self.attachment_name2uid:
187+
image_uid = self.attachment_name2uid[image_name]
188+
else:
189+
self.logger.info(f"Attachment with name {image_name} not found")
190+
continue
183191
image_refs[len(self.paragraph_list) - 1].append(image_uid)
184192

185-
def __handle_diagrams_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
186-
diagram_uid = hashlib.md5(xml.encode()).hexdigest()
193+
def __handle_diagram_xml(self, xml: Tag, diagram_refs: dict, uids_set: set, cnt: Counter) -> None:
194+
diagram_name = f"{hashlib.md5(xml.encode()).hexdigest()}.docx"
195+
if diagram_name in self.attachment_name2uid:
196+
diagram_uid = self.attachment_name2uid[diagram_name]
197+
else:
198+
self.logger.info(f"Attachment with name {diagram_name} not found")
199+
return
187200
self.__prepare_paragraph_list(uids_set, cnt)
188201
diagram_refs[len(self.paragraph_list) - 1].append(diagram_uid)
189202

dedoc/readers/docx_reader/docx_reader.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@
33
from typing import Optional, List
44

55
from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
6+
from dedoc.data_structures.hierarchy_level import HierarchyLevel
67
from dedoc.data_structures.line_with_meta import LineWithMeta
78
from dedoc.data_structures.unstructured_document import UnstructuredDocument
89
from dedoc.extensions import recognized_extensions, recognized_mimes
910
from dedoc.readers.base_reader import BaseReader
1011
from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
11-
from dedoc.data_structures.hierarchy_level import HierarchyLevel
1212

1313

1414
class DocxReader(BaseReader):
@@ -34,8 +34,9 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
3434
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
3535
"""
3636
parameters = {} if parameters is None else parameters
37-
docx_document = self._parse_document(path=path)
3837
attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters)
38+
39+
docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger)
3940
lines = self.__fix_lines(docx_document.lines)
4041
return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[])
4142

@@ -54,7 +55,3 @@ def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]:
5455
annotation.end += 1
5556

5657
return lines
57-
58-
def _parse_document(self, path: str) -> DocxDocument:
59-
docx_document = DocxDocument(path=path, logger=self.logger)
60-
return docx_document

dedoc/train_dataset/taskers/images_creators/concrete_creators/docx_images_creator.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from copy import deepcopy
1010
from typing import Iterator, Optional, Dict, Iterable, Tuple
1111
from typing import List
12+
1213
import numpy as np
1314
from PIL import Image
1415
from PIL import ImageColor
@@ -18,11 +19,9 @@
1819
from dedoc.common.exceptions.conversion_exception import ConversionException
1920
from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument
2021
from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
21-
from dedoc.readers.docx_reader.docx_reader import DocxReader
2222
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
23-
from dedoc.train_dataset.train_dataset_utils import get_original_document_path
24-
2523
from dedoc.train_dataset.taskers.images_creators.concrete_creators.abstract_images_creator import AbstractImagesCreator
24+
from dedoc.train_dataset.train_dataset_utils import get_original_document_path
2625
from dedoc.utils.image_utils import get_concat_v
2726

2827
PairedPdf = namedtuple("PairedPdf", ["many_color_pdf", "two_color_pdf", "many_colors", "two_colors"])
@@ -32,7 +31,6 @@ class DocxImagesCreator(AbstractImagesCreator):
3231

3332
def __init__(self, path2docs: str, *, config: dict) -> None:
3433
self.path2docs = path2docs
35-
self.docx_reader = DocxReader(config=config)
3634
self.color_step = 16
3735
self.first_color = 15
3836
self.base_color = 0
@@ -58,7 +56,7 @@ def add_images(self, page: List[dict], archive: zipfile.ZipFile) -> None:
5856
"""
5957
path2doc = get_original_document_path(self.path2docs, page)
6058
# here we get a half-processed docx document (with raw xml)
61-
document = self.docx_reader._parse_document(path2doc)
59+
document = DocxDocument(path=path2doc, attachments=[], logger=self.logger)
6260
with zipfile.ZipFile(path2doc) as d:
6361
with tempfile.TemporaryDirectory() as tmp_dir:
6462
pdfs = self.__create_pair_pdfs(docx_archive=d, document=document, tmp_dir=tmp_dir)

docs/source/changelog.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
Changelog
22
=========
33

4+
v0.9.1 (2023-07-05)
5+
-------------------
6+
Release note: `v0.9.1 <https://github.com/ispras/dedoc/releases/tag/v0.9.1>`_
7+
8+
* Fixed bug with `AttachAnnotation` in docx: its value is now equal to the attachment uid instead of the file name.
9+
410

511
v0.9 (2023-06-26)
6-
-------------------
12+
-----------------
713
Release note: `v0.9 <https://github.com/ispras/dedoc/releases/tag/v0.9>`_
814

9-
* Publication of the first version of dedoc library
15+
* Publication of the first version of dedoc library.

tests/api_tests/test_api_with_images_refs.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -10,52 +10,55 @@ class TestApiImageRefs(AbstractTestApiDocReader):
1010
def test_docx_with_images(self) -> None:
1111
file_name = "docx_with_images.docx"
1212
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
13+
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
1314
content = result["content"]["structure"]
1415

1516
image_paragraph = content["subparagraphs"][0]
16-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.png')
17+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.png'])
1718

1819
image_paragraph = content["subparagraphs"][2]
19-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
20-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
20+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
21+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])
2122

2223
image_paragraph = content["subparagraphs"][5]
23-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image4.jpeg')
24+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image4.jpeg'])
2425

2526
image_paragraph = content["subparagraphs"][6]
26-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image5.jpeg')
27-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image6.jpeg')
28-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image7.jpeg')
27+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image5.jpeg'])
28+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image6.jpeg'])
29+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image7.jpeg'])
2930

3031
def test_odt_with_images(self) -> None:
3132
file_name = "odt_with_images.odt"
3233
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
34+
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
3335
content = result["content"]["structure"]
3436

3537
image_paragraph = content["subparagraphs"][0]
36-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
38+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])
3739

3840
image_paragraph = content["subparagraphs"][7]
39-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
41+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
4042

4143
image_paragraph = content["subparagraphs"][8]
42-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.jpeg')
44+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.jpeg'])
4345

4446
def test_docx_with_images_from_mac(self) -> None:
4547
file_name = "doc_with_images.docx"
4648
result = self._send_request(file_name, dict(with_attachments=True, structure_type="linear"))
49+
attachments_name2uid = {attachment["metadata"]["file_name"]: attachment["metadata"]["uid"] for attachment in result["attachments"]}
4750
content = result["content"]["structure"]
4851

4952
image_paragraph = content["subparagraphs"][2]
50-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image1.jpeg')
53+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image1.jpeg'])
5154

5255
image_paragraph = content["subparagraphs"][3]
53-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image2.jpeg')
56+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image2.jpeg'])
5457

5558
image_paragraph = content["subparagraphs"][5]
56-
self.__check_image_paragraph(image_paragraph=image_paragraph, image_name='image3.png')
59+
self.__check_image_paragraph(image_paragraph=image_paragraph, image_uid=attachments_name2uid['image3.png'])
5760

58-
def __check_image_paragraph(self, image_paragraph: dict, image_name: str) -> None:
61+
def __check_image_paragraph(self, image_paragraph: dict, image_uid: str) -> None:
5962
text = image_paragraph["text"]
6063
image_annotations = image_paragraph["annotations"]
61-
self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_name}, image_annotations)
64+
self.assertIn({'start': 0, 'end': len(text), 'name': 'attachment', 'value': image_uid}, image_annotations)

0 commit comments

Comments
 (0)