Skip to content

Commit 12997ba

Browse files
committed
fix for test?
1 parent 122e9fb commit 12997ba

10 files changed

Lines changed: 128 additions & 120 deletions

mindee/image/extracted_image.py

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,42 +20,30 @@ class ExtractedImage:
2020
"""Generic class for image extraction."""
2121

2222
buffer: BinaryIO
23+
filename: str
2324
_page_id: int
2425
"""Id of the page the image was extracted from."""
2526
_element_id: int
2627
"""Id of the element on a given page."""
27-
filename: str
28-
"""Name of the file the image was extracted from."""
2928

3029
def __init__(
3130
self,
3231
img_byte_stream: BinaryIO,
33-
orig_filename: str,
34-
orig_extension: str,
32+
filename: str,
3533
page_id: int,
3634
element_id: int,
3735
) -> None:
3836
"""
3937
Initialize the ExtractedImage with a buffer and an internal file name.
4038
4139
:param img_byte_stream: The raw image bytes.
42-
:param orig_filename: Name of the file the image was extracted from.
40+
:param filename: Name of the file.
4341
:param page_id: ID of the page the element was found on.
4442
:param element_id: ID of the element in a page.
4543
"""
4644
self.buffer = img_byte_stream
47-
self.filename = orig_filename
48-
49-
if orig_extension.lower().endswith("pdf"):
50-
extension = "jpg"
51-
else:
52-
extension = orig_extension.lower()
5345
self.buffer.seek(0)
54-
pg_number = str(page_id).zfill(3)
55-
elem_number = str(element_id).zfill(3)
56-
self.internal_file_name = (
57-
f"{orig_filename}_page{pg_number}-{elem_number}.{extension}"
58-
)
46+
self.filename = filename
5947
self._page_id = page_id
6048
self._element_id = 0 if element_id is None else element_id
6149

@@ -88,7 +76,7 @@ def as_input_source(self) -> BytesInput:
8876
:returns: A BytesInput source.
8977
"""
9078
self.buffer.seek(0)
91-
return BytesInput(self.buffer.read(), self.internal_file_name)
79+
return BytesInput(self.buffer.read(), self.filename)
9280

9381
@property
9482
def page_id(self):

mindee/image/extracted_images.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
1+
from pathlib import Path
2+
13
from mindee.image.extracted_image import ExtractedImage
24

35

46
class ExtractedImages(list[ExtractedImage]):
57
"""List of extracted images."""
8+
9+
def save_all_to_disk(self, output_path: Path | str) -> None:
10+
"""Save all extracted images to disk."""
11+
for image in self:
12+
image.save_to_file(output_path)

mindee/image/image_extractor.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import io
4+
from pathlib import Path
45
from typing import Any, BinaryIO
56

67
from mindee.dependencies import requires_pypdfium2
@@ -28,7 +29,7 @@
2829

2930
@requires_pillow
3031
@requires_pypdfium2
31-
def attach_image_as_new_file( # type: ignore
32+
def _attach_image_as_new_file( # type: ignore
3233
input_buffer: BinaryIO,
3334
) -> pdfium.PdfDocument:
3435
"""
@@ -86,11 +87,11 @@ def extract_image_from_polygon(
8687
int(min_max_y.max * height),
8788
)
8889
)
89-
return save_image_to_buffer(cropped_image, file_format)
90+
return _save_image_to_buffer(cropped_image, file_format)
9091

9192

9293
@requires_pillow
93-
def save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO:
94+
def _save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO:
9495
"""
9596
Saves an image as a buffer.
9697
@@ -144,7 +145,8 @@ def extract_multiple_images_from_source(
144145
:param polygons: List of coordinates to pull the elements from.
145146
:return: List of byte arrays representing the extracted elements.
146147
"""
147-
page = load_pdf_doc(input_source).get_page(page_id)
148+
stem = Path(input_source.filename).stem
149+
page = _load_pdf_doc(input_source).get_page(page_id)
148150
page_content = page.render().to_pil()
149151
width, height = page.get_size()
150152

@@ -159,18 +161,16 @@ def extract_multiple_images_from_source(
159161
extracted_elements.append(
160162
ExtractedImage(
161163
image_data,
162-
input_source.filename,
163-
file_extension,
164+
f"{stem}_page-{(page_id + 1):03d}-item-{(element_id + 1):03d}.{file_extension}",
164165
page_id,
165166
element_id,
166167
)
167168
)
168-
169169
return extracted_elements
170170

171171

172172
@requires_pypdfium2
173-
def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore
173+
def _load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: ignore
174174
"""
175175
Loads a PDF document from a local input source.
176176
@@ -181,4 +181,4 @@ def load_pdf_doc(input_file: LocalInputSource) -> pdfium.PdfDocument: # type: i
181181
input_file.file_object.seek(0)
182182
return pdfium.PdfDocument(input_file.file_object.read())
183183

184-
return attach_image_as_new_file(input_file.file_object)
184+
return _attach_image_as_new_file(input_file.file_object)

mindee/pdf/extracted_pdf.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,14 @@ class ExtractedPDF:
2020

2121
buffer: BinaryIO
2222
filename: str
23+
_page_indexes: tuple[int, int]
2324

24-
def __init__(self, pdf_byte_stream: BinaryIO, filename: str):
25+
def __init__(
26+
self, pdf_byte_stream: BinaryIO, filename: str, page_indexes: tuple[int, int]
27+
):
2528
self.buffer = pdf_byte_stream
2629
self.filename = filename
30+
self._page_indexes = page_indexes
2731

2832
@requires_pypdfium2
2933
def get_page_count(self) -> int:
@@ -40,12 +44,12 @@ def save_to_file(self, output_path: Path | str):
4044
"""
4145
Writes the contents of the current PDF object to a file.
4246
43-
:param output_path: Path of the destination file. If
44-
not extension is provided, pdf will be appended by default.
47+
:param output_path: Path of the destination file.
48+
If no extension is provided, '.pdf' will be appended by default.
4549
"""
46-
out_path = Path(output_path)
47-
if out_path.resolve().is_dir():
48-
raise MindeeError("Provided path is not a file.")
50+
out_path = Path(output_path) / self.filename
51+
if not out_path.resolve().is_dir():
52+
raise MindeeError("Provided path is not a directory.")
4953
if not output_path or not out_path.parent.exists():
5054
raise MindeeError("Invalid save path provided {}.")
5155
if out_path.suffix.lower() != "pdf":
@@ -58,3 +62,8 @@ def as_input_source(self) -> BytesInput:
5862
"""Returns the current PDF object as a usable BytesInput source."""
5963
self.buffer.seek(0)
6064
return BytesInput(self.buffer.read(), self.filename)
65+
66+
@property
67+
def page_indexes(self) -> tuple[int, int]:
68+
"""This PDF was extracted from this page range of the original PDF."""
69+
return self._page_indexes

mindee/pdf/extracted_pdfs.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
1+
from pathlib import Path
2+
13
from mindee.pdf.extracted_pdf import ExtractedPDF
24

35

46
class ExtractedPDFs(list[ExtractedPDF]):
57
"""List of extracted PDFs."""
8+
9+
def save_all_to_disk(self, output_path: Path | str) -> None:
10+
"""Save all extracted PDFs to disk."""
11+
12+
for image in self:
13+
image.save_to_file(output_path)

mindee/pdf/pdf_extractor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def extract_sub_documents(
6868
"""
6969
Extract the sub-documents from the main pdf, based on the given list of page indexes.
7070
71-
:param page_indexes: List of list of numbers, representing page indexes.
71+
:param page_indexes: 2D list of numbers, representing page indexes.
7272
:return: A list of created PDFs.
7373
"""
7474
extracted_pdfs: list[ExtractedPDF] = []
@@ -80,10 +80,12 @@ def extract_sub_documents(
8080
for page_index in page_index_elem:
8181
if page_index > self.get_page_count():
8282
raise MindeeError(f"Index {page_index} is out of range.")
83-
formatted_max_index = f"{page_index_elem[len(page_index_elem) - 1] + 1:03d}"
84-
field_filename = f"{stem}_{(page_index_elem[0] + 1):03d}-{formatted_max_index}{extension}"
83+
first_page = page_index_elem[0]
84+
last_page = page_index_elem[len(page_index_elem) - 1]
8585
extracted_pdf = ExtractedPDF(
86-
self.cut_pages(page_index_elem), field_filename
86+
self.cut_pages(page_index_elem),
87+
f"{stem}_{(first_page + 1):03d}-{(last_page + 1):03d}{extension}",
88+
(first_page, last_page),
8789
)
8890
extracted_pdfs.append(extracted_pdf)
8991
return extracted_pdfs

tests/v2/file_operations/test_crop_operation.py

Lines changed: 37 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,56 +13,51 @@
1313
Image = pytest.importorskip("PIL.Image")
1414

1515

16-
@pytest.fixture
17-
def crops_single_page_path():
18-
return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg"
19-
20-
21-
@pytest.fixture
22-
def crops_multi_page_path():
23-
return V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf"
24-
25-
26-
@pytest.fixture
27-
def crops_single_page_json_path():
28-
return V2_PRODUCT_DATA_DIR / "crop" / "crop_single.json"
29-
30-
31-
@pytest.fixture
32-
def crops_multi_page_json_path():
33-
return V2_PRODUCT_DATA_DIR / "crop" / "crop_multiple.json"
34-
35-
3616
@pytest.mark.pillow
3717
@pytest.mark.pypdfium2
38-
def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path):
39-
input_sample = PathInput(crops_single_page_path)
40-
with open(crops_single_page_json_path, "rb") as f:
18+
def test_single_page_crop():
19+
input_sample = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg")
20+
with open(V2_PRODUCT_DATA_DIR / "crop" / "default_sample.json", "rb") as f:
4121
response = CropResponse(json.load(f))
4222
extracted_crops = response.inference.result.extract_from_input_source(input_sample)
43-
assert len(extracted_crops) == 1
23+
assert len(extracted_crops) == 2
24+
25+
crop0 = extracted_crops[0]
26+
assert crop0.page_id == 0
27+
assert crop0.element_id == 0
28+
assert crop0.filename == "default_sample_page-001-item-001.jpg"
29+
assert Image.open(crop0.buffer).size == (1057, 2071)
4430

45-
assert extracted_crops[0].page_id == 0
46-
assert extracted_crops[0].element_id == 0
47-
image_buffer_0 = Image.open(extracted_crops[0].buffer)
48-
assert image_buffer_0.size == (2823, 1571)
31+
crop1 = extracted_crops[1]
32+
assert crop1.page_id == 0
33+
assert crop1.element_id == 1
34+
assert crop1.filename == "default_sample_page-001-item-002.jpg"
35+
assert Image.open(crop1.buffer).size == (1298, 1869)
4936

5037

5138
@pytest.mark.pillow
5239
@pytest.mark.pypdfium2
53-
def test_multi_page_receipt_crop(crops_multi_page_path, crops_multi_page_json_path):
54-
input_sample = PathInput(crops_multi_page_path)
55-
with open(crops_multi_page_json_path, "rb") as f:
40+
def test_multi_page_crop():
41+
input_sample = PathInput(V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.pdf")
42+
with open(V2_PRODUCT_DATA_DIR / "crop" / "multipage_sample.json", "rb") as f:
5643
response = CropResponse(json.load(f))
5744
extracted_crops = response.inference.result.extract_from_input_source(input_sample)
58-
assert len(extracted_crops) == 2
59-
60-
assert extracted_crops[0].page_id == 0
61-
assert extracted_crops[0].element_id == 0
62-
image_buffer_0 = Image.open(extracted_crops[0].buffer)
63-
assert image_buffer_0.size == (156, 758)
64-
65-
assert extracted_crops[1].page_id == 0
66-
assert extracted_crops[1].element_id == 1
67-
image_buffer_1 = Image.open(extracted_crops[1].buffer)
68-
assert image_buffer_1.size == (187, 690)
45+
assert len(extracted_crops) == 5
46+
47+
crop0 = extracted_crops[0]
48+
assert crop0.page_id == 0
49+
assert crop0.element_id == 0
50+
assert crop0.filename == "multipage_sample_page-001-item-001.jpg"
51+
assert Image.open(crop0.buffer).size == (200, 553)
52+
53+
crop1 = extracted_crops[1]
54+
assert crop1.page_id == 0
55+
assert crop1.element_id == 1
56+
assert crop1.filename == "multipage_sample_page-001-item-002.jpg"
57+
assert Image.open(crop1.buffer).size == (203, 333)
58+
59+
crop4 = extracted_crops[4]
60+
assert crop4.page_id == 1
61+
assert crop4.element_id == 1
62+
assert crop4.filename == "multipage_sample_page-002-item-002.jpg"
63+
assert Image.open(crop4.buffer).size == (197, 520)

tests/v2/file_operations/test_crop_operation_integration.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,17 @@
1515
from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files
1616

1717

18-
@pytest.fixture
19-
def crop_sample():
20-
return V2_PRODUCT_DATA_DIR / "crop" / "default_sample.jpg"
21-
22-
2318
def check_findoc_return(findoc_response: ExtractionResponse):
2419
assert len(findoc_response.inference.model.id) > 0
2520
assert findoc_response.inference.result.fields.get("total_amount").value > 0
2621

2722

23+
output_files = [
24+
"default_sample_page-001-item-001.jpg",
25+
"default_sample_page-001-item-001.jpg",
26+
]
27+
28+
2829
@pytest.mark.pillow
2930
@pytest.mark.pypdfium2
3031
@pytest.mark.integration
@@ -38,30 +39,30 @@ def test_image_should_extract_crops():
3839
)
3940
assert len(response.inference.result.crops) == 2
4041

41-
extracted_images = extract_multiple_crops(
42+
extracted_crops = extract_multiple_crops(
4243
crop_input, response.inference.result.crops
4344
)
4445

45-
assert len(extracted_images) == 2
46-
assert extracted_images[0].filename == "default_sample.jpg_page1-0.jpg"
47-
assert extracted_images[1].filename == "default_sample.jpg_page1-1.jpg"
46+
assert len(extracted_crops) == 2
47+
assert extracted_crops[0].filename == output_files[0]
48+
assert extracted_crops[1].filename == output_files[1]
4849

4950
invoice_0 = client.enqueue_and_get_result(
5051
ExtractionResponse,
51-
extracted_images[0].as_input_source(),
52+
extracted_crops[0].as_input_source(),
5253
ExtractionParameters(
5354
getenv("MINDEE_V2_SE_TESTS_FINDOC_MODEL_ID"), close_file=False
5455
),
5556
)
5657
check_findoc_return(invoice_0)
57-
extracted_images.save_all_to_disk(OUTPUT_DIR)
58-
crop1size = os.path.getsize(OUTPUT_DIR / "crop_001.jpg")
59-
crop2size = os.path.getsize(OUTPUT_DIR / "crop_002.jpg")
60-
assert 180000 <= crop1size <= 199685
61-
assert 190000 <= crop2size <= 199433
58+
extracted_crops.save_all_to_disk(OUTPUT_DIR)
59+
crop0_size = os.path.getsize(OUTPUT_DIR / output_files[0])
60+
crop1_size = os.path.getsize(OUTPUT_DIR / output_files[1])
61+
assert 180000 <= crop0_size <= 199685
62+
assert 190000 <= crop1_size <= 199433
6263

6364

6465
@pytest.fixture(scope="module", autouse=True)
6566
def cleanup():
6667
yield
67-
cleanup_output_files(["crop_001.jpg", "crop_002.jpg"])
68+
cleanup_output_files()

0 commit comments

Comments
 (0)