99import pytest
1010from azure .ai .documentintelligence .aio import DocumentIntelligenceClient
1111from azure .ai .documentintelligence .models import (
12+ AnalyzeDocumentRequest ,
1213 AnalyzeResult ,
1314 BoundingRegion ,
1415 DocumentCaption ,
2122from azure .core .credentials import AzureKeyCredential
2223from azure .core .exceptions import HttpResponseError
2324from PIL import Image , ImageChops
25+ from werkzeug .datastructures import FileStorage
2426
2527from prepdocslib .figureprocessor import (
2628 FigureProcessor ,
@@ -178,8 +180,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box):
178180@pytest .mark .asyncio
179181async def test_parse_simple (monkeypatch ):
180182 mock_poller = MagicMock ()
183+ captured_bodies : list [AnalyzeDocumentRequest ] = []
181184
182- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
185+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
186+ body = kwargs ["body" ]
187+ captured_bodies .append (body )
183188 return mock_poller
184189
185190 async def mock_poller_result ():
@@ -205,13 +210,106 @@ async def mock_poller_result():
205210 assert pages [0 ].page_num == 0
206211 assert pages [0 ].offset == 0
207212 assert pages [0 ].text == "Page content"
213+ assert len (captured_bodies ) == 1
214+ assert isinstance (captured_bodies [0 ], AnalyzeDocumentRequest )
215+ assert captured_bodies [0 ].bytes_source == b"pdf content bytes"
216+
217+
@pytest.mark.asyncio
async def test_parse_with_filestorage(monkeypatch):
    """Parse a werkzeug ``FileStorage`` upload and inspect the analysis request.

    ``DocumentIntelligenceClient.begin_analyze_document`` is replaced with a
    fake that records the ``body`` keyword argument. The test then checks that
    one page comes back and that the recorded body is an
    ``AnalyzeDocumentRequest`` whose ``bytes_source`` equals the uploaded bytes.
    """
    payload = b"pdf content bytes"
    poller = MagicMock()
    sent_bodies: list[AnalyzeDocumentRequest] = []

    async def fake_begin_analyze_document(self, model_id, **kwargs):
        # Keep the request body around so it can be asserted on after parsing.
        sent_bodies.append(kwargs["body"])
        return poller

    async def fake_poller_result():
        # One page whose single span covers the 12 characters of "Page content".
        only_span = DocumentSpan(offset=0, length=12)
        only_page = DocumentPage(page_number=1, spans=[only_span])
        return AnalyzeResult(
            content="Page content",
            pages=[only_page],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", fake_begin_analyze_document)
    monkeypatch.setattr(poller, "result", fake_poller_result)

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )

    # Wrap the raw bytes the way an uploaded file is presented, then give the
    # wrapper an explicit ``name`` attribute.
    upload = FileStorage(stream=io.BytesIO(payload), filename="upload.pdf")
    upload.name = "upload.pdf"

    pages = [page async for page in parser.parse(upload)]

    # Exactly one page, carrying the mocked content.
    assert len(pages) == 1
    first_page = pages[0]
    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "Page content"

    # Exactly one analysis call, carrying the uploaded bytes.
    assert len(sent_bodies) == 1
    request_body = sent_bodies[0]
    assert isinstance(request_body, AnalyzeDocumentRequest)
    assert request_body.bytes_source == payload
254+
255+
@pytest.mark.asyncio
async def test_parse_with_non_seekable_stream(monkeypatch):
    """Parse a minimal stream object that offers only ``name`` and ``read()``.

    The fake stream defined below has no ``seek`` or ``tell`` method, so any
    attempt to rewind it raises ``AttributeError``. The test checks that
    parsing still yields one page and that the request body captured from
    ``begin_analyze_document`` is an ``AnalyzeDocumentRequest`` whose
    ``bytes_source`` equals the stream's data.
    """
    mock_poller = MagicMock()
    captured_bodies: list[AnalyzeDocumentRequest] = []

    async def mock_begin_analyze_document(self, model_id, **kwargs):
        # Record the request body so it can be asserted on after parsing.
        captured_bodies.append(kwargs["body"])
        return mock_poller

    async def mock_poller_result():
        # One page whose single span covers the 12 characters of "Page content".
        return AnalyzeResult(
            content="Page content",
            pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=12)])],
            tables=[],
            figures=[],
        )

    monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document)
    monkeypatch.setattr(mock_poller, "result", mock_poller_result)

    class NonSeekableStream:
        """File-like stand-in exposing only a ``name`` property and ``read()``."""

        def __init__(self, data: bytes, name: str):
            # No consumption tracking is kept: ``read()`` returns the full
            # payload on every call, so there is no state to record.
            self._data = data
            self._name = name

        @property
        def name(self) -> str:  # type: ignore[override]
            return self._name

        def read(self) -> bytes:
            return self._data

    parser = DocumentAnalysisParser(
        endpoint="https://example.com",
        credential=MockAzureCredential(),
    )

    stream = NonSeekableStream(b"pdf content bytes", "nonseekable.pdf")
    pages = [page async for page in parser.parse(stream)]

    assert len(pages) == 1
    assert pages[0].page_num == 0
    assert pages[0].offset == 0
    assert pages[0].text == "Page content"
    assert len(captured_bodies) == 1
    assert isinstance(captured_bodies[0], AnalyzeDocumentRequest)
    assert captured_bodies[0].bytes_source == b"pdf content bytes"
208304
209305
210306@pytest .mark .asyncio
211307async def test_parse_doc_with_tables (monkeypatch ):
212308 mock_poller = MagicMock ()
309+ captured_bodies : list [AnalyzeDocumentRequest ] = []
213310
214- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
311+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
312+ captured_bodies .append (kwargs ["body" ])
215313 return mock_poller
216314
217315 async def mock_poller_result ():
@@ -281,13 +379,17 @@ async def mock_poller_result():
281379 pages [0 ].text
282380 == "# Simple HTML Table\n \n \n <figure><table><tr><th>Header 1</th><th>Header 2</th></tr><tr><td>Cell 1</td><td>Cell 2</td></tr><tr><td>Cell 3</td><td>Cell 4</td></tr></table></figure>"
283381 )
382+ assert len (captured_bodies ) == 1
383+ assert isinstance (captured_bodies [0 ], AnalyzeDocumentRequest )
284384
285385
286386@pytest .mark .asyncio
287387async def test_parse_doc_with_figures (monkeypatch ):
288388 mock_poller = MagicMock ()
389+ captured_kwargs : list [dict ] = []
289390
290- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
391+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
392+ captured_kwargs .append (kwargs )
291393 return mock_poller
292394
293395 async def mock_poller_result ():
@@ -330,13 +432,20 @@ async def mock_poller_result():
330432 == '# Simple Figure\n \n This text is before the figure and NOT part of it.\n \n \n <figure id="1.1"></figure>\n \n \n This is text after the figure that\' s not part of it.'
331433 )
332434 assert pages [0 ].images [0 ].placeholder == '<figure id="1.1"></figure>'
435+ assert len (captured_kwargs ) == 1
436+ body = captured_kwargs [0 ]["body" ]
437+ assert isinstance (body , AnalyzeDocumentRequest )
438+ assert captured_kwargs [0 ]["output" ] == ["figures" ]
439+ assert captured_kwargs [0 ]["features" ] == ["ocrHighResolution" ]
333440
334441
335442@pytest .mark .asyncio
336443async def test_parse_unsupportedformat (monkeypatch , caplog ):
337444 mock_poller = MagicMock ()
445+ captured_kwargs : list [dict ] = []
338446
339- async def mock_begin_analyze_document (self , model_id , analyze_request , ** kwargs ):
447+ async def mock_begin_analyze_document (self , model_id , ** kwargs ):
448+ captured_kwargs .append (kwargs )
340449
341450 if kwargs .get ("features" ) == ["ocrHighResolution" ]:
342451
@@ -387,6 +496,11 @@ async def mock_poller_result():
387496 assert pages [0 ].page_num == 0
388497 assert pages [0 ].offset == 0
389498 assert pages [0 ].text == "Page content"
499+ assert len (captured_kwargs ) == 2
500+ assert captured_kwargs [0 ]["features" ] == ["ocrHighResolution" ]
501+ assert isinstance (captured_kwargs [0 ]["body" ], AnalyzeDocumentRequest )
502+ assert captured_kwargs [1 ].get ("features" ) is None
503+ assert isinstance (captured_kwargs [1 ]["body" ], AnalyzeDocumentRequest )
390504
391505
392506@pytest .mark .asyncio
0 commit comments