Skip to content

Commit f524a35

Browse files
committed
Added tesseract version and libcurl check for URL input
1 parent 30a0b28 commit f524a35

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

pytesseract/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from .pytesseract import ALTONotSupported
33
from .pytesseract import get_languages
44
from .pytesseract import get_tesseract_version
5+
from .pytesseract import has_libcurl
56
from .pytesseract import image_to_alto_xml
67
from .pytesseract import image_to_boxes
78
from .pytesseract import image_to_data
@@ -14,6 +15,7 @@
1415
from .pytesseract import TesseractError
1516
from .pytesseract import TesseractNotFoundError
1617
from .pytesseract import TSVNotSupported
18+
from .pytesseract import URLNotSupported
1719

1820

1921
__version__ = '0.3.13'

pytesseract/pytesseract.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181

8282
TESSERACT_MIN_VERSION = Version('3.05')
8383
TESSERACT_ALTO_VERSION = Version('4.1.0')
84+
TESSERACT_URL_VERSION = Version('4.1.1')
8485

8586

8687
class Output:
@@ -123,6 +124,13 @@ def __init__(self):
123124
'ALTO output not supported. Tesseract >= 4.1.0 required',
124125
)
125126

127+
class URLNotSupported(EnvironmentError):
    """
    Error signalling that URL input cannot be used with this tesseract.

    The message states the requirement: Tesseract >= 4.1.1 built with
    libcurl.
    """

    def __init__(self):
        message = (
            'URL input not supported. '
            'Tesseract >= 4.1.1 built with libcurl required'
        )
        super().__init__(message)
133+
126134

127135
def kill(process, code):
128136
process.terminate()
@@ -210,6 +218,9 @@ def save(image):
210218
with NamedTemporaryFile(prefix='tess_', delete=False) as f:
211219
if isinstance(image, str):
212220
if image.startswith('http:') or image.startswith('https:'):
221+
if get_tesseract_version(cached=True) < TESSERACT_URL_VERSION\
222+
or not has_libcurl(cached=True):
223+
raise URLNotSupported()
213224
yield f.name, image
214225
else:
215226
yield f.name, realpath(normpath(normcase(image)))
@@ -473,6 +484,24 @@ def get_tesseract_version():
473484
return version
474485

475486

487+
@run_once
def has_libcurl():
    """
    Report whether the tesseract binary advertises libcurl support.

    Runs ``tesseract_cmd --version`` (stderr merged into stdout) and looks
    for the substring 'libcurl' in the decoded output.

    Returns True if 'libcurl' appears in the output, False otherwise.
    Raises TesseractNotFoundError if launching the command raises OSError.
    """
    version_cmd = [tesseract_cmd, '--version']
    try:
        banner = subprocess.check_output(
            version_cmd,
            stderr=subprocess.STDOUT,
            env=environ,
            stdin=subprocess.DEVNULL,
        )
    except OSError:
        raise TesseractNotFoundError()

    banner_text = banner.decode(DEFAULT_ENCODING)
    return 'libcurl' in banner_text
503+
504+
476505
def image_to_string(
477506
image,
478507
lang=None,

tests/pytesseract_test.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pytesseract import ALTONotSupported
1515
from pytesseract import get_languages
1616
from pytesseract import get_tesseract_version
17+
from pytesseract import has_libcurl
1718
from pytesseract import image_to_alto_xml
1819
from pytesseract import image_to_boxes
1920
from pytesseract import image_to_data
@@ -24,6 +25,7 @@
2425
from pytesseract import run_and_get_multiple_output
2526
from pytesseract import TesseractNotFoundError
2627
from pytesseract import TSVNotSupported
28+
from pytesseract import URLNotSupported
2729
from pytesseract.pytesseract import file_to_dict
2830
from pytesseract.pytesseract import numpy_installed
2931
from pytesseract.pytesseract import pandas_installed
@@ -45,12 +47,14 @@
4547
IS_PYTHON_3 = not IS_PYTHON_2
4648

4749
TESSERACT_VERSION = tuple(get_tesseract_version().release) # to skip tests
50+
HAS_LIBCURL = has_libcurl() # to skip tests
4851

4952
TESTS_DIR = path.dirname(path.abspath(__file__))
5053
DATA_DIR = path.join(TESTS_DIR, 'data')
5154
TESSDATA_DIR = path.join(TESTS_DIR, 'tessdata')
5255
TEST_JPEG = path.join(DATA_DIR, 'test.jpg')
53-
TEST_JPEG_URL = 'https://i.imgur.com/hWO45US.jpg'
56+
TEST_JPEG_URL = ('https://github.com/madmaze/pytesseract'
57+
'/blob/master/tests/data/test.jpg?raw=true')
5458

5559
pytestmark = pytest.mark.pytesseract # used marker for the module
5660
string_type = unicode if IS_PYTHON_2 else str # noqa: 821
@@ -128,8 +132,9 @@ def test_image_to_string_with_image_type(test_file):
128132
ids=['jpeg_url'],
129133
)
130134
def test_image_to_string_with_url(test_file):
131-
# Tesseract-ocr supports image URLs from version 4.1.1
132-
if TESSERACT_VERSION[0] < 4:
135+
# Tesseract-ocr supports image URLs from version 4.1.1
136+
# and must be built with libcurl.
137+
if TESSERACT_VERSION < (4, 1, 1) or not HAS_LIBCURL:
133138
pytest.skip('skip url test')
134139
assert 'The quick brown dog' in image_to_string(test_file)
135140

@@ -311,6 +316,15 @@ def test_image_to_data__pandas_support(test_file_small):
311316
image_to_data(test_file_small, output_type=Output.DATAFRAME)
312317

313318

319+
@pytest.mark.skipif(
    TESSERACT_VERSION >= (4, 1, 1) and HAS_LIBCURL,
    reason='requires tesseract < 4.1.1 or tesseract built without libcurl',
)
def test_image_to_string_url_support():
    """
    Passing a URL must raise URLNotSupported when URL input is unavailable.

    Skipped when tesseract is >= 4.1.1 and reports libcurl, since URL
    input is expected to work in that configuration.
    """
    with pytest.raises(URLNotSupported):
        image_to_string(TEST_JPEG_URL)
326+
327+
314328
@pytest.mark.skipif(
315329
TESSERACT_VERSION[:2] < (3, 5),
316330
reason='requires tesseract >= 3.05',

0 commit comments

Comments
 (0)