-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExtractor.py
More file actions
32 lines (25 loc) · 796 Bytes
/
Extractor.py
File metadata and controls
32 lines (25 loc) · 796 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import Color
import pytesseract as tess
# Tell pytesseract which Tesseract executable to run; this is a relative,
# Windows-style path, so it is resolved from wherever the program is started.
tess.pytesseract.tesseract_cmd = r'Tesseract-OCR\tesseract.exe'
from PIL import Image
# Characters that remove_garbage() strips from the OCR text in Extract_Table()
# before the text is split into rows and tokens.
garbage = '=#«»|'
def remove_garbage(text, garbage_list):
    """Return *text* with every entry of *garbage_list* deleted.

    Each item produced by iterating *garbage_list* (the characters of a
    string, or the elements of a list) is removed from the text wherever it
    occurs. Afterwards every ``'. '`` (a period followed by a space) is
    reduced to a single space.
    """
    cleaned = text
    for unwanted in garbage_list:
        cleaned = cleaned.replace(unwanted, '')
    # Drop periods that are directly followed by a space, keeping the space.
    cleaned = cleaned.replace('. ', ' ')
    return cleaned
def Extract_Table(path):
    """Run OCR on the image at *path* and return its text as rows of tokens.

    The image is opened with PIL and passed to ``tess.image_to_string``. The
    resulting text is cleaned with ``remove_garbage`` using the module-level
    ``garbage`` characters, then split into lines and space-separated tokens.

    Args:
        path: Location of the image, in any form ``Image.open`` accepts.

    Returns:
        A list with one entry per non-empty line of the cleaned OCR text.
        Each entry is the list of non-empty tokens obtained by splitting that
        line on single spaces (a line made up only of spaces therefore yields
        an empty list). If the image cannot be opened, an error message is
        printed, the program waits for the user, and an empty list is
        returned.
    """
    try:
        img = Image.open(path)
    except Exception:
        # `Exception` rather than a bare `except:` so Ctrl-C and SystemExit
        # still propagate instead of being reported as a load failure.
        print(Color.red, 'Unable to Load File', Color.reset, sep='')
        input()
        os.system('pause')
        # Without an image there is nothing to OCR; previously execution fell
        # through and crashed on the unbound `img` variable.
        return []

    # The context manager closes the underlying file once OCR has finished.
    with img:
        text = tess.image_to_string(img)

    text = remove_garbage(text, garbage)

    # One row per non-empty line; within a row, discard the empty strings
    # that runs of consecutive spaces produce when splitting on ' '.
    return [
        [token for token in line.split(' ') if token != '']
        for line in text.split('\n')
        if line != ''
    ]