-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfile_handler.py
More file actions
124 lines (95 loc) · 4.18 KB
/
file_handler.py
File metadata and controls
124 lines (95 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import io
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import List
import aspose.words as aw
from colorama import Fore, Style
from docx import Document
from docx2python import docx2python
class FileHandler:
from text_editor import TextEditor
def __init__(self, text_editor: TextEditor) -> None:
self.text_editor = text_editor
def read_docx_tables(self, file_path: str):
"""
Reads all the tables in a docx file and returns them as a list.
"""
document = Document(file_path)
tables = document.tables
return tables
def extract_table_columns(
self, table, columns: list[int], num_rows: int = -1
) -> List[List[str]]:
"""
Extracts the contents of the specified columns of a table up to a specified number of rows.
"""
extracted_columns = [[] for _ in range(len(columns))]
if num_rows == -1:
num_rows = len(table.rows)
for i, row in enumerate(table.rows):
if num_rows > -1 and i >= num_rows:
break
for j, col in enumerate(columns):
if col < len(row.cells):
extracted_columns[j].append(row.cells[col].text.strip())
if i % 100 == 0:
print(
f"{Fore.GREEN}Current row number: {i} of {len(table.rows)}{Style.RESET_ALL}, First column: {Fore.BLUE}{extracted_columns[0][-1]}{Style.RESET_ALL}"
)
return extracted_columns
def load_file(self, file_name: str) -> str:
with open(file_name, "r") as file:
extension = Path(file_name).suffix.lstrip(".")
match extension:
case "docx" | "doc" | "rtf":
file_path = None
match extension:
case "rtf":
# Load file as bytesio
with open(file_name, "rb") as f:
data = f.read()
stream = io.BytesIO(data)
doc = aw.Document(stream)
# Save as docx
stream = io.BytesIO()
doc.save(stream, aw.SaveFormat.DOCX)
stream.seek(0)
file_path = stream
case "docx" | "doc":
file_path = file_name
current_template = self.text_editor.current_template
if current_template.get("simple", True):
with docx2python(file_path) as docx_content:
text = docx_content.text
else:
tables = self.read_docx_tables(file_path)
text = ""
table = tables[current_template["row"]]
column_index = current_template["target_col_index"]
target1 = self.extract_table_columns(table, [column_index])[0]
text = "\n".join(target1)
case "xliff":
# Parse the xliff file
root = ET.parse(file_name).getroot()
# source = root.findall(".//{urn:oasis:names:tc:xliff:document:1.2}source")
target = root.findall(
".//{urn:oasis:names:tc:xliff:document:1.2}target"
)
text = ""
for t in target:
if t.text:
text += t.text + "\n"
case "mxliff":
# Parse the xliff file
root = ET.parse(file_name).getroot()
# source = root.findall(".//{urn:oasis:names:tc:xliff:document:1.2}source")
target = root.findall(
".//{urn:oasis:names:tc:xliff:document:1.2}target"
)
text = ""
for t in target:
if t.text:
text += t.text + "\n"
case _:
text = file.read()
return text