Skip to content

Commit 01a57d9

Browse files
authored
Merge pull request #177 from PDFMathTranslate/dev
feat: integrate BabelDOC v0.5.1 advanced processing options
2 parents 695500f + 78b6e81 commit 01a57d9

File tree

8 files changed

+135
-6
lines changed

8 files changed

+135
-6
lines changed

docs/en/advanced/advanced.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ In the following table, we list all advanced options for reference:
8080
| `--only-include-translated-page`| Only include translated pages in the output PDF. Effective only when --pages is used. | `pdf2zh example.pdf --pages 1-5 --only-include-translated-page` |
8181
| `--glossaries` | Custom glossary for translation. | `pdf2zh example.pdf --glossaries "glossary1.csv,glossary2.csv,glossary3.csv"` |
8282
| `--save-auto-extracted-glossary`| Save the automatically extracted glossary. | `pdf2zh example.pdf --save-auto-extracted-glossary` |
83+
| `--no-merge-alternating-line-numbers` | Disable merging of alternating line numbers and text paragraphs in documents with line numbers | `pdf2zh example.pdf --no-merge-alternating-line-numbers` |
84+
| `--no-remove-non-formula-lines` | Disable removal of non-formula lines within paragraph areas | `pdf2zh example.pdf --no-remove-non-formula-lines` |
85+
| `--non-formula-line-iou-threshold` | Set IoU threshold for identifying non-formula lines (0.0-1.0) | `pdf2zh example.pdf --non-formula-line-iou-threshold 0.85` |
86+
| `--figure-table-protection-threshold` | Set protection threshold for figures and tables (0.0-1.0). Lines within figures/tables will not be processed. | `pdf2zh example.pdf --figure-table-protection-threshold 0.95` |
87+
| `--skip-formula-offset-calculation` | Skip formula offset calculation during processing | `pdf2zh example.pdf --skip-formula-offset-calculation` |
8388

8489

8590
##### GUI Args

pdf2zh_next/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
# from pdf2zh_next.high_level import translate, translate_stream
3232

33-
__version__ = "2.4.2"
33+
__version__ = "2.5.0"
3434
__author__ = "Byaidu, awwaawwa"
3535
__license__ = "AGPL-3.0"
3636
__maintainer__ = "awwaawwa"

pdf2zh_next/config/model.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,26 @@ class PDFSettings(BaseModel):
178178
default=False,
179179
description="Only include translated pages in the output PDF. Effective only when --pages is used.",
180180
)
181+
no_merge_alternating_line_numbers: bool = Field(
182+
default=False,
183+
description="Handle alternating line numbers and text paragraphs in documents with line numbers",
184+
)
185+
no_remove_non_formula_lines: bool = Field(
186+
default=False,
187+
description="Remove non-formula lines within paragraph areas",
188+
)
189+
non_formula_line_iou_threshold: float = Field(
190+
default=0.9,
191+
description="IoU threshold for identifying non-formula lines",
192+
)
193+
figure_table_protection_threshold: float = Field(
194+
default=0.9,
195+
description="Protection threshold for figures and tables (lines within figures/tables will not be processed)",
196+
)
197+
skip_formula_offset_calculation: bool = Field(
198+
default=False,
199+
description="Skip formula offset calculation during processing",
200+
)
181201

182202

183203
class SettingsModel(BaseModel):
@@ -307,6 +327,16 @@ def validate_settings(self) -> None:
307327
f"Invalid primary font family: {self.translation.primary_font_family}"
308328
)
309329

330+
if not (0.0 <= self.pdf.non_formula_line_iou_threshold <= 1.0):
331+
raise ValueError(
332+
"non_formula_line_iou_threshold must be between 0.0 and 1.0"
333+
)
334+
335+
if not (0.0 <= self.pdf.figure_table_protection_threshold <= 1.0):
336+
raise ValueError(
337+
"figure_table_protection_threshold must be between 0.0 and 1.0"
338+
)
339+
310340
if self.pdf.auto_enable_ocr_workaround and self.pdf.ocr_workaround:
311341
self.pdf.ocr_workaround = False
312342
log.warning(

pdf2zh_next/const.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "2.4.2"
1+
__version__ = "2.5.0"
22
__major_version__ = "2"
33
__config_file_version__ = "3"
44

pdf2zh_next/gui.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,6 +462,15 @@ def _build_translate_settings(
462462
auto_enable_ocr_workaround = ui_inputs.get("auto_enable_ocr_workaround")
463463
only_include_translated_page = ui_inputs.get("only_include_translated_page")
464464

465+
# BabelDOC v0.5.1 new options
466+
merge_alternating_line_numbers = ui_inputs.get("merge_alternating_line_numbers")
467+
remove_non_formula_lines = ui_inputs.get("remove_non_formula_lines")
468+
non_formula_line_iou_threshold = ui_inputs.get("non_formula_line_iou_threshold")
469+
figure_table_protection_threshold = ui_inputs.get(
470+
"figure_table_protection_threshold"
471+
)
472+
skip_formula_offset_calculation = ui_inputs.get("skip_formula_offset_calculation")
473+
465474
# New input for custom_system_prompt
466475
custom_system_prompt_input = ui_inputs.get("custom_system_prompt_input")
467476
glossaries = ui_inputs.get("glossaries")
@@ -557,6 +566,23 @@ def _build_translate_settings(
557566
if formular_char_pattern:
558567
translate_settings.pdf.formular_char_pattern = formular_char_pattern
559568

569+
# Apply BabelDOC v0.5.1 new options
570+
translate_settings.pdf.no_merge_alternating_line_numbers = (
571+
not merge_alternating_line_numbers
572+
)
573+
translate_settings.pdf.no_remove_non_formula_lines = not remove_non_formula_lines
574+
if non_formula_line_iou_threshold is not None:
575+
translate_settings.pdf.non_formula_line_iou_threshold = float(
576+
non_formula_line_iou_threshold
577+
)
578+
if figure_table_protection_threshold is not None:
579+
translate_settings.pdf.figure_table_protection_threshold = float(
580+
figure_table_protection_threshold
581+
)
582+
translate_settings.pdf.skip_formula_offset_calculation = (
583+
skip_formula_offset_calculation
584+
)
585+
560586
assert service in TRANSLATION_ENGINE_METADATA_MAP, "UNKNOW TRANSLATION ENGINE!"
561587

562588
for metadata in TRANSLATION_ENGINE_METADATA:
@@ -800,6 +826,12 @@ async def translate_file(
800826
ocr_workaround,
801827
auto_enable_ocr_workaround,
802828
only_include_translated_page,
829+
# BabelDOC v0.5.1 new options
830+
merge_alternating_line_numbers,
831+
remove_non_formula_lines,
832+
non_formula_line_iou_threshold,
833+
figure_table_protection_threshold,
834+
skip_formula_offset_calculation,
803835
*translation_engine_arg_inputs,
804836
progress=None,
805837
):
@@ -889,6 +921,12 @@ async def translate_file(
889921
"ocr_workaround": ocr_workaround,
890922
"auto_enable_ocr_workaround": auto_enable_ocr_workaround,
891923
"only_include_translated_page": only_include_translated_page,
924+
# BabelDOC v0.5.1 new options
925+
"merge_alternating_line_numbers": merge_alternating_line_numbers,
926+
"remove_non_formula_lines": remove_non_formula_lines,
927+
"non_formula_line_iou_threshold": non_formula_line_iou_threshold,
928+
"figure_table_protection_threshold": figure_table_protection_threshold,
929+
"skip_formula_offset_calculation": skip_formula_offset_calculation,
892930
}
893931
for arg_name, arg_input in zip(
894932
__gui_service_arg_names, translation_engine_arg_inputs, strict=False
@@ -1434,6 +1472,50 @@ async def translate_file(
14341472
interactive=True,
14351473
)
14361474

1475+
# BabelDOC v0.5.1 new options
1476+
gr.Markdown("#### BabelDOC Advanced Options")
1477+
1478+
merge_alternating_line_numbers = gr.Checkbox(
1479+
label="Merge alternating line numbers",
1480+
info="Handle alternating line numbers and text paragraphs in documents with line numbers",
1481+
value=not settings.pdf.no_merge_alternating_line_numbers,
1482+
interactive=True,
1483+
)
1484+
1485+
remove_non_formula_lines = gr.Checkbox(
1486+
label="Remove non-formula lines",
1487+
info="Remove non-formula lines within paragraph areas",
1488+
value=not settings.pdf.no_remove_non_formula_lines,
1489+
interactive=True,
1490+
)
1491+
1492+
non_formula_line_iou_threshold = gr.Slider(
1493+
label="Non-formula line IoU threshold",
1494+
info="IoU threshold for identifying non-formula lines",
1495+
value=settings.pdf.non_formula_line_iou_threshold,
1496+
minimum=0.0,
1497+
maximum=1.0,
1498+
step=0.05,
1499+
interactive=True,
1500+
)
1501+
1502+
figure_table_protection_threshold = gr.Slider(
1503+
label="Figure/table protection threshold",
1504+
info="Protection threshold for figures and tables (lines within figures/tables will not be processed)",
1505+
value=settings.pdf.figure_table_protection_threshold,
1506+
minimum=0.0,
1507+
maximum=1.0,
1508+
step=0.05,
1509+
interactive=True,
1510+
)
1511+
1512+
skip_formula_offset_calculation = gr.Checkbox(
1513+
label="Skip formula offset calculation",
1514+
info="Skip formula offset calculation during processing",
1515+
value=settings.pdf.skip_formula_offset_calculation,
1516+
interactive=True,
1517+
)
1518+
14371519
output_title = gr.Markdown("## Translated", visible=False)
14381520
output_file_mono = gr.File(
14391521
label="Download Translation (Mono)", visible=False
@@ -1697,6 +1779,12 @@ def on_service_change_with_rate_limit(mode, service_name):
16971779
ocr_workaround,
16981780
auto_enable_ocr_workaround,
16991781
only_include_translated_page,
1782+
# BabelDOC v0.5.1 new options
1783+
merge_alternating_line_numbers,
1784+
remove_non_formula_lines,
1785+
non_formula_line_iou_threshold,
1786+
figure_table_protection_threshold,
1787+
skip_formula_offset_calculation,
17001788
*translation_engine_arg_inputs,
17011789
],
17021790
outputs=[

pdf2zh_next/high_level.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,12 @@ def create_babeldoc_config(settings: SettingsModel, file: Path) -> BabelDOCConfi
483483
auto_extract_glossary=not settings.translation.no_auto_extract_glossary,
484484
primary_font_family=settings.translation.primary_font_family,
485485
only_include_translated_page=settings.pdf.only_include_translated_page,
486+
# BabelDOC v0.5.1 new options
487+
merge_alternating_line_numbers=not settings.pdf.no_merge_alternating_line_numbers,
488+
remove_non_formula_lines=not settings.pdf.no_remove_non_formula_lines,
489+
non_formula_line_iou_threshold=settings.pdf.non_formula_line_iou_threshold,
490+
figure_table_protection_threshold=settings.pdf.figure_table_protection_threshold,
491+
skip_formula_offset_calculation=settings.pdf.skip_formula_offset_calculation,
486492
)
487493
return babeldoc_config
488494

pdf2zh_next/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pdf2zh_next.config import ConfigManager
1515
from pdf2zh_next.high_level import do_translate_file_async
1616

17-
__version__ = "2.4.2"
17+
__version__ = "2.5.0"
1818

1919
logger = logging.getLogger(__name__)
2020

pyproject.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "pdf2zh-next"
3-
version = "2.4.2"
3+
version = "2.5.0"
44
description = "Latex PDF Translator"
55
authors = [
66
{ name = "awwaawwa", email = "[email protected]" },
@@ -34,7 +34,7 @@ dependencies = [
3434
"gradio_pdf>=0.0.21",
3535
"peewee>=3.17.8",
3636
"fontTools",
37-
"babeldoc>=0.4.11, <0.5.0",
37+
"babeldoc>=0.5.1, <0.6.0",
3838
"rich",
3939
"pydantic-settings>=2.8.1",
4040
"pydantic>=2.10.6",
@@ -81,7 +81,7 @@ max-line-length = 88
8181

8282

8383
[bumpver]
84-
current_version = "2.4.2"
84+
current_version = "2.5.0"
8585
version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
8686

8787
[bumpver.file_patterns]

0 commit comments

Comments
 (0)