From 842256ad82326210ff8547af7bb2263c4d4e6c2c Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Tue, 18 Nov 2025 12:10:58 -0800 Subject: [PATCH 1/3] Hebrew itn (#366) * rebase to main Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebase Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebasing Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * responding to formatting pr Signed-off-by: tbartley94 * isort and moving string map to string file Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: tbartley94 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- Jenkinsfile | 25 ++- .../inverse_text_normalization/he/__init__.py | 13 ++ .../he/data/decimals/minutes_exception.tsv | 2 + .../he/data/measurements.tsv | 45 ++++ .../he/data/months.tsv | 13 ++ .../he/data/months_name2number.tsv | 12 ++ .../he/data/months_ordinal2number.tsv | 12 ++ .../he/data/numbers/__init__.py | 13 ++ .../he/data/numbers/decimal_fractions.tsv | 6 + .../he/data/numbers/digit.tsv | 20 ++ .../he/data/numbers/hundreds_exception.tsv | 2 + .../he/data/numbers/millions_exception.tsv | 1 + .../he/data/numbers/teen.tsv | 21 ++ .../he/data/numbers/thousands.tsv | 8 + .../he/data/numbers/thousands_exception.tsv | 2 + .../he/data/numbers/ties.tsv | 8 + .../he/data/numbers/viable_hours.tsv | 15 ++ .../he/data/numbers/zero.tsv | 1 + .../he/data/ordinals/__init__.py | 13 ++ .../he/data/ordinals/digit.tsv | 10 + .../he/data/prefix.tsv | 17 ++ .../he/data/spaced_measurements.tsv | 17 ++ .../he/data/time/__init__.py | 13 ++ .../he/data/time/day_suffix.tsv | 2 + .../he/data/time/evening_suffix.tsv | 2 + 
.../he/data/time/hour_to_evening.tsv | 7 + .../he/data/time/hour_to_night.tsv | 9 + .../he/data/time/hour_to_noon.tsv | 7 + .../he/data/time/midnight_to_hour.tsv | 1 + .../he/data/time/minute_to.tsv | 58 +++++ .../he/data/time/minute_to_verbose.tsv | 6 + .../he/data/time/minute_verbose.tsv | 8 + .../he/data/time/night_suffix.tsv | 1 + .../he/data/time/noon_suffix.tsv | 3 + .../he/data/time/time_suffix.tsv | 8 + .../he/data/time/to_hour.tsv | 13 ++ .../he/data/whitelist.tsv | 20 ++ .../he/graph_utils.py | 119 +++++++++++ .../he/taggers/__init__.py | 13 ++ .../he/taggers/cardinal.py | 158 ++++++++++++++ .../he/taggers/date.py | 106 +++++++++ .../he/taggers/decimal.py | 144 +++++++++++++ .../he/taggers/measure.py | 114 ++++++++++ .../he/taggers/ordinal.py | 43 ++++ .../he/taggers/punctuation.py | 35 +++ .../he/taggers/time.py | 202 ++++++++++++++++++ .../he/taggers/tokenize_and_classify.py | 104 +++++++++ .../he/taggers/whitelist.py | 55 +++++ .../he/taggers/word.py | 31 +++ .../inverse_text_normalization/he/utils.py | 182 ++++++++++++++++ .../he/verbalizers/__init__.py | 13 ++ .../he/verbalizers/cardinal.py | 84 ++++++++ .../he/verbalizers/date.py | 120 +++++++++++ .../he/verbalizers/decimal.py | 90 ++++++++ .../he/verbalizers/measure.py | 107 ++++++++++ .../he/verbalizers/ordinal.py | 38 ++++ .../he/verbalizers/time.py | 95 ++++++++ .../he/verbalizers/verbalize.py | 54 +++++ .../he/verbalizers/verbalize_final.py | 44 ++++ .../he/verbalizers/whitelist.py | 50 +++++ .../he/verbalizers/word.py | 34 +++ .../inverse_normalize.py | 7 +- tests/nemo_text_processing/he/__init__.py | 13 ++ .../test_cases_cardinal.txt | 138 ++++++++++++ .../test_cases_date.txt | 29 +++ .../test_cases_decimal.txt | 66 ++++++ .../test_cases_measure.txt | 10 + .../test_cases_time.txt | 34 +++ .../test_cases_whitelist.txt | 4 + .../test_full_sentences.txt | 56 +++++ .../nemo_text_processing/he/test_cardinal.py | 31 +++ tests/nemo_text_processing/he/test_date.py | 31 +++ 
tests/nemo_text_processing/he/test_decimal.py | 31 +++ .../he/test_full_sentences.py | 31 +++ tests/nemo_text_processing/he/test_measure.py | 31 +++ ..._sparrowhawk_inverse_text_normalization.sh | 61 ++++++ tests/nemo_text_processing/he/test_time.py | 31 +++ .../nemo_text_processing/he/test_whitelist.py | 31 +++ .../pynini_export.py | 10 + 79 files changed, 3112 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/he/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv create mode 100644 
nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/he/graph_utils.py create mode 100644 
nemo_text_processing/inverse_text_normalization/he/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/punctuation.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py create 
mode 100644 nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py create mode 100644 tests/nemo_text_processing/he/__init__.py create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt create mode 100644 tests/nemo_text_processing/he/test_cardinal.py create mode 100644 tests/nemo_text_processing/he/test_date.py create mode 100644 tests/nemo_text_processing/he/test_decimal.py create mode 100644 tests/nemo_text_processing/he/test_full_sentences.py create mode 100644 tests/nemo_text_processing/he/test_measure.py create mode 100644 tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh create mode 100644 tests/nemo_text_processing/he/test_time.py create mode 100644 tests/nemo_text_processing/he/test_whitelist.py diff --git a/Jenkinsfile b/Jenkinsfile index 7b8bf9336..f0ac60196 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -23,6 +23,7 @@ pipeline { SV_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' ZH_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/11-13-24-0' IT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/08-22-24-0' + HE_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/09-24-25-0' HY_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' 
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' @@ -257,7 +258,24 @@ pipeline { } } } - + stage('L0: Create He TN/ITN Grammars & MR') { + when { + anyOf { + branch 'main' + branch 'staging/**' + branch 'staging_*' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: HE ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=he --text="ת " --cache_dir ${HE_TN_CACHE}' + } + } + } + } stage('L0: Create HY TN/ITN Grammars & MR') { when { anyOf { @@ -417,6 +435,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' } } + stage('L1: Run all HE TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/he/ -m "not pleasefixme" --cpu --tn_cache_dir ${HE_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/he/__init__.py b/nemo_text_processing/inverse_text_normalization/he/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv new file mode 100644 index 000000000..5626b7100 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/decimals/minutes_exception.tsv @@ -0,0 +1,2 @@ +חצי +רבע diff --git a/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv new file mode 100644 index 000000000..fbd061bc5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/measurements.tsv @@ -0,0 +1,45 @@ +°F פרנהייט +°C צלסיוס +° מעלות +°F מעלות פרנהייט +°C מעלות צלסיוס +K קלווין +% אחוז +% אחוזים +Hz הרץ +kW קילוואט +kW קילו ואט +kW קילו וואט +kWh קילו ואט לשעה +kWh קילוואט לשעה +Wh ואט לשעה +W ואט +ghz ג׳יגה הרץ +ghz גיגה הרץ +khz קילו הרץ +mhz מגה הרץ +v וולט +nm ננומטר +mA מילי אמפר +tW טרה ואט +mv מילי וולט +mW מגה ואט +μm מיקרומטר +" אינץ׳ +cc סי סי +ω אוהם +db דציבל +db דציבלים +kb קילו ביט +mb מגה ביט +gb ג׳יגה ביט +gb גיגה ביט +tb טרה ביט +pb פטה ביט +mb מגה בייט +kb קילו בייט +gb ג׳יגה בייט +gb גיגה בייט +tb טרה בייט +pb פטה בייט +A אמפר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv new file mode 100644 index 000000000..05415cc3d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months.tsv @@ -0,0 +1,13 @@ +ינואר +פברואר +מרץ +מרס +אפריל +מאי +יוני +יולי +אוגוסט +ספטמבר +אוקטובר +נובמבר +דצמבר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv new file mode 100644 index 000000000..651118ca1 --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/he/data/months_name2number.tsv @@ -0,0 +1,12 @@ +ינואר 1 +פברואר 2 +מרץ 3 +אפריל 4 +מאי 5 +יוני 6 +יולי 7 +אוגוסט 8 +ספטמבר 9 +אוקטובר 10 +נובמבר 11 +דצמבר 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv new file mode 100644 index 000000000..e75a452d8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/months_ordinal2number.tsv @@ -0,0 +1,12 @@ +ראשון 1 +שני 2 +שלישי 3 +רביעי 4 +חמישי 5 +שישי 6 +שביעי 7 +שמיני 8 +תשיעי 9 +עשירי 10 +אחת עשרה 11 +שתיים עשרה 12 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv new file mode 100644 index 000000000..d88316454 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/decimal_fractions.tsv @@ -0,0 +1,6 @@ +חצי 5 +רבע 25 +שלושת רבעי 75 +עשירית 1 +שתי עשיריות 2 +חמישית 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv new file mode 100644 index 000000000..68c02dd42 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/digit.tsv @@ -0,0 +1,20 @@ +אחד 1 +שניים 2 +שני 2 +שלושה 3 +ארבעה 4 +חמישה 5 +שישה 6 +שבעה 7 +שמונה 8 +תשעה 9 +אחת 1 +שתיים 2 +שתים 2 +שתי 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +תשע 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv new file mode 100644 index 000000000..88e54ab57 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/hundreds_exception.tsv @@ -0,0 +1,2 @@ +מאה 1 +מאתיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv new file mode 100644 index 000000000..1443e5def --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/millions_exception.tsv @@ -0,0 +1 @@ +מיליון 1 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv new file mode 100644 index 000000000..26f1a5a4c --- /dev/null +++ 
b/nemo_text_processing/inverse_text_normalization/he/data/numbers/teen.tsv @@ -0,0 +1,21 @@ +עשר 10 +אחד עשר 11 +שניים עשר 12 +שלושה עשר 13 +ארבעה עשר 14 +חמישה עשר 15 +שישה עשר 16 +שבעה עשר 17 +שמונה עשר 18 +תשעה עשר 19 +עשרה 10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 +שלוש עשרה 13 +ארבע עשרה 14 +חמש עשרה 15 +שש עשרה 16 +שבע עשרה 17 +שמונה עשרה 18 +תשע עשרה 19 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv new file mode 100644 index 000000000..534789509 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands.tsv @@ -0,0 +1,8 @@ +שלושת 3 +ארבעת 4 +חמשת 5 +ששת 6 +שבעת 7 +שמונת 8 +תשעת 9 +עשרת 10 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv new file mode 100644 index 000000000..dd0c71c0d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/thousands_exception.tsv @@ -0,0 +1,2 @@ +אלף 1 +אלפיים 2 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv new file mode 100644 index 000000000..b6dd59ca3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/ties.tsv @@ -0,0 +1,8 @@ +עשרים 2 +שלושים 3 +ארבעים 4 +חמישים 5 +שישים 6 +שבעים 7 +שמונים 8 +תשעים 9 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv new file mode 100644 index 000000000..6a2cb1307 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/viable_hours.tsv @@ -0,0 +1,15 @@ +אחד 1 +אחת 1 +שתיים 2 +שתים 2 +שלוש 3 +ארבע 4 +חמש 5 +שש 6 +שבע 7 +שמונה 8 +תשע 9 +עשר 
10 +אחת עשרה 11 +שתיים עשרה 12 +שתים עשרה 12 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv new file mode 100644 index 000000000..a0b033c5d --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/numbers/zero.tsv @@ -0,0 +1 @@ +אפס 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv new file mode 100644 index 000000000..036e1433a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/ordinals/digit.tsv @@ -0,0 +1,10 @@ +ראשון אחד +שני שניים +שלישי שלושה +רביעי ארבעה +חמישי חמישה +שישי שישה +שביעי שבעה +שמיני שמונה +תשיעי תשעה +עשירי עשרה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv new file mode 100644 index 000000000..988d6aedf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/prefix.tsv @@ -0,0 +1,17 @@ +וה +שה +ב +כ +ל +מ +ה +ו +וב +ול +ש +מה +ומ +שכ +שב +בכ +לכ \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv new file mode 100644 index 000000000..a97b03412 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/spaced_measurements.tsv @@ -0,0 +1,17 @@ +ק״מ קילומטר +ק״מ קילומטרים +מ׳ מטר +מ׳ מטרים +ס״מ סנטימטר +ס״מ סנטימטרים +מ״מ מילימטר +מ״מ מילימטרים +מ״ג מיליגרם +מ״ג מיליגרמים +מ״ל מיליליטר +ק״ג קילוגרם +ק״ג קילוגרמים +קמ״ש קילומטר לשעה +קמ״ש קילומטרים לשעה +ג׳ גרם +ג׳ גרמים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv new file mode 100644 index 000000000..a4f9d2d46 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/day_suffix.tsv @@ -0,0 +1,2 @@ +בבוקר +לפנות בוקר \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv new file mode 100644 index 000000000..583470a05 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/evening_suffix.tsv @@ -0,0 +1,2 @@ +בערב +לפנות ערב \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv new file mode 100644 index 000000000..4fd47d1e2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_evening.tsv @@ -0,0 +1,7 @@ +5 17 +6 18 +7 19 +8 20 +9 21 +10 22 +11 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv new file mode 100644 index 000000000..656d161b2 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_night.tsv @@ -0,0 +1,9 @@ +8 20 +9 21 +10 22 +11 23 +12 0 +1 1 +2 2 +3 3 +4 4 \ No newline at end of 
file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv new file mode 100644 index 000000000..8d0de9024 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/hour_to_noon.tsv @@ -0,0 +1,7 @@ +12 12 +1 13 +2 14 +3 15 +4 16 +5 17 +6 18 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv new file mode 100644 index 000000000..5b86a39eb --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/midnight_to_hour.tsv @@ -0,0 +1 @@ +חצות 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv new file mode 100644 index 000000000..38858859c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to.tsv @@ -0,0 +1,58 @@ +02 58 +03 57 +04 56 +05 55 +06 54 +07 53 +08 52 +09 51 +10 50 +11 49 +12 48 +13 47 +14 46 +15 45 +16 44 +17 43 +18 42 +19 41 +20 40 +21 39 +22 38 +23 37 +24 36 +25 35 +26 34 +27 33 +28 32 +29 31 +30 30 +31 29 +32 28 +33 27 +34 26 +35 25 +36 24 +37 23 +38 22 +39 21 +40 20 +41 19 +42 18 +43 17 +44 16 +45 15 +46 14 +47 13 +48 12 +49 11 +50 10 +51 09 +52 08 +53 07 +54 06 +55 05 +56 04 +57 03 +58 02 +59 01 diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv new file mode 100644 index 000000000..8f62ae4de --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_to_verbose.tsv @@ -0,0 +1,6 @@ +רבע 45 +עשרה 50 +חמישה 55 +עשרים 40 +עשרים וחמישה 35 +דקה 59 \ No newline at end of file diff --git 
a/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv new file mode 100644 index 000000000..efa2207c3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/minute_verbose.tsv @@ -0,0 +1,8 @@ +שלושת רבעי 45 +חצי 30 +רבע 15 +עשרים 20 +עשרה 10 +חמישה 05 +דקה 01 +שתי 02 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv new file mode 100644 index 000000000..464aa81c0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/night_suffix.tsv @@ -0,0 +1 @@ +בלילה \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv new file mode 100644 index 000000000..963d81053 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/noon_suffix.tsv @@ -0,0 +1,3 @@ +בצהריים +אחרי הצהריים +אחר הצהריים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv new file mode 100644 index 000000000..b5799a0b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/time_suffix.tsv @@ -0,0 +1,8 @@ +בבוקר +לפנות בוקר +לפנות ערב +בערב +בצהריים +בלילה +אחרי הצהריים +אחר הצהריים \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv new file mode 100644 index 000000000..5689943fd --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/time/to_hour.tsv @@ -0,0 +1,13 @@ +אחת 12 +שתיים 1 +שלוש 2 +ארבע 3 +חמש 4 +שש 5 +שבע 
6 +שמונה 7 +תשע 8 +עשר 9 +אחת עשרה 10 +שתיים עשרה 11 +חצות 23 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv new file mode 100644 index 000000000..9844685c5 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv @@ -0,0 +1,20 @@ +אח״כ אחר כך +וכו׳ וכולי +בריה״מ ברית המועצות +ארה״ב ארצות הברית +עו״ד עורך דין +עו״ד עורכת דין +עו״ד עורכי דין +עו״ד עורכות דין +רו״ח רואה חשבון +רו״ח רואת חשבון +רו״ח רואי חשבון +רו״ח רואות חשבון +לפנה״ס לפני הספירה +ד״ר דוקטור +פרופ׳ פרופסור +אמא אימא +כל כול +מאיתנו מאתנו +ישארו יישארו +יתכן ייתכן \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/he/graph_utils.py b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py new file mode 100644 index 000000000..072da0381 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/graph_utils.py @@ -0,0 +1,119 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Shared graph-building utilities for the Hebrew inverse-text-normalization grammars.

import os
from pathlib import Path

import pynini
from pynini import Far
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA, delete_space
from nemo_text_processing.text_normalization.en.utils import load_labels

# Acceptor over the Hebrew alphabet (including the final-form letters ם ן ף ץ).
NEMO_ALPHA_HE = pynini.union(*"אבגדהוזחטיכלמםנןסעפףצץקרשת").optimize()
# Deletes the conjunction prefix "ו" ("and"); `.ques` makes the deletion optional.
delete_and = pynutil.delete("ו")
delete_optional_and = delete_and.ques

####################
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
MINUS = pynini.union("מינוס").optimize()


def string_map_cased(input_file: str):
    """Load a two-column TSV and return the inverted, optimized string-map FST."""
    return pynini.string_map(load_labels(input_file)).invert().optimize()


def apply_fst(text, fst):
    """Print the output produced by the lowest-weight path of ``fst`` on ``text``.

    If no valid path accepts the input string, an error message is printed
    instead of raising.
    """
    try:
        print(pynini.shortestpath(text @ fst).string())
    except pynini.FstOpError:
        print(f"Error: No valid output with given input: '{text}'")


class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # A pre-compiled grammar archive, when present, lives next to this module.
        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """Return True if a pre-compiled FAR file can be loaded for this grammar."""
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """Wrap the given fst in a ``<name> { ... }`` token envelope.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """Strip the ``<name> { ... }`` token envelope around the output of ``fst``.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        stripped = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Rewrite non-breaking spaces emitted by the grammars back into plain spaces.
        return stripped @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py new file mode 100644 index 000000000..aaf30b32c --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/cardinal.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import (
    NEMO_ALPHA_HE,
    GraphFst,
    delete_and,
    delete_optional_and,
)
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    NEMO_SIGMA,
    NEMO_SPACE,
    delete_space,
    insert_space,
)
from nemo_text_processing.text_normalization.en.utils import load_labels


class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals in Hebrew
    e.g. מינוס עשרים ושלוש ("minus twenty three" in Hebrew) -> cardinal { negative: "-" integer: "23" }
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")

        # digits
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))

        # teens and tens: a tens word may be followed by an (optionally "ו"-prefixed) digit;
        # otherwise a "0" units digit is inserted with a small penalty so the combined form wins when present
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties += pynini.union(
            delete_space + delete_optional_and + graph_digit,
            pynutil.insert("0", weight=0.001),
        )
        graph_two_digit = pynini.union(graph_teen, graph_ties)

        # standalone two-digit graph (also used by date/time); the small negative weight
        # prefers the single-word teen reading over a tens+digit decomposition
        self.graph_two_digit = pynini.union(graph_digit, graph_ties, pynutil.add_weight(graph_teen, -0.001))

        # hundreds: either a lexical exception (e.g. "מאתיים"), or digit + "מאות",
        # optionally followed by a two-digit remainder or a lone "ו"-prefixed digit
        hundred_exception = pynini.string_file(get_abs_path("data/numbers/hundreds_exception.tsv"))
        delete_hundred = pynutil.delete("מאות")
        graph_hundred = delete_optional_and + pynini.union(
            hundred_exception,
            graph_digit + delete_space + delete_hundred,
            pynutil.insert("0", weight=0.001),
        )
        graph_hundred += delete_space
        graph_hundred += pynini.union(
            delete_optional_and + graph_two_digit,
            pynutil.insert("0") + delete_space + delete_and + graph_digit,
            pynutil.insert("00", weight=0.001),
        )

        # also accept bare two-digit / one-digit inputs padded to three digits
        graph_hundred = pynini.union(
            graph_hundred,
            pynutil.insert("0") + graph_two_digit,
            pynutil.insert("00") + graph_digit,
        )
        # constrain to outputs containing at least one non-zero digit
        self.graph_hundred = graph_hundred @ (
            pynini.closure(NEMO_DIGIT) + (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT)
        )

        # thousands: "אלפים" (plural) preferred over "אלף" (singular) via weight
        thousand_exception = pynini.string_file(get_abs_path("data/numbers/thousands_exception.tsv"))
        thousand_digit = pynini.string_file(get_abs_path("data/numbers/thousands.tsv"))
        delete_thousand = pynutil.delete("אלפים") | pynutil.delete("אלף", weight=0.001)

        # multiplier in front of "thousand"/"million": up to three digits
        large_number_prefix = pynini.union(
            graph_hundred,
            pynutil.insert("0") + graph_two_digit,
            pynutil.insert("00") + thousand_digit,
        )
        many_thousands = large_number_prefix + delete_space + delete_thousand
        graph_thousands = delete_optional_and + pynini.union(
            (pynutil.insert("00") + thousand_exception),
            many_thousands,
            pynutil.insert("000", weight=0.001),
        )

        self.graph_thousands = pynini.union(graph_thousands + delete_space + graph_hundred, graph_zero)
        # strip leading zeros (but keep a bare "0")
        self.graph_thousands @= pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
            "0",
        )

        # millions: same pattern as thousands
        million_exceptions = pynini.string_file(get_abs_path("data/numbers/millions_exception.tsv"))
        million_exceptions = pynutil.insert("00") + million_exceptions
        delete_millions = pynutil.delete("מיליונים") | pynutil.delete("מיליון", weight=0.001)
        many_millions = large_number_prefix + delete_space + delete_millions
        graph_millions = pynini.union(many_millions, million_exceptions, pynutil.insert("000", weight=0.001))

        # full cardinal: millions + thousands + hundreds, or the word for zero
        graph = pynini.union(
            graph_millions + delete_space + graph_thousands + delete_space + graph_hundred,
            graph_zero,
        )
        # strip leading zeros (but keep a bare "0")
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT),
            "0",
        )

        # single-word numbers (digits, zero, ten) and their "ו"-prefixed variants are
        # excluded from plain conversion below to avoid converting ordinary prose
        labels_exception = load_labels(get_abs_path("data/numbers/digit.tsv"))
        labels_exception = list(set([x[0] for x in labels_exception] + ["אפס", "עשר", "עשרה"]))
        labels_exception += ["ו" + label for label in labels_exception]
        graph_exception = pynini.union(*labels_exception).optimize()
        # require the input to start with a Hebrew letter
        graph = ((NEMO_ALPHA_HE + NEMO_SIGMA) @ graph).optimize()

        self.graph_no_exception = graph

        ### Token insertion
        minus_graph = pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE
        optional_minus_graph = pynini.closure(minus_graph, 0, 1)

        # optional morphological prefix (e.g. ב, ל, מ) captured as a feature
        optional_prefix_graph = pynini.closure(
            pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space,
            0,
            1,
        )

        # cardinals minus the single-word exceptions above
        graph_wo_small_digits = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

        # cardinals that cannot be confused with clock hours (used by the decimal grammar)
        cardinal_wo_viable_hours = load_labels(get_abs_path("data/numbers/viable_hours.tsv"))
        cardinal_wo_viable_hours = list(set([x[0] for x in cardinal_wo_viable_hours]))
        viable_hours_exception = pynini.union(*cardinal_wo_viable_hours).optimize()
        self.graph_wo_viable_hours = (pynini.project(graph, "input") - viable_hours_exception.arcsort()) @ graph

        # small (single-word) numbers are converted only when an explicit minus precedes them
        small_number_with_minus = (
            insert_space + minus_graph + pynutil.insert('integer: "') + self.graph_no_exception + pynutil.insert('"')
        )

        big_number_with_optional_minus = (
            optional_minus_graph + pynutil.insert('integer: "') + graph_wo_small_digits + pynutil.insert('"')
        )

        graph = optional_prefix_graph + (small_number_with_minus | big_number_with_optional_minus)

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, insert_space


def _get_year_graph(graph_two_digits, graph_thousands):
    """
    Transducer for year, e.g. twenty twenty -> 2020
    """
    year_graph = pynini.union(
        (graph_two_digits + delete_space + graph_two_digits),  # spoken as two pairs: 20 19, 40 12, 20 20
        graph_thousands,  # spoken as a full number, e.g. 2012 - assuming no limit on the year
    )

    # NOTE: pynini's optimize() works in place, so the bare call still takes effect
    year_graph.optimize()
    return year_graph


class DateFst(GraphFst):
    """
    Finite state transducer for classifying date in Hebrew,
    e.g. אחד במאי אלף תשע מאות שמונים ושלוש -> date { day: "1" morphosyntactic_features: "ב" month: "5" year: "1983" }
    e.g. מרץ אלף תשע מאות שמונים ותשע -> date { month: "מרץ" year: "1989" }
    e.g. בינואר עשרים עשרים -> date { morphosyntactic_features: "ב" month: "ינואר" year: "2020" }

    Args:
        cardinal: CardinalFst
        ordinal: OrdinalFst
    """

    def __init__(self, cardinal: GraphFst, ordinal: GraphFst):
        super().__init__(name="date", kind="classify")

        ordinal_graph = ordinal.graph
        two_digits_graph = cardinal.graph_two_digit

        # a day may be spoken as a cardinal or an ordinal; the negative weight
        # biases the date reading over competing cardinal tokenizations
        day_graph = pynutil.add_weight(two_digits_graph | ordinal_graph, -0.7)
        day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"')

        # month name kept verbatim (e.g. "מרץ")
        month_names = pynini.string_file(get_abs_path("data/months.tsv"))
        month_names_graph = pynutil.insert('month: "') + month_names + pynutil.insert('"')

        # month name mapped to its number (e.g. "מאי" -> 5)
        month_name2number = pynini.string_file(get_abs_path("data/months_name2number.tsv"))
        month_name2number_graph = pynutil.insert('month: "') + month_name2number + pynutil.insert('"')

        # spoken ordinal month mapped to its number
        month_number2number = pynini.string_file(get_abs_path("data/months_ordinal2number.tsv"))
        month_number2number_graph = pynutil.insert('month: "') + month_number2number + pynutil.insert('"')

        all_month_graph = month_name2number_graph | month_number2number_graph

        year_graph = _get_year_graph(two_digits_graph, cardinal.graph_thousands)
        graph_year = delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"')

        # morphological prefixes (e.g. ב, ל) before day/month words
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        delete_prefix = pynutil.delete(prefix_graph)

        graph_prefix = pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"')
        # "שנה"/"שנת" ("year (of)") marker; stored as a feature so the verbalizer can restore it
        year_prefix_graph = (
            pynutil.insert('morphosyntactic_features: "')
            + pynini.closure(prefix_graph, 0, 1)
            + pynini.union("שנה", "שנת")
            + pynutil.insert('"')
        )

        # day + month (no year)
        graph_dm = (
            pynini.closure(graph_prefix + insert_space, 0, 1)
            + day_graph
            + insert_space
            + delete_space
            + pynini.closure(delete_prefix + insert_space, 0, 1)
            + month_name2number_graph
        )

        # day + month + year
        graph_dmy = (
            pynini.closure(graph_prefix + insert_space, 0, 1)
            + day_graph
            + insert_space
            + delete_space
            + pynini.closure(delete_prefix + insert_space, 0, 1)
            + all_month_graph
            + graph_year
        )

        # month name + year (no day)
        graph_my = pynini.closure(graph_prefix + insert_space, 0, 1) + month_names_graph + graph_year
        # "שנת <year>" — year only, marked by the explicit year word
        graph_y_only = year_prefix_graph + graph_year

        final_graph = graph_dm | graph_dmy | graph_my | graph_y_only
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import MINUS, GraphFst, delete_and
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    delete_extra_space,
    delete_space,
    delete_zero_or_one_space,
    insert_space,
)


def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral in Hebrew,

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # strip leading zeros from the cardinal part
    numbers = cardinal_up_to_hundred @ (
        pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
    )

    # quantity words: "million", "billion"
    suffix_labels = ["מיליון", "מיליארד"]
    suffix = pynini.union(*suffix_labels).optimize()

    # cardinal + quantity, e.g. "five million"
    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    # decimal + quantity; "אלף" (thousand) is only allowed after a decimal
    res |= decimal + delete_extra_space + pynutil.insert('quantity: "') + (suffix | "אלף") + pynutil.insert('"')
    return res


class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal in Hebrew
    e.g. עשרים ושלוש וחצי -> decimal { integer_part: "23" fractional_part: "5" }
    e.g. אחד נקודה שלוש -> decimal { integer_part: "1" fractional_part: "3" }
    e.g. ארבע נקודה חמש מיליון -> decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" }
    e.g. מינוס ארבע מאות נקודה שלוש שתיים שלוש -> decimal { negative: "true" integer_part: "400" fractional_part: "323" }
    e.g. אפס נקודה שלושים ושלוש -> decimal { integer_part: "0" fractional_part: "33" }
    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # optional morphological prefix captured as a feature
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        optional_prefix_graph = pynini.closure(
            pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space,
            0,
            1,
        )

        # all cardinals
        cardinal_graph = cardinal.graph_no_exception

        # all fractions (e.g. "וחצי" -> ".5"); the leading "ו" ("and") is removed
        fractions = pynini.string_file(get_abs_path("data/numbers/decimal_fractions.tsv"))
        fractions_graph = delete_zero_or_one_space + delete_and + fractions
        fractions_graph = pynutil.insert('fractional_part: "') + fractions_graph + pynutil.insert('"')

        # identify decimals that can be understood as time, don't convert them to avoid ambiguity
        viable_minutes_exception = pynini.string_file(get_abs_path("data/decimals/minutes_exception.tsv"))
        fractions_wo_minutes = (pynini.project(fractions, "input") - viable_minutes_exception.arcsort()) @ fractions
        fractions_wo_minutes = delete_zero_or_one_space + delete_and + fractions_wo_minutes
        fractions_wo_minutes = pynutil.insert('fractional_part: "') + fractions_wo_minutes + pynutil.insert('"')

        # fractional digit sequence after "נקודה": zero or any two-digit groups, spoken digit by digit / pair by pair
        graph_decimal = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_decimal |= cardinal.graph_two_digit
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        point = pynutil.delete("נקודה")

        graph_negative = pynutil.insert("negative: ") + pynini.cross(MINUS, '"true"') + delete_extra_space
        optional_graph_negative = pynini.closure(
            graph_negative,
            0,
            1,
        )

        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
        graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')

        # integer could be an hour, but minutes cannot: convert to decimal
        viable_hour_unviable_minutes = graph_integer + delete_extra_space + fractions_wo_minutes

        # integer cannot be an hour, but minutes can: convert to decimal
        unviable_hour_viable_minutes = (
            pynutil.insert('integer_part: "')
            + cardinal.graph_wo_viable_hours
            + pynutil.insert('"')
            + delete_extra_space
            + fractions_graph
        )

        # minus sign followed by ambiguous decimal: convert to decimal, there is no negative time
        negative_viable_time = graph_negative + graph_integer + delete_extra_space + fractions_graph

        # all decimals with fractions, not excluding anything (used in other FSTs)
        all_decimals_wo_point = graph_integer + delete_extra_space + fractions_graph

        # only cases with fractional part that cannot be interpreted as time
        graph_wo_point = viable_hour_unviable_minutes | unviable_hour_viable_minutes | negative_viable_time

        # all decimals with the word "point"
        graph_w_point = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1) + point + delete_extra_space + graph_fractional
        )

        final_graph_wo_sign = graph_w_point | graph_wo_point
        self.final_graph_wo_sign = graph_w_point | all_decimals_wo_point
        final_graph = optional_prefix_graph + optional_graph_negative + final_graph_wo_sign

        # quantity forms, e.g. "four point five million"
        quantity_graph = get_quantity(self.final_graph_wo_sign, cardinal.graph_hundred)
        final_graph |= optional_prefix_graph + optional_graph_negative + quantity_graph

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()

# ---- taggers/measure.py ----

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_SPACE,
    delete_extra_space,
    delete_space,
    delete_zero_or_one_space,
    insert_space,
)


class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure in Hebrew
    e.g. חמש עשרה אחוז -> measure { cardinal { integer: "15" } units: "%" }
    e.g. מינוס חמש עשרה אחוז -> measure { cardinal { negative: "-" integer: "15" } units: "%" }
    e.g. שלוש מיליגרם -> measure { cardinal { integer: "3" } units: "מ״ג" }
    e.g. אלף אחוז -> measure { cardinal { integer: "1000" } units: "%" }
    e.g. אחוז אחד -> measure { units: "%" cardinal { integer: "1" } }
    e.g. סנטימטר אחד -> measure { units: "ס״מ" cardinal { integer: "1" } }

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: CardinalFst, decimal: DecimalFst):
        super().__init__(name="measure", kind="classify")

        # optional negative sign
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("מינוס", '"-"') + NEMO_SPACE,
            0,
            1,
        )

        # optional morphological prefix captured as a feature
        prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv"))
        optional_prefix_graph = pynini.closure(
            pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space,
            0,
            1,
        )

        # cardinal numbers
        cardinal_graph = cardinal.graph_no_exception

        # Let singular apply to values > 1 as they could be part of an adjective phrase (e.g. 14 foot tall building)
        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal.final_graph_wo_sign
            + pynutil.insert(" }")
            + delete_extra_space
        )

        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + cardinal_graph
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
        )

        # convert units; the TSV maps symbol -> spoken form, so invert to go spoken -> symbol
        joined_units = pynini.string_file(get_abs_path("data/measurements.tsv"))
        joined_units = pynini.invert(joined_units)
        joined_units = pynutil.insert('units: "') + joined_units + pynutil.insert('"')

        spaced_units = pynini.string_file(get_abs_path("data/spaced_measurements.tsv"))
        spaced_units = pynini.invert(spaced_units)
        spaced_units = pynutil.insert('units: "\[SPACE\]') + spaced_units + pynutil.insert('"')  # noqa: W605

        # in joint units the unit is concatenated to the number, in spaced unit separate the unit with a space
        units_graph = joined_units | spaced_units

        # a dedicated graph for "one" is needed because Hebrew reverses the word order:
        # we say "ten percent" for 10% but "percent one" for 1%
        one = pynini.string_map([("אחד", "1")])
        one_graph = (
            insert_space
            + pynutil.insert("cardinal { ")
            + pynutil.insert('integer: "')
            + one
            + pynutil.insert('"')
            + pynutil.insert(" }")
        )

        number_graph = subgraph_decimal | subgraph_cardinal
        number_unit_graph = (number_graph + units_graph) | (units_graph + delete_space + one_graph)

        final_graph = optional_prefix_graph + number_unit_graph + delete_zero_or_one_space
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()


# ---- taggers/ordinal.py ----

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_SIGMA


class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal in Hebrew
    e.g. ראשון -> ordinal { integer: "1" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception
        # maps an ordinal suffix/word onto its cardinal spelling, then reuses the cardinal graph
        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        graph = NEMO_SIGMA + graph_digit

        self.graph = graph @ cardinal_graph

        final_graph = pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()


# ---- taggers/punctuation.py ----

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst


class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
    e.g. a, -> tokens { name: "a" } tokens { name: "," }
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        s = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        punct = pynini.union(*s)

        graph = pynutil.insert('name: "') + punct + pynutil.insert('"')

        self.fst = graph.optimize()


# ---- taggers/time.py ----

import pynini
from pynini.lib import pynutil

from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, delete_and
from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst
from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path, integer_to_text
from nemo_text_processing.text_normalization.en.graph_utils import (
    NEMO_DIGIT,
    delete_extra_space,
    delete_space,
    delete_zero_or_one_space,
    insert_space,
)


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time in Hebrew.
    Conversion is made only when am / pm time is not ambiguous!
    e.g. שלוש דקות לחצות -> time { minutes: "57" hours: "23" }
    e.g. באחת ושתי דקות בצהריים -> time { morphosyntactic_features: "ב" hours: "1" minutes: "02" suffix: "צהריים" }
    e.g. שתיים ועשרה בבוקר -> time { hours: "2" minutes: "10" suffix: "בוקר" }
    e.g. שתיים ועשרה בצהריים -> time { hours: "2" minutes: "10" suffix: "צהריים" }
    e.g. שתיים עשרה ושלוש דקות אחרי הצהריים -> time { hours: "12" minutes: "03" suffix: "צהריים" }
    e.g. רבע לשש בערב -> time { minutes: "45" hours: "5" suffix: "ערב" }

    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        # hours, minutes, seconds, suffix, zone, style, speak_period
        midnight_to_hour_graph = pynini.string_file(get_abs_path("data/time/midnight_to_hour.tsv"))
        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))

        minute_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_verbose.tsv"))
        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))
        minute_to_verbose_graph = pynini.string_file(get_abs_path("data/time/minute_to_verbose.tsv"))

        # day-part words ("in the morning", "at noon", ...) that disambiguate am/pm
        suffix_graph = pynini.union(
            pynini.string_file(get_abs_path("data/time/day_suffix.tsv")),
            pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")),
            pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")),
            pynini.string_file(get_abs_path("data/time/night_suffix.tsv")),
        )

        time_prefix = pynini.string_file(get_abs_path("data/prefix.tsv"))
        time_prefix_graph = (
            pynutil.insert('morphosyntactic_features: "') + time_prefix + pynutil.insert('"') + insert_space
        )
        optional_time_prefix_graph = pynini.closure(time_prefix_graph, 0, 1)

        # only used for < 1000 thousand -> 0 weight
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)

        # feminine number words: hours 1-12, minutes 2-9 and 10-59
        labels_hour = [integer_to_text(x, only_fem=True)[0] for x in range(1, 13)]
        labels_minute_single = [integer_to_text(x, only_fem=True)[0] for x in range(2, 10)]
        labels_minute_double = [integer_to_text(x, only_fem=True)[0] for x in range(10, 60)]

        graph_hour = pynini.union(*labels_hour) @ cardinal
        graph_hour |= midnight_to_hour_graph
        add_leading_zero_to_double_digit = pynutil.insert("0") + NEMO_DIGIT
        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal @ add_leading_zero_to_double_digit
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal

        final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"')

        graph_minute = pynini.union(pynutil.insert("00"), graph_minute_single, graph_minute_double)

        final_suffix = pynutil.insert('suffix: "') + suffix_graph + pynutil.insert('"')
        final_suffix = delete_space + insert_space + final_suffix

        # the optional word "דקות" ("minutes") after the minute count is dropped
        time_word = "דקות"
        optional_delete_time = pynini.closure(delete_space + pynutil.delete(time_word), 0, 1)
        # "<hour> and <minutes>", e.g. "שתיים ועשרה"
        graph_h_and_m = (
            final_graph_hour
            + delete_space
            + delete_and
            + insert_space
            + pynutil.insert('minutes: "')
            + pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph)
            + pynutil.insert('"')
            + optional_delete_time
        )

        # verbose "<fraction> to <hour>", e.g. "רבע לשש" (quarter to six)
        graph_special_m_to_h_suffix_time = (
            pynutil.insert('minutes: "')
            + minute_to_verbose_graph
            + pynutil.insert('"')
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
        )

        # "<minutes> to <hour>": minute count is mapped to 60-minus via minute_to.tsv
        graph_m_to_h_suffix_time = (
            pynutil.insert('minutes: "')
            + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph
            + pynutil.insert('"')
            + optional_delete_time
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
        )

        # bare hour (+ optional minutes) with a mandatory day-part suffix for disambiguation
        graph_h = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + final_graph_hour
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert('"')
            + final_suffix
        )

        # "חצות" (midnight) needs no suffix — it is unambiguous
        midnight_graph = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + pynutil.insert('hours: "')
            + midnight_to_hour_graph
            + pynutil.insert('"')
            + insert_space
            + pynutil.insert('minutes: "')
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert('"')
        )

        # "midnight and <minutes>"
        graph_midnight_and_m = (
            pynutil.insert('hours: "')
            + midnight_to_hour_graph
            + pynutil.insert('"')
            + delete_space
            + delete_and
            + insert_space
            + pynutil.insert('minutes: "')
            + pynini.union(graph_minute_single, graph_minute_double, minute_verbose_graph)
            + pynutil.insert('"')
            + optional_delete_time
        )

        # verbose "<fraction> to midnight"
        to_midnight_verbose_graph = (
            pynutil.insert('minutes: "')
            + minute_to_verbose_graph
            + pynutil.insert('"')
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
        )

        # "<minutes> to midnight"
        graph_m_to_midnight = (
            pynutil.insert('minutes: "')
            + pynini.union(graph_minute_single, graph_minute_double) @ minute_to_graph
            + pynutil.insert('"')
            + optional_delete_time
            + delete_space
            + pynutil.delete("ל")
            + insert_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
        )

        final_graph_midnight = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + (midnight_graph | to_midnight_verbose_graph | graph_m_to_midnight | graph_midnight_and_m)
        )

        final_graph = (
            optional_time_prefix_graph
            + delete_zero_or_one_space
            + (graph_h_and_m | graph_special_m_to_h_suffix_time | graph_m_to_h_suffix_time)
            + final_suffix
        )
        final_graph |= graph_h
        final_graph |= final_graph_midnight

        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.taggers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.taggers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.taggers.punctuation import PunctuationFst +from nemo_text_processing.inverse_text_normalization.he.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.he.taggers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space, generator_main + + +class ClassifyFst(GraphFst): + """ + Final class that composes all other classification grammars. This class can process an entire sentence. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details on deployment at NeMo/tools/text_processing_deployment.
+ + Args: + cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache. + overwrite_cache: set to True to overwrite .far files + whitelist: path to a file with whitelist replacements + """ + + def __init__( + self, + cache_dir: str = None, + overwrite_cache: bool = False, + whitelist: str = None, + input_case: str = None, + ): + + super().__init__(name="tokenize_and_classify", kind="classify") + + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, f"he_itn.far") + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"] + logging.info(f"ClassifyFst.fst was restored from {far_file}.") + else: + logging.info(f"Creating ClassifyFst grammars.") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal = OrdinalFst(cardinal) + + decimal = DecimalFst(cardinal) + decimal_graph = decimal.fst + + measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst + date_graph = DateFst(ordinal=ordinal, cardinal=cardinal).fst + word_graph = WordFst().fst + time_graph = TimeFst().fst + whitelist_graph = WhiteListFst(input_file=whitelist).fst + punct_graph = PunctuationFst().fst + + classify = ( + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(time_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(measure_graph, 1.1) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(word_graph, 100) + ) + + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") + token_plus_punct = ( + pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) + ) + + graph = token_plus_punct + pynini.closure(delete_extra_space + 
token_plus_punct) + graph = delete_space + graph + delete_space + + self.fst = graph.optimize() + + if far_file: + generator_main(far_file, {"tokenize_and_classify": self.fst}) + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py new file mode 100644 index 000000000..58de7668e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/whitelist.py @@ -0,0 +1,55 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst, string_map_cased +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import convert_space, insert_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for classifying whitelisted tokens + e.g. misses -> tokens { name: "mrs." } + This class has highest priority among all classifier grammars. + Whitelisted tokens are defined and loaded from "data/whitelist.tsv" (unless input_file specified). + + Args: + input_file: path to a file with whitelist replacements (each line of the file: written_form\tspoken_form\n), + e.g. 
nemo_text_processing/inverse_text_normalization/he/data/whitelist.tsv + """ + + def __init__(self, input_file: str = None): + super().__init__(name="whitelist", kind="classify") + prefix_graph = pynini.string_file(get_abs_path("data/prefix.tsv")) + + if input_file is None: + input_file = get_abs_path("data/whitelist.tsv") + + if not os.path.exists(input_file): + raise ValueError(f"Whitelist file {input_file} not found") + + optional_prefix_graph = pynini.closure( + pynutil.insert('morphosyntactic_features: "') + prefix_graph + pynutil.insert('"') + insert_space, + 0, + 1, + ) + whitelist = string_map_cased(input_file) + graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"') + final_graph = optional_prefix_graph + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/taggers/word.py b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py new file mode 100644 index 000000000..6b5394ac3 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/taggers/word.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE + + +class WordFst(GraphFst): + """ + Finite state transducer for classifying plain tokens that do not belong to any special class. This can be considered the default class. + e.g. sleep -> tokens { name: "sleep" } + """ + + def __init__(self): + super().__init__(name="word", kind="classify") + word = pynutil.insert('name: "') + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert('"') + self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/utils.py b/nemo_text_processing/inverse_text_normalization/he/utils.py new file mode 100644 index 000000000..1aa996b80 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/utils.py @@ -0,0 +1,182 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +#################### +# HEBREW CONSTANTS # +#################### +units_feminine_dict = { + "0": "אפס", + "1": "אחת", + "2": "שתיים", + "3": "שלוש", + "4": "ארבע", + "5": "חמש", + "6": "שש", + "7": "שבע", + "8": "שמונה", + "9": "תשע", +} + +units_masculine_dict = { + "0": "אפס", + "1": "אחד", + "2": "שניים", + "3": "שלושה", + "4": "ארבעה", + "5": "חמישה", + "6": "שישה", + "7": "שבעה", + "8": "שמונה", + "9": "תשעה", +} + +tens_dict = { + "2": "עשרים", + "3": "שלושים", + "4": "ארבעים", + "5": "חמישים", + "6": "שישים", + "7": "שבעים", + "8": "שמונים", + "9": "תשעים", +} + +ten = { + "short": "עשר", + "long": "עשרה", +} # double pronunciation: short is 'eser' and 'asar', long is 'esre' and 'asara' + + +############# +# FUNCTIONS # +############# +def get_abs_path(rel_path): + """ + Get absolute path + + Args: + rel_path: relative path to this file + + Returns absolute path + """ + return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path + + +def augment_labels_with_punct_at_end(labels): + """ + augments labels: if key ends on a punctuation that value does not have, add a new label + where the value maintains the punctuation + + Args: + labels : input labels + Returns: + additional labels + """ + res = [] + for label in labels: + if len(label) > 1: + if label[0][-1] == "." 
and label[1][-1] != ".": + res.append([label[0], label[1] + "."] + label[2:]) + return res + + +def digit_by_digit(num): + + dbd = [" ".join([units_feminine_dict[digit] for digit in num])] + + # generate "1" as masculine and as feminine if exists + if units_feminine_dict["1"] in dbd[0]: + dbd.append(dbd[0].replace(units_feminine_dict["1"], units_masculine_dict["1"])) + + return dbd + + +def integer_to_text(num, only_fem=False): + if isinstance(num, int): + num = str(num) + # number is zero + if num == len(num) * "0": + return ["אפס"] + else: + # remove leading zeros from number + num = num.lstrip("0") + + # units + if len(num) == 1: + return _less_than_10(num, only_fem) + + # tenths + elif len(num) == 2: + return _less_than_100(num, only_fem) + + else: + raise Exception + + +def _less_than_10(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-9 + """ + + if only_fem: + return [units_feminine_dict[num]] + else: + return [units_feminine_dict[num], units_masculine_dict[num]] + + +def _less_than_100(num, only_fem=False): + """ + Returns a list of all the possible names of a number in range 0-99 + """ + + # init result + res = list() + + # split number to digits + tens, units = num + + # number is in range 0-9 + if len(num) == 1: + res.extend(_less_than_10(num)) + + # number is in range 10-99 + elif len(num) == 2: + + if num == "10": + if only_fem: + res.extend([ten["short"]]) + else: + res.extend([ten["long"], ten["short"]]) + + # number is in range 11-19 + elif tens == "1": + res.append(f'{units_feminine_dict[num[1]]} {ten["long"]}') + if not only_fem: + res.append(f'{units_masculine_dict[num[1]]} {ten["short"]}') + + else: + + # number is in range 20-99, a multiplication of 10 + if units == "0": + res.append(tens_dict[num[0]]) + + # number is in range 20-99, but not multiplication of 10 + else: + res.append(f'{tens_dict[num[0]]} {"ו"}{units_feminine_dict[num[1]]}') + if not only_fem: + res.append(f'{tens_dict[num[0]]} 
{"ו"}{units_masculine_dict[num[1]]}') + + return res diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py new file mode 100644 index 000000000..d26e1f703 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/cardinal.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal in Hebrew + e.g. cardinal { prefix: "וב" integer: "3405"} -> וב-3,405 + e.g. cardinal { negative: "-" integer: "904" } -> -904 + e.g. cardinal { prefix: "כ" integer: "123" } -> כ-123 + + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT**3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete('"') + + pynini.accep("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # removes integer aspect + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) # Accepts at least one digit + + pynutil.delete('"') + ) + + # Add thousands separator + graph = graph @ group_by_threes + + self.numbers = graph + + # add prefix and sign + graph = optional_prefix + optional_sign + graph + + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py 
b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py new file mode 100644 index 000000000..4a1b24599 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/date.py @@ -0,0 +1,120 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_NOT_QUOTE, + NEMO_SPACE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing date, + e.g. 
{ day_prefix: "ה" day: "1" month_prefix: "ב" month: "6" year: "2012" } -> ה-1.6.2012 + """ + + def __init__(self): + super().__init__(name="date", kind="verbalize") + + day_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + ) + + day = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1, 2) + + pynutil.insert(".") + + pynutil.delete('"') + + delete_space + ) + + month_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + + delete_space + ) + + month = ( + pynutil.delete("month:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + year_prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 3) + + pynutil.delete('"') + + delete_space + ) + + year = ( + pynutil.delete("year:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + ####################### + # DATE FORMATS GRAPHS # + ####################### + + # day and month only + graph_dm = ( + pynini.closure(day_prefix + delete_zero_or_one_space, 0, 1) + + day + + pynini.closure(delete_zero_or_one_space, 0, 1) + + month + + delete_zero_or_one_space + ) + + # day month and year + graph_dmy = graph_dm + delete_space + pynutil.insert(".") + delete_zero_or_one_space + year + + # only month and year + graph_my = ( + pynini.closure(month_prefix + delete_zero_or_one_space, 0, 1) + + month + + pynutil.insert(NEMO_SPACE) + + pynini.closure(delete_zero_or_one_space + year, 0, 1) + ) + + # only year + graph_y_only = year_prefix + insert_space + year + + final_graph = (graph_dm | graph_dmy | graph_my | graph_y_only) + delete_space + + 
delete_tokens = self.delete_tokens(final_graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py new file mode 100644 index 000000000..ea69ab784 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/decimal.py @@ -0,0 +1,90 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, NEMO_NOT_QUOTE, delete_space + + +class DecimalFst(GraphFst): + """ + Finite state transducer for verbalizing decimal, + e.g. decimal { integer_part: "0" fractional_part: "33" } -> 0.33 + e.g. decimal { negative: "true" integer_part: "400" fractional_part: "323" } -> -400.323 + e.g. 
decimal { integer_part: "4" fractional_part: "5" quantity: "מיליון" } -> 4.5 מיליון + + """ + + def __init__(self): + super().__init__(name="decimal", kind="verbalize") + optionl_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1) + + # Need parser to group digits by threes + exactly_three_digits = NEMO_DIGIT**3 + at_most_three_digits = pynini.closure(NEMO_DIGIT, 1, 3) + + # Thousands separator + group_by_threes = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() + + integer = ( + pynutil.delete("integer_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + + integer = integer @ group_by_threes + + optional_integer = pynini.closure(integer + delete_space, 0, 1) + + fractional = ( + pynutil.insert(".") + + pynutil.delete("fractional_part:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + optional_fractional = pynini.closure(fractional + delete_space, 0, 1) + + quantity = ( + pynutil.delete("quantity:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1) + + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + graph = optional_prefix + optional_integer + optional_fractional + optional_quantity + self.numbers = graph + graph = optionl_sign + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py new file mode 100644 index 
000000000..a4aadd67b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/measure.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_CHAR, + NEMO_NOT_QUOTE, + NEMO_SIGMA, + NEMO_SPACE, + delete_space, +) + + +class MeasureFst(GraphFst): + """ + Finite state transducer for verbalizing measure, in Hebrew. + Some measures are concatenated to the numbers and other are don't (two measure lists) + e.g. measure { cardinal { integer: "3" } units: "מ״ג" } -> 3 מ״ג + e.g. measure { cardinal { integer: "1000" } units: "%" } -> 1,000% + e.g. measure { units: "%" cardinal { integer: "1" } } -> 1% + e.g. measure { units: "ס״מ" cardinal { integer: "1" } } -> 1 ס״מ + e.g. 
measure { prefix: "ל" cardinal { integer: "4" } units: "ס״מ" } -> ל-4 ס״מ + + Args: + decimal: DecimalFst + cardinal: CardinalFst + """ + + def __init__(self, decimal: GraphFst, cardinal: GraphFst): + super().__init__(name="measure", kind="verbalize") + + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + # Removes the negative attribute and leaves the sign if occurs + optional_sign = pynini.closure( + pynutil.delete("negative:") + + delete_space + + pynutil.delete('"') + + pynini.accep("-") + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + + graph_decimal = ( + pynutil.delete("decimal {") + + delete_space + + optional_sign + + decimal.numbers + + delete_space + + pynutil.delete("}") + ) + + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + optional_sign + + cardinal.numbers + + delete_space + + pynutil.delete("}") + ) + + unit = ( + pynutil.delete("units:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - NEMO_SPACE, 1) + + pynutil.delete('"') + + delete_space + ) + unit @= pynini.cdrewrite( + pynini.cross("\[SPACE\]", NEMO_SPACE), "", "", NEMO_SIGMA # noqa: W605 + ) # For space separated measures. 
+ + numbers_units = delete_space + unit + numbers_graph = (graph_cardinal | graph_decimal) + numbers_units + + one_graph = delete_space + pynutil.insert("1") + unit + pynutil.delete('cardinal { integer: "1" }') + + graph = optional_prefix + (numbers_graph | one_graph) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py new file mode 100644 index 000000000..a85f5b019 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/ordinal.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, delete_space + + +class OrdinalFst(GraphFst): + """ + Finite state transducer for verbalizing ordinal in Hebrew + e.g. 
ordinal { integer: "10" } -> 10 + """ + + def __init__(self): + super().__init__(name="ordinal", kind="verbalize") + graph = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py new file mode 100644 index 000000000..3d41b783b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/time.py @@ -0,0 +1,95 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.utils import get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import ( + NEMO_DIGIT, + NEMO_NOT_QUOTE, + delete_space, + delete_zero_or_one_space, + insert_space, +) + + +class TimeFst(GraphFst): + """ + Finite state transducer for verbalizing time in Hebrew + e.g. time { hours: "2" minutes: "55" suffix: "בלילה" } -> 2:55 בלילה + e.g. time { hours: "2" minutes: "57" suffix: "בבוקר" } -> 2:57 בבוקר + e.g. time { morphosyntactic_features: "ב" hours: "6" minutes: "32" suffix: "בערב" } -> ב-18:32 בערב + e.g. 
time { morphosyntactic_features: "בשעה" hours: "2" minutes: "10" suffix: "בצהריים" } -> בשעה-14:10 בצהריים + + """ + + def __init__(self): + super().__init__(name="time", kind="verbalize") + + hour_to_noon = pynini.string_file(get_abs_path("data/time/hour_to_noon.tsv")) + hour_to_evening = pynini.string_file(get_abs_path("data/time/hour_to_evening.tsv")) + hour_to_night = pynini.string_file(get_abs_path("data/time/hour_to_night.tsv")) + + day_suffixes = pynini.string_file(get_abs_path("data/time/day_suffix.tsv")) + day_suffixes = insert_space + pynutil.delete('suffix: "') + day_suffixes + pynutil.delete('"') + + noon_suffixes = pynini.string_file(get_abs_path("data/time/noon_suffix.tsv")) + noon_suffixes = insert_space + pynutil.delete('suffix: "') + noon_suffixes + pynutil.delete('"') + + evening_suffixes = pynini.string_file(get_abs_path("data/time/evening_suffix.tsv")) + evening_suffixes = insert_space + pynutil.delete('suffix: "') + evening_suffixes + pynutil.delete('"') + + night_suffixes = pynini.string_file(get_abs_path("data/time/night_suffix.tsv")) + night_suffixes = insert_space + pynutil.delete('suffix: "') + night_suffixes + pynutil.delete('"') + + hour = ( + pynutil.delete("hours:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + + minute = ( + pynutil.delete("minutes:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_DIGIT, 1) + + pynutil.delete('"') + ) + + prefix = ( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.insert("-") + + pynutil.delete('"') + ) + + optional_prefix = pynini.closure(prefix + delete_zero_or_one_space, 0, 1) + optional_suffix = pynini.closure(delete_space + day_suffixes, 0, 1) + graph = hour + delete_space + pynutil.insert(":") + minute + optional_suffix + + for hour_to, suffix in zip( + [hour_to_noon, hour_to_evening, hour_to_night], + [noon_suffixes, 
evening_suffixes, night_suffixes], + ): + graph |= hour @ hour_to + delete_space + pynutil.insert(":") + minute + delete_space + suffix + + graph |= optional_prefix + graph + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py new file mode 100644 index 000000000..0223259db --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.date import DateFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.decimal import DecimalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.measure import MeasureFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.whitelist import WhiteListFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars in Hebrew. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + + ordinal_graph = OrdinalFst().fst + + decimal = DecimalFst() + decimal_graph = decimal.fst + + measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst + + time_graph = TimeFst().fst + + date_graph = DateFst().fst + + whitelist_graph = WhiteListFst().fst + + graph = ( + time_graph | date_graph | measure_graph | ordinal_graph | decimal_graph | cardinal_graph | whitelist_graph + ) + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py new file mode 100644 index 000000000..611181df4 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/verbalize_final.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.he.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.en.graph_utils import delete_extra_space, delete_space + + +class VerbalizeFinalFst(GraphFst): + """ + Finite state transducer that verbalizes an entire sentence in Hebrew + """ + + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + super().__init__(name="verbalize_final", kind="verbalize") + verbalize = VerbalizeFst().fst + word = WordFst().fst + types = verbalize | word + graph = ( + pynutil.delete("tokens") + + delete_space + + pynutil.delete("{") + + delete_space + + types + + delete_space + + pynutil.delete("}") + ) + graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space + self.fst = graph diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py new file mode 100644 index 000000000..0607e0b37 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/whitelist.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import NEMO_ALPHA_HE, GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WhiteListFst(GraphFst): + """ + Finite state transducer for verbalizing whitelist + e.g. tokens { name: "mrs." } -> mrs. + """ + + def __init__(self): + super().__init__(name="whitelist", kind="verbalize") + # Keep the prefix if exists and add a dash + optional_prefix = pynini.closure( + pynutil.delete("morphosyntactic_features:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_ALPHA_HE, 1) + + pynutil.delete('"') + + delete_space, + 0, + 1, + ) + graph = ( + pynutil.delete("name:") + + delete_space + + pynutil.delete('"') + + pynini.closure(NEMO_CHAR - " ", 1) + + pynutil.delete('"') + ) + graph = graph @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) + final_graph = optional_prefix + graph + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py new file mode 100644 index 000000000..49c61cf6a --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/verbalizers/word.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.he.graph_utils import GraphFst +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, delete_space + + +class WordFst(GraphFst): + """ + Finite state transducer for verbalizing plain tokens + e.g. tokens { name: "sleep" } -> sleep + """ + + def __init__(self): + super().__init__(name="word", kind="verbalize") + chars = pynini.closure(NEMO_CHAR - " ", 1) + char = pynutil.delete("name:") + delete_space + pynutil.delete('"') + chars + pynutil.delete('"') + graph = char @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) + + self.fst = graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..da85318b1 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'he': # Hebrew + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize_final import ( + 
VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'he', 'hi', 'hy', 'mr', 'ja'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/he/__init__.py b/tests/nemo_text_processing/he/__init__.py new file mode 100644 index 000000000..bc443be41 --- /dev/null +++ b/tests/nemo_text_processing/he/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt new file mode 100644 index 000000000..cfb6f8db0 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_cardinal.txt @@ -0,0 +1,138 @@ +אפס~אפס +מינוס שלוש~-3 +עשר~עשר +שלוש עשרה~13 +שלושה עשר~13 +עשרים~20 +עשרים ותשע~29 +עשרים ותשעה~29 +ארבעים~40 +מינוס ארבעים ושש~-46 +שבעים ושבעה~77 +מאה~100 +מאה ואחת~101 +מאה ועשר~110 +מאה ושש עשרה~116 +מאה עשרים~120 +מאה ועשרים~120 +כמאה עשרים וחמש~כ-125 +מאתיים~200 +מאתיים ושלוש~203 +מאתיים שלושים~230 +שלוש מאות ושלושים~330 +מינוס מאתיים שישים ושבע~-267 +ארבע מאות~400 +כחמש מאות עובדים~כ-500 עובדים +חמש מאות שבעים ותשע~579 +תשע מאות תשעים~990 +תשע מאות תשעים ותשע~999 +אלף~1,000 +אלף וארבע~1,004 +אלף עשרים ושמונה~1,028 +אלף מאה וחמש~1,105 +אלף מאה שלושים~1,130 +אלף תשע מאות תשעים ואחת~1,991 +אלפיים~2,000 +אלפיים וחמש~2,005 +אלפיים ועשר~2,010 +אלפיים ואחת עשרה~2,011 +אלפיים מאה~2,100 +אלפיים מאתיים~2,200 +מינוס אלפיים מאתיים עשרים ושתיים~-2,222 +אלפיים שלוש מאות~2,300 +אלפיים ארבע מאות ושבע~2,407 +מינוס אלפיים ארבע מאות שבעים~-2,470 +מינוס אלפיים ארבע מאות שבעים וחמש~-2,475 +שלושת אלפים~3,000 +שלושת אלפים וחמש~3,005 +שלושת אלפים ועשר~3,010 +שלושת אלפים וארבע עשרה~3,014 +שלושת אלפים מאה~3,100 +שלושת אלפים מאתיים~3,200 +מינוס שלושת אלפים שבע מאות עשרים ואחת~-3,721 +שלושת אלפים שמונה מאות~3,800 +שלושת אלפים ושמונה מאות~3,800 +שלושת אלפים תשע מאות ושבע~3,907 +מינוס שלושת אלפים מאתיים ועשרים~-3,220 +חמשת אלפים~5,000 +תשעת אלפים תשע מאות תשעים ותשע~9,999 +עשרת אלפים~10,000 +עשרת אלפים ואחת~10,001 +עשרת אלפים וחמש עשרה~10,015 +עשרת אלפים ועשרים~10,020 +עשרת אלפים עשרים ושלוש~10,023 +עשרת אלפים מאתיים~10,200 +עשרת אלפים מאתיים ואחד~10,201 +עשרת אלפים מאתיים ארבעים~10,240 +עשרת אלפים מאתיים וארבעים~10,240 +עשרת אלפים שלוש מאות חמישים~10,350 +עשרת אלפים שלוש מאות וחמישים~10,350 +שתיים עשרה אלף שש מאות~12,600 +שתיים 
עשרה אלף ושש מאות~12,600 +שתיים עשרה אלף שש מאות ואחת~12,601 +כשמונים ושבע אלף ועשר~כ-87,010 +תשעים ותשע אלף תשע מאות תשעים ותשע~99,999 +מאה אלף~100,000 +כמאה אלף תושבים~כ-100,000 תושבים +מאה אלף ושלוש~100,003 +מאה אלף ושתיים עשרה~100,012 +מאה אלף וארבעים~100,040 +מאה אלף ארבעים ושבע~100,047 +מאה אלף וארבעים ושבע~100,047 +מאה אלף ומאה~100,100 +מאה אלף מאה~100,100 +מאה אלף מאה שלושים ושלוש~100,133 +מאה ואחד אלף~101,000 +מאה ואחד אלף ואחת~101,001 +מאה ואחד אלף ועשר~101,010 +מאה ואחד אלף ואחת עשרה~101,011 +מאה ואחד אלף מאתיים~101,200 +כמאה ואחד אלף ומאתיים~כ-101,200 +מינוס מאה ואחת אלף מאתיים ועשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים~-101,220 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מינוס מאה ואחת אלף מאתיים עשרים ותשע~-101,229 +מאה ושתיים אלף~102,000 +מאה ושלוש אלף חמש מאות~103,500 +מאה ושלוש אלף וחמש מאות~103,500 +מאה וארבע אלף חמש מאות וארבע~104,504 +מאתיים ארבעים אלף~240,000 +מאתיים וארבעים אלף~240,000 +מאתיים חמישים וחמש אלף ושש~255,006 +מאתיים חמישים וחמש אלף וארבע מאות ושש~255,406 +מאתיים חמישים וחמש אלף ארבע מאות ושש~255,406 +חמש מאות חמישים וחמש אלף~555,000 +תשע מאות תשעים ותשע אלף תשע מאות תשעים ותשע~999,999 +מיליון~1,000,000 +מיליון ואחת~1,000,001 +מיליון ועשר~1,000,010 +מיליון חמש עשרה~1,000,015 +מיליון ושבעים~1,000,070 +מיליון שבעים~1,000,070 +מיליון ארבע מאות~1,000,400 +מיליון וארבע מאות~1,000,400 +מיליון ארבע מאות עשרים~1,000,420 +מיליון ארבע מאות ועשרים~1,000,420 +מיליון אלף~1,001,000 +מיליון שלושת אלפים~1,003,000 +מיליון ואלף~1,001,000 +מיליון אלף ואחת~1,001,001 +שלושה מיליון אלף~3,001,000 +שלושה מיליוןאלף וחמש~3,001,005 +שלושה מיליון ארבעים ושלוש אלף~3,043,000 +שלושה מיליון ארבעים ושלוש אלף ואחת~3,043,001 +שלושה מיליון ארבעים ושלוש אלף ושישים ואחת~3,043,061 +שלושה מיליון ארבעים ושלוש אלף שישים ואחת~3,043,061 +שלושה מיליון חמש מאות ארבעים ושלוש אלף~3,543,000 +שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~3,543,007 +מינוס שלושה מיליון חמש מאות ארבעים ושלוש אלף ושבע~-3,543,007 +עשר מיליון~10 מיליון +עשרה מיליון~10 מיליון +עשרים מיליון~20 
מיליון +חמש עשרה מיליון~15 מיליון +שלוש עשרה מיליון ארבעים ושלוש אלף~13,043,000 +מאה מיליון~100 מיליון +מאה עשרים ושתיים מיליון~122 מיליון +מאה עשרים ושתיים מיליון ושלוש עשרה~122,000,013 +מאה עשרים ושתיים מיליון חמישים אלף ושלוש עשרה~122,050,013 +שלוש אלף~3,000 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..96b745de4 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_date.txt @@ -0,0 +1,29 @@ +אחד במאי אלף תשע מאות שמונים ושלוש~1.5.1983 +השתיים עשרה לשתיים עשרה אלף תשע מאות תשעים ואחת~ה-12.12.1991 +השתיים עשרה בדצמבר אלף תשע מאות תשעים ואחת~ה-12.12.1991 +בינואר עשרים עשרים ואחת~בינואר 2021 +בשלישי לשלישי אלף תשע מאות תשעים~ב-3.3.1990 +העשירי באפריל~ה-10.4 +אחד במאי~1.5 +הראשון לחמישי~הראשון לחמישי +יוני אלפיים וחמש עשרה~יוני 2015 +ביוני אלפיים וחמש עשרה~ביוני 2015 +מתחיל בספטמבר עשרים עשרים~מתחיל בספטמבר 2020 +בשבעה עשר באוגוסט~ב-17.8 +בשבעה עשר באוגוסט עשרים שלושים~ב-17.8.2030 +בשבעה עשר לשמיני עשרים שלושים~ב-17.8.2030 +עשרים ושישי לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושש לרביעי עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לאפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושש באפריל עשרים עשרים וארבע~26.4.2024 +עשרים ושישי לרביעי עשרים וארבע~26.4.24 +עשרים ושש לרביעי עשרים וארבע~26.4.24 +עשרים ושישי לאפריל עשרים וארבע~26.4.24 +עשרים ושש באפריל עשרים וארבע~26.4.24 +עשרים ושישה באפריל עשרים וארבע~26.4.24 +אנשים לא ידעו אחד מהשני~אנשים לא ידעו אחד מהשני +בשבעה באוקטובר~ב-7.10 +בשנת אלפיים וחמש~בשנת 2005 +משנת עשרים עשרים ואחת~משנת 2021 +השנה אלפיים ושלוש~השנה 2003 +שנת אלפיים וארבע~שנת 2004 \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt new file mode 100644 index 000000000..b864e264a --- /dev/null +++ 
b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_decimal.txt @@ -0,0 +1,66 @@ +חמש נקודה שתיים מיליון~5.2 מיליון +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +ארבע מאות מיליון~400 מיליון +חמישים מיליארד~50 מיליארד +ארבע מאות וחמש מיליארד~405 מיליארד +ארבע נקודה שמונים וחמש מיליארד~4.85 מיליארד +מאה מיליארד~100 מיליארד +מאה ועשר מיליארד~110 מיליארד +מאה שלושים ושתיים מיליארד~132 מיליארד +אחד נקודה שמונים וארבע מיליארד~1.84 מיליארד +אחד נקודה שמונים ואחת מיליארד~1.81 מיליארד +אחד נקודה חמש תשע מיליארד~1.59 מיליארד +אחד נקודה ארבע חמש שלוש מיליארד~1.453 מיליארד +אחד נקודה שבעים ושתיים מיליארד~1.72 מיליארד +אחד נקודה שתיים חמש מיליארד~1.25 מיליארד +שלוש עשרה מיליארד~13 מיליארד +שלושים מיליארד~30 מיליארד +אלפיים שמונה מאות וחמש נקודה שמונה שבע שלוש מיליון~2,805.873 מיליון +עשרה מיליון~10 מיליון +עשר מיליון~10 מיליון +חמש מיליון~5 מיליון +חמש מאות מיליון~500 מיליון +שתיים עשרה מיליון~12 מיליון +שניים עשר מיליון~12 מיליון +שלוש עשרה מיליון~13 מיליון +ארבע מיליון~4 מיליון +ארבעים וחמש מיליון~45 מיליון +חמש עשרה מיליארד~15 מיליארד +שני מיליון~2 מיליון +שתי מיליון~2 מיליון +שמונה מיליון~8 מיליון +מינוס שישים נקודה שתיים ארבע אפס אפס~-60.2400 +אפס נקודה עשרים ושש~0.26 +אפס נקודה שתיים שש~0.26 +שישים נקודה שתיים~60.2 +שמונה עשרה נקודה שמונים וחמש~18.85 +שמונה עשרה נקודה חמש אפס~18.50 +שמונה עשרה נקודה חמישים ושש~18.56 +שמונה עשרה נקודה תשע~18.9 +שמונה עשרה נקודה אפס חמש~18.05 +שמונה עשרה נקודה שתיים עשרה~18.12 +שמונה עשרה נקודה אפס אחד~18.01 +שמונה עשרה נקודה אפס אפס אפס~18.000 +שמונה עשרה נקודה שש~18.6 +שמונה עשרה נקודה שלוש אפס אפס~18.300 +שמונה עשרה נקודה שלושים ושש~18.36 +שמונה עשרה נקודה שתיים חמש~18.25 +שמונה עשרה נקודה עשרים ושתיים~18.22 +שמונה מאות ושמונה עשרה נקודה שלוש אפס שלוש~818.303 +שמונה מאות ושמונה נקודה שמונה~808.8 +שמונה מאות ושמונה נקודה אפס~808.0 +שמונה מאות שמונים ושמונה נקודה אחד~888.1 +שמונה מאות שמונים וארבע נקודה שלוש~884.3 +שמונה מאות שמונים ושתיים נקודה שמונה~882.8 +שמונה מאות שמונים ושתיים נקודה אפס~882.0 +שמונה מאות 
ארבעים וחמש נקודה תשעים וארבע~845.94 +שבעים ותשע וחצי~79.5 +שתיים ורבע~שתיים ורבע +שלוש ועשירית~3.1 +מינוס שלוש וחצי~-3.5 +עשר ושתי עשיריות~10.2 +שתיים ושלושת רבעי~2.75 +שתיים עשרה אלף ושתיים עשרה נקודה שתיים עשרה~12,012.12 +שתים עשרה אלף ושתים עשרה נקודה שתים עשרה~12,012.12 +שתיים ועשירית~2.1 +אחת ועשירית~1.1 diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt new file mode 100644 index 000000000..3d0a40a07 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_measure.txt @@ -0,0 +1,10 @@ +מינוס חמש עשרה אחוז~-15% +חמש עשרה אחוז~15% +מינוס שתים עשרה נקודה חמש מעלות ~-12.5° +שתיים עשרה נקודה חמש מעלות~12.5° +שתיים עשרה נקודה חמש מעלות צלסיוס~12.5°C +אלף אחוזים~1,000% +אחוז אחד~1% +מאתיים חמישים גרם~250 ג׳ +סנטימטר אחד~1 ס״מ +שלוש מיליגרם~3 מ״ג diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt new file mode 100644 index 000000000..0f6464445 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_time.txt @@ -0,0 +1,34 @@ +בשעה חמש בצהריים~בשעה 17:00 בצהריים +בחמש בצהריים~ב-17:00 בצהריים +רבע לשש בבוקר~5:45 בבוקר +בתשע בבוקר~ב-9:00 בבוקר +השעה עשרים וחמישה לאחת בצהריים~השעה 12:35 בצהריים +נפגשנו באחת ושתי דקות בצהריים~נפגשנו ב-13:02 בצהריים +נפגשנו באחת ושלוש דקות בצהריים~נפגשנו ב-13:03 בצהריים +נפגשנו באחת וחמישה בצהריים~נפגשנו ב-13:05 בצהריים +שתיים ועשרה בבוקר~2:10 בבוקר +בשעה שתיים ועשרה בצהריים~בשעה 14:10 בצהריים +בשתיים ועשרה אחרי הצהריים~ב-14:10 אחרי הצהריים +שלוש ודקה בצהריים~15:01 בצהריים +ארבע ושלוש דקות אחרי הצהריים~16:03 אחרי הצהריים +שש ועשרים דקות בערב~18:20 בערב +בשש וחצי בערב~ב-18:30 בערב +חמישה לשלוש בבוקר~2:55 בבוקר +רבע לשש בערב~17:45 בערב +שלוש בצהריים~15:00 בצהריים +אחת לפנות בוקר~1:00 לפנות בוקר +אתמול בחמש אחרי 
הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור~אתמול ב-17:00 אחרי הצהריים יצאנו עם אמא למכולת ובדרך ראינו שהגן שלנו סגור +חמישה לחצות~23:55 +ברבע לחצות~ב-23:45 +בשעה חצות ועשרה~בשעה 0:10 +בחצות ודקה~ב-0:01 +חצות ושתיים עשרה דקות~0:12 +שלוש דקות לחצות~23:57 +חצות ושתי דקות~0:02 +חצות~0:00 +דקה לשלוש בצהריים~14:59 בצהריים +הפגישה זזה משבע בבוקר לשמונה וחצי בבוקר~הפגישה זזה מ-7:00 בבוקר ל-8:30 בבוקר +באחת בלילה~ב-1:00 בלילה +חמש לפנות ערב~17:00 לפנות ערב +בשלוש לפנות בוקר~ב-3:00 לפנות בוקר +עשרים לחמש אחרי הצהריים~16:40 אחרי הצהריים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..67e4d6560 --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,4 @@ +בשנת שבעים לפני הספירה~בשנת 70 לפנה״ס +יש מאתיים חמישים עורכי דין חדשים~יש 250 עו״ד חדשים +ישראל היא המדינה החמישים ואחת של ארצות הברית~ישראל היא המדינה ה-51 של ארה״ב +דוקטור שמילוביץ רשם לי תרופה חדשה~ד״ר שמילוביץ רשם לי תרופה חדשה \ No newline at end of file diff --git a/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt new file mode 100644 index 000000000..1bf28b0fb --- /dev/null +++ b/tests/nemo_text_processing/he/data_inverse_text_normalization/test_full_sentences.txt @@ -0,0 +1,56 @@ +אתמול בשעה שבע וחצי בבוקר היה לי תור לרופא~אתמול בשעה 7:30 בבוקר היה לי תור לרופא +הגעתי למרפאה בשבע ורבע בבוקר כדי לא לאחר~הגעתי למרפאה ב-7:15 בבוקר כדי לא לאחר +אמרתי לרופא שאני בן חמישים ושלוש שאני נשוי ויש לי שלושה ילדים.~אמרתי לרופא שאני בן 53 שאני נשוי ויש לי שלושה ילדים. 
+אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי ~אמרתי לו שיש לי כאבים ביד, אז הוא בדק אותי +הוא אמר שיש חשד לשבר באמה של בין שני סנטימטר לארבע סנטימטר ושנצטרך לעשות צילום כדי לדעת~הוא אמר שיש חשד לשבר באמה של בין 2 ס״מ ל-4 ס״מ ושנצטרך לעשות צילום כדי לדעת +בינתיים הוא רשם לי עשר מיליגרם של משככי כאבים~בינתיים הוא רשם לי 10 מ״ג של משככי כאבים +הוא אמר שזה מאוד נפוץ ושליותר מעשר אחוז מהאוכלוסיה יש את זה~הוא אמר שזה מאוד נפוץ ושליותר מ-10% מהאוכלוסיה יש את זה +הוא העריך את סיכויי ההחלמה בשמונים ושלוש נקודה שש אחוז~הוא העריך את סיכויי ההחלמה ב-83.6% +בסוף המפגש הוא קבע ביקורת לשמיני באוגוסט~בסוף המפגש הוא קבע ביקורת ל-8.8 +יש לי חמישה תפוחים~יש לי חמישה תפוחים +אף אחד לא רוצה~אף אחד לא רוצה +בכל כיתה יש עשרים, עשרים ושתיים תלמידים~בכל כיתה יש 20 , 22 תלמידים +בכל כיתה יש עשרים - עשרים ושתיים תלמידים~בכל כיתה יש 20 - 22 תלמידים +אחת עשרה אלף שבע מאות חמישים ושש~11,756 +ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של שלושים נקודה שתיים עשרה אחוז במכירות~ע"פ הנתונים החדשים שקיבלנו הייתה עלייה של 30.12% במכירות +אני בטוח בזה במאה אחוז~אני בטוח בזה ב-100% +יש לזה שלושים ותשע נקודה שישים ושבע אחוז הצלחה~יש לזה 39.67% הצלחה +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +היום יום שני ומחר יום שלישי~היום יום שני ומחר יום שלישי +שלוש וחצי קילוגרם~3.5 ק״ג +חמש ורבע סנטימטר~5.25 ס״מ +שמונה ושלושת רבעי~8.75 +שמונה ורבע מיליון~8.25 מיליון +שתיים וחצי~שתיים וחצי +שתיים וחצי מיליון~2.5 מיליון +בשתיים וחצי מיליון~ב-2.5 מיליון +שתיים וחצי בבוקר~2:30 בבוקר +מינוס שלוש וחצי אחוז~-3.5% +שלוש וחצי~שלוש וחצי +עשרת אלפים ומאתיים ארבעים~10,240 +אפס מאופס~אפס מאופס +הוא מתנהג כמו אפס.... בקיצור כל עניין האפס~הוא מתנהג כמו אפס.... 
בקיצור כל עניין האפס +מאה שישים וארבע נקודה חמישים ושמונה אלף~164.58 אלף +הפגישה זזה משבע וחצי בבוקר לשמונה~הפגישה זזה מ-7:30 בבוקר לשמונה +על סמך זה יצאנו ביום ראשון~על סמך זה יצאנו ביום ראשון +צעירים היו בגיל שלושים שלושים וחמש~צעירים היו בגיל 30 35 +אולי שניים שלושה~אולי שניים שלושה +בן הראשון שלי שנולד~בן הראשון שלי שנולד +אנחנו היינו איזה חמישה עשר איש~אנחנו היינו איזה 15 איש +התחילו לחזור וחזרו אחד אחד~התחילו לחזור וחזרו אחד אחד +וזה היה כבר אולי שעה תשע~וזה היה כבר אולי שעה תשע +אני מדבר על שמונה עשר באפריל~אני מדבר על 18.4 +שמונה עשר בינואר~18.1 +הייתה נראית כעת שתיים עשרה שלוש עשרה~הייתה נראית כעת 12 13 +היה בערך בעשירי בעשירי למאי~היה בערך בעשירי ב-10.5 +באמצע הלילה שתיים בלילה~באמצע הלילה 2:00 בלילה +למחרת בשעה חמש~למחרת בשעה חמש +בשנת אלף תשע מאות ארבעים ושמונה~בשנת 1948 +באלף תשע מאות ארבעים ושמונה~ב-1,948 +ארבע מאות וחמישים מיליגרם~450 מ״ג +ארבע וחצי~ארבע וחצי +יהי אפסילון אפס ויהי איקס~יהי אפסילון אפס ויהי איקס +לשתינו יש שתי בנות~לשתינו יש שתי בנות +מחר בשש וחצי בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד אחת וחצי בצהריים, אולי רבע לשתיים, ונרד בשביל הנחש~מחר ב-6:30 בבוקר נעלה על האוטובוסים ונצא לטיול השנתי. בשעה שמונה נגיע למצדה ונתחיל לטפס למעלה, נהיה שם עד 13:30 בצהריים , אולי 1:45 , ונרד בשביל הנחש +יש לי חמישה תפוחים~יש לי חמישה תפוחים \ No newline at end of file diff --git a/tests/nemo_text_processing/he/test_cardinal.py b/tests/nemo_text_processing/he/test_cardinal.py new file mode 100644 index 000000000..4700725b1 --- /dev/null +++ b/tests/nemo_text_processing/he/test_cardinal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_date.py b/tests/nemo_text_processing/he/test_date.py new file mode 100644 index 000000000..73c183e7b --- /dev/null +++ b/tests/nemo_text_processing/he/test_date.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDate: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_decimal.py b/tests/nemo_text_processing/he/test_decimal.py new file mode 100644 index 000000000..125fc31d0 --- /dev/null +++ b/tests/nemo_text_processing/he/test_decimal.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestDecimal: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_decimal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_full_sentences.py b/tests/nemo_text_processing/he/test_full_sentences.py new file mode 100644 index 000000000..0bc9251a7 --- /dev/null +++ b/tests/nemo_text_processing/he/test_full_sentences.py @@ -0,0 +1,31 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestFullSentences: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_full_sentences.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_measure.py b/tests/nemo_text_processing/he/test_measure.py new file mode 100644 index 000000000..1649effa7 --- /dev/null +++ b/tests/nemo_text_processing/he/test_measure.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestMeasure: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_measure.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..bce2e24b9 --- /dev/null +++ b/tests/nemo_text_processing/he/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,61 @@ +#! /bin/sh + +PROJECT_DIR=/workspace/tests + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +PROJECT_DIR=${2:-"/workspace/tests/"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +testITNDate() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_date.txt + runtest $input +} + +testITNDecimal() 
{ + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_decimal.txt + runtest $input +} + + +testITNTime() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_time.txt + runtest $input +} + +testITNMeasure() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_measure.txt + runtest $input +} + + +testITNWhitelist() { + input=$PROJECT_DIR/he/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + + +# Load shUnit2 +. $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/he/test_time.py b/tests/nemo_text_processing/he/test_time.py new file mode 100644 index 000000000..f3bba67b5 --- /dev/null +++ b/tests/nemo_text_processing/he/test_time.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestTime: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_time.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/he/test_whitelist.py b/tests/nemo_text_processing/he/test_whitelist.py new file mode 100644 index 000000000..fb14c2a58 --- /dev/null +++ b/tests/nemo_text_processing/he/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer_he = InverseNormalizer(lang='he', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('he/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_he.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 445c71c98..2e10e81f2 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -101,6 +101,7 @@ def parse_args(): 'ar', 'it', 'es_en', + 'he', 'hi', 'hy', 'mr', @@ -290,6 +291,13 @@ def parse_args(): from nemo_text_processing.inverse_text_normalization.mr.verbalizers.verbalize import ( VerbalizeFst as ITNVerbalizeFst, ) + elif args.language == 'he': + from nemo_text_processing.inverse_text_normalization.he.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.he.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'hy': from nemo_text_processing.inverse_text_normalization.hy.taggers.tokenize_and_classify import ( ClassifyFst as ITNClassifyFst, ) @@ -319,6 +327,8 @@ def parse_args(): ClassifyFst as TNClassifyFst, ) from nemo_text_processing.text_normalization.rw.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + else: + raise KeyError(f"Language {args.language} is not defined for export.") output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}") export_grammars( 
output_dir=output_dir, From a63ca92445993b115e2d58e86a40280035971e3b Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Tue, 18 Nov 2025 15:50:29 -0800 Subject: [PATCH 2/3] Hebrew itn (#368) * rebase to main Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebase Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebasing Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * responding to formatting pr Signed-off-by: tbartley94 * isort and moving string map to string file Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * forgot init Signed-off-by: tbartley94 --------- Signed-off-by: tbartley94 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../inverse_text_normalization/he/data/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/__init__.py diff --git a/nemo_text_processing/inverse_text_normalization/he/data/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 03cb0634c5b8627bff7966af9dbc3ed557d7b02c Mon Sep 17 00:00:00 2001 From: tbartley94 Date: Tue, 18 Nov 2025 16:05:03 -0800 Subject: [PATCH 3/3] Hebrew itn (#369) * rebase to main Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebase Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rebasing Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * responding to formatting pr Signed-off-by: tbartley94 * isort and moving string map to string file Signed-off-by: tbartley94 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * forgot init Signed-off-by: tbartley94 * more inits Signed-off-by: tbartley94 --------- Signed-off-by: tbartley94 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../he/data/decimals/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 nemo_text_processing/inverse_text_normalization/he/data/decimals/__init__.py diff --git a/nemo_text_processing/inverse_text_normalization/he/data/decimals/__init__.py b/nemo_text_processing/inverse_text_normalization/he/data/decimals/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/he/data/decimals/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.