Skip to content

Commit 5e73dc1

Browse files
committed
Add ensure_ascii option
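This option lets you configure what to do with unknown characters. When `ensure_ascii` is true (the default), each unknown character is replaced with `?`; when it is false, the original text is passed through unchanged. This also adds tests, and fixes a bug where the slug tests were not run because a later test function reused the name `test_romaji_slugs`.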
1 parent 90830f7 commit 5e73dc1

File tree

2 files changed

+15
-7
lines changed

2 files changed

+15
-7
lines changed

cutlet/cutlet.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def __init__(self, system='hepburn'):
8585
self.use_wo = (self.system in ('hepburn', 'nihon'))
8686

8787
self.use_foreign_spelling = True
88+
self.ensure_ascii = True
8889

8990
def add_exception(self, key, val):
9091
self.exceptions[key] = val
@@ -189,11 +190,15 @@ def romaji_word(self, word):
189190
if word.char_type == 6 or word.char_type == 7: # hiragana/katakana
190191
kana = jaconv.kata2hira(word.surface)
191192
return self.map_kana(kana)
192-
elif word.char_type == 2: # kanji we don't know, like 彁
193+
194+
# At this point this is an unknown word and not kana. Could be
195+
# unknown kanji, could be hangul, cyrillic, something else.
196+
# By default ensure ascii by replacing with ?, but allow pass-through.
197+
if self.ensure_ascii:
193198
out = '?' * len(word.surface)
194199
return out
195-
# At this point it could be hangul or cyrillic or something
196-
return word.surface
200+
else:
201+
return word.surface
197202

198203
if word.feature.pos1 == '補助記号':
199204
# If it's punctuation we don't recognize, just discard it

cutlet/test/test_basic.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@
6262
("ケメコデラックス", "Kemekoderakkusu"),
6363
("プププランド", "Pupupurando"),
6464
# Add some non-Japanese tests
65-
("панда", "Панда"),
66-
("팬더", "팬더"),
65+
("панда", "?????"),
66+
("팬더", "??"),
6767
("「彁」は幽霊文字のひとつ", '"?" wa yuurei moji no hitotsu'),
6868
]
6969

@@ -78,10 +78,13 @@
7878
"kagawa-game-jourei-pabukome-wa-sansei-tasuu-dakara-saiketsu-shite-wa-to-hatsugen-shita-no-wa-dare-datta-no-ka"),
7979
("コトヤマ「よふかしのうた」3巻発売記念のPV公開、期間限定で1巻の無料配信も",
8080
"koto-yama-yo-fukashi-no-uta-3-kan-hatsubai-kinen-no-p-v-koukai-kikan-gentei-de-1-kan-no-muryou-haishin-mo"),
81+
# Include some unks
82+
("彁は幽霊文字", "wa-yuurei-moji"),
83+
("パンダはロシア語でпанда", "panda-wa-rossiya-go-de"),
8184
]
8285

8386
NON_FOREIGN = [
84-
("カツカレーは美味しい", "Katsu karee wa oishii")
87+
("カツカレーは美味しい", "Katsu karee wa oishii"),
8588
]
8689

8790
@pytest.mark.parametrize('ja, roma', WORDS)
@@ -112,7 +115,7 @@ def test_romaji_slugs(ja, roma):
112115
assert cut.slug(ja) == roma
113116

114117
@pytest.mark.parametrize('ja, roma', NON_FOREIGN)
115-
def test_romaji_slugs(ja, roma):
118+
def test_romaji_non_foreign(ja, roma):
116119
cut = Cutlet()
117120
cut.use_foreign_spelling = False
118121
assert cut.romaji(ja) == roma

0 commit comments

Comments
 (0)