Skip to content

Commit d962cda

Browse files
committed
Deal with odoriji
This doesn't handle them all that well but it shouldn't blow up any more
1 parent 262e92f commit d962cda

File tree

3 files changed

+41
-0
lines changed

3 files changed

+41
-0
lines changed

cutlet/cutlet.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
SUTEGANA = 'ゃゅょぁぃぅぇぉ'
1212
PUNCT = '\'".!?(),;:-'
13+
# Iteration marks (odoriji): 々 (kanji), 〃 (ditto mark), ゝ/ゞ (hiragana
# plain/voiced) and ヽ/ヾ (katakana plain/voiced). The katakana voiced mark ヾ
# must be listed here, otherwise the voiced-repeat branch that tests for
# 'ゞヾ' can never be reached for it.
ODORI = '々〃ゝゞヽヾ'
1314

1415
SYSTEMS = {
1516
'hepburn': HEPBURN,
@@ -255,13 +256,32 @@ def map_kana(self, kana):
255256
return out
256257

257258
def get_single_mapping(self, pk, kk, nk):
259+
# handle odoriji
260+
# NOTE: This is very rarely useful at present because odoriji are not
261+
# left in readings for dictionary words, and we can't follow kana
262+
# across word boundaries.
263+
if kk in ODORI:
264+
if kk in 'ゝヽ':
265+
if pk: return pk
266+
else: return '' # invalid but be nice
267+
if kk in 'ゞヾ': # repeat with voicing
268+
if not pk: return ''
269+
vv = add_dakuten(pk)
270+
if vv: return self.table[vv]
271+
else: return ''
272+
# remaining are 々 for kanji and 〃 for symbols, but we can't
273+
# infer their span reliably (or handle rendaku)
274+
return ''
275+
276+
258277
# handle digraphs
259278
if pk and (pk + kk) in self.table:
260279
return self.table[pk + kk]
261280
if nk and (kk + nk) in self.table:
262281
return ''
263282

264283
if nk and nk in SUTEGANA:
284+
if kk == 'っ': return '' # never valid, just ignore
265285
return self.table[kk][:-1] + self.table[nk]
266286
if kk in SUTEGANA:
267287
return ''

cutlet/mapping.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,3 +188,12 @@
188188
NIHONSHIKI['ぢ'] = 'di'
189189
NIHONSHIKI['づ'] = 'du'
190190

191+
# Hiragana that can take a dakuten, and their voiced counterparts. The two
# strings are index-aligned: VOICED[i] is UNVOICED[i] with a dakuten added.
UNVOICED = 'かきくけこさしすせそたちつてとはひふへほ'
VOICED = 'がぎぐげござじずぜぞだぢづでどばびぶべぼ'

# Lookup table built once from the aligned strings above.
_DAKUTEN = dict(zip(UNVOICED, VOICED))


def add_dakuten(kk):
    """Return the voiced (dakuten) form of a single hiragana character.

    Args:
        kk: a one-character string.

    Returns:
        The corresponding character from VOICED if ``kk`` is in UNVOICED,
        otherwise ``None`` (for example for vowels, already-voiced kana, the
        empty string, or multi-character input).

    The previous implementation used ``str.index``, which raises
    ``ValueError`` on a miss instead of returning ``None``, so its ``None``
    check was dead code and callers crashed on any kana with no voiced form.
    """
    return _DAKUTEN.get(kk)
199+

cutlet/test/test_basic.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,25 @@
8585
("齋藤タヶオ", "Saitou ta ke o"),
8686
# っー
8787
("ずっーと", "Zu--to"),
88+
8889
# don't add spaces around apostrophe if it wasn't there
8990
("McDonald's", "McDonald's"),
9091
("Text McDonald's text", "Text McDonald's text"),
92+
9193
# Following are quote weirdness. Not good but hard to fix.
9294
# An issue is that ," or .' is a single token.
9395
("It's 'delicious.'", "It's ' delicious .'"),
9496
('"Hello," he said.', '" Hello ," he said.'),
97+
98+
# this is a very strange typo
99+
("アトランテッィク", "Atoranteku"),
100+
101+
# odoriji. Note at this point these rarely work properly, they mainly
102+
# don't blow up.
103+
('くゞる', 'Kuguru'), # note this is actually in unidic-lite
104+
('くヽる', 'Ku ru'),
105+
('今度クヾペへ行こう', 'Kondo kugupe e ikou'), # made up word
106+
('彁々', '?'),
95107
]
96108

97109
SENTENCES_KUNREI = [

0 commit comments

Comments
 (0)