Deal with odoriji

polm · polm · commit d962cda7c25b · 2021-10-20T23:15:09.000+09:00
This doesn't handle them all that well but it shouldn't blow up any more
diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
@@ -10,6 +10,7 @@
 
 SUTEGANA = 'ゃゅょぁぃぅぇぉ'
 PUNCT = '\'".!?(),;:-'
+ODORI = '々〃ゝゞヽヾ'  # iteration marks (odoriji); last is katakana ヾ, not a second ゞ
 
 SYSTEMS = {
         'hepburn': HEPBURN,
@@ -255,13 +256,32 @@ def map_kana(self, kana):
         return out
 
     def get_single_mapping(self, pk, kk, nk):
+        # handle odoriji
+        # NOTE: This is very rarely useful at present because odoriji are not
+        # left in readings for dictionary words, and we can't follow kana
+        # across word boundaries. 
+        if kk in ODORI:
+            if kk in 'ゝヽ':
+                if pk: return pk
+                else: return '' # invalid but be nice
+            if kk in 'ゞヾ': # repeat with voicing
+                if not pk: return ''
+                vv = add_dakuten(pk)
+                if vv: return self.table[vv]
+                else: return ''
+            # remaining are 々 for kanji and 〃 for symbols, but we can't
+            # infer their span reliably (or handle rendaku)
+            return ''
+        
+
         # handle digraphs
         if pk and (pk + kk) in self.table:
             return self.table[pk + kk]
         if nk and (kk + nk) in self.table:
             return ''
 
         if nk and nk in SUTEGANA:
+            if kk == 'っ': return '' # never valid, just ignore
             return self.table[kk][:-1] + self.table[nk]
         if kk in SUTEGANA:
             return ''
diff --git a/cutlet/mapping.py b/cutlet/mapping.py
@@ -188,3 +188,12 @@
 NIHONSHIKI['ぢ'] = 'di'
 NIHONSHIKI['づ'] = 'du'
 
+UNVOICED = 'かきくけこさしすせそたちつてとはひふへほ'
+VOICED =   'がぎぐげござじずぜぞだぢづでどばびぶべぼ'
+
+def add_dakuten(kk):
+    """Return kana `kk` with dakuten added, or None if it has no voiced form."""
+    # A dict lookup yields None on a miss; str.index raises ValueError instead
+    # and would also match '' or multi-kana substrings.
+    return dict(zip(UNVOICED, VOICED)).get(kk)
+
diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
@@ -85,13 +85,25 @@
         ("齋藤タヶオ", "Saitou ta ke o"),
         # っー
         ("ずっーと", "Zu--to"),
+
         # don't add spaces around apostrophe if it wasn't there
         ("McDonald's", "McDonald's"),
         ("Text McDonald's text", "Text McDonald's text"),
+
         # Following are quote weirdness. Not good but hard to fix.
         # An issue is that ," or .' is a single token.
         ("It's 'delicious.'", "It's ' delicious .'"),
         ('"Hello," he said.', '" Hello ," he said.'),
+
+        # this is a very strange typo
+        ("アトランテッィク", "Atoranteku"),
+
+        # odoriji. Note that at this point these rarely work properly; they
+        # mainly just don't blow up.
+        ('くゞる', 'Kuguru'), # note this is actually in unidic-lite
+        ('くヽる', 'Ku ru'),
+        ('今度クヾペへ行こう', 'Kondo kugupe e ikou'), # made up word
+        ('彁々', '?'),
         ]
 
 SENTENCES_KUNREI = [