# https://github.com/polm/cutlet/blob/master/cutlet/cutlet.py
from dataclasses import dataclass
from fugashi import Tagger
from num2kana import Convert
import mojimoji
import re
import unicodedata

# Kana -> phoneme table. Keys are single kana or two-kana digraphs; values are
# IPA-style strings. Katakana entries are written out explicitly and the
# hiragana equivalents are derived below by shifting code points down by 96.
HEPBURN = {
    chr(12449): 'a',  # ァ
    chr(12450): 'a',  # ア
    chr(12451): 'i',  # ィ
    chr(12452): 'i',  # イ
    chr(12453): 'ɯ',  # ゥ
    chr(12454): 'ɯ',  # ウ
    chr(12455): 'e',  # ェ
    chr(12456): 'e',  # エ
    chr(12457): 'o',  # ォ
    chr(12458): 'o',  # オ
    chr(12459): 'ka',  # カ
    chr(12460): 'ɡa',  # ガ
    chr(12461): 'ki',  # キ
    chr(12462): 'ɡi',  # ギ
    chr(12463): 'kɯ',  # ク
    chr(12464): 'ɡɯ',  # グ
    chr(12465): 'ke',  # ケ
    chr(12466): 'ɡe',  # ゲ
    chr(12467): 'ko',  # コ
    chr(12468): 'ɡo',  # ゴ
    chr(12469): 'sa',  # サ
    chr(12470): 'za',  # ザ
    chr(12471): 'ɕi',  # シ
    chr(12472): 'dʑi',  # ジ
    chr(12473): 'sɨ',  # ス
    chr(12474): 'zɨ',  # ズ
    chr(12475): 'se',  # セ
    chr(12476): 'ze',  # ゼ
    chr(12477): 'so',  # ソ
    chr(12478): 'zo',  # ゾ
    chr(12479): 'ta',  # タ
    chr(12480): 'da',  # ダ
    chr(12481): 'tɕi',  # チ
    chr(12482): 'dʑi',  # ヂ
    # chr(12483) #ッ
    chr(12484): 'tsɨ',  # ツ
    chr(12485): 'zɨ',  # ヅ
    chr(12486): 'te',  # テ
    chr(12487): 'de',  # デ
    chr(12488): 'to',  # ト
    chr(12489): 'do',  # ド
    chr(12490): 'na',  # ナ
    chr(12491): 'ɲi',  # ニ
    chr(12492): 'nɯ',  # ヌ
    chr(12493): 'ne',  # ネ
    chr(12494): 'no',  # ノ
    chr(12495): 'ha',  # ハ
    chr(12496): 'ba',  # バ
    chr(12497): 'pa',  # パ
    chr(12498): 'çi',  # ヒ
    chr(12499): 'bi',  # ビ
    chr(12500): 'pi',  # ピ
    chr(12501): 'ɸɯ',  # フ
    chr(12502): 'bɯ',  # ブ
    chr(12503): 'pɯ',  # プ
    chr(12504): 'he',  # ヘ
    chr(12505): 'be',  # ベ
    chr(12506): 'pe',  # ペ
    chr(12507): 'ho',  # ホ
    chr(12508): 'bo',  # ボ
    chr(12509): 'po',  # ポ
    chr(12510): 'ma',  # マ
    chr(12511): 'mi',  # ミ
    chr(12512): 'mɯ',  # ム
    chr(12513): 'me',  # メ
    chr(12514): 'mo',  # モ
    chr(12515): 'ja',  # ャ
    chr(12516): 'ja',  # ヤ
    chr(12517): 'jɯ',  # ュ
    chr(12518): 'jɯ',  # ユ
    chr(12519): 'jo',  # ョ
    chr(12520): 'jo',  # ヨ
    chr(12521): 'ra',  # ラ
    chr(12522): 'ri',  # リ
    chr(12523): 'rɯ',  # ル
    chr(12524): 're',  # レ
    chr(12525): 'ro',  # ロ
    chr(12526): 'wa',  # ヮ
    chr(12527): 'wa',  # ワ
    chr(12528): 'i',  # ヰ
    chr(12529): 'e',  # ヱ
    chr(12530): 'o',  # ヲ
    # chr(12531) #ン
    chr(12532): 'vɯ',  # ヴ
    chr(12533): 'ka',  # ヵ
    chr(12534): 'ke',  # ヶ
}
# Every code point in the range is covered except ッ and ン, which are
# context-dependent and handled in Katsu._get_single_mapping.
assert len(HEPBURN) == 84 and all(
    i in {12483, 12531} or chr(i) in HEPBURN for i in range(12449, 12535)
)

# Derive the hiragana entries: each is 96 code points below its katakana.
for k, v in list(HEPBURN.items()):
    HEPBURN[chr(ord(k) - 96)] = v
assert len(HEPBURN) == 84 * 2

HEPBURN.update({
    chr(12535): 'va',  # ヷ
    chr(12536): 'vi',  # ヸ
    chr(12537): 've',  # ヹ
    chr(12538): 'vo',  # ヺ
})
assert len(HEPBURN) == 84 * 2 + 4 and all(
    chr(i) in HEPBURN for i in range(12535, 12539)
)

HEPBURN.update({
    chr(12784): 'kɯ',  # ㇰ
    chr(12785): 'ɕi',  # ㇱ
    chr(12786): 'sɨ',  # ㇲ
    chr(12787): 'to',  # ㇳ
    chr(12788): 'nɯ',  # ㇴ
    chr(12789): 'ha',  # ㇵ
    chr(12790): 'çi',  # ㇶ
    chr(12791): 'ɸɯ',  # ㇷ
    chr(12792): 'he',  # ㇸ
    chr(12793): 'ho',  # ㇹ
    chr(12794): 'mɯ',  # ㇺ
    chr(12795): 'ra',  # ㇻ
    chr(12796): 'ri',  # ㇼ
    chr(12797): 'rɯ',  # ㇽ
    chr(12798): 're',  # ㇾ
    chr(12799): 'ro',  # ㇿ
})
assert len(HEPBURN) == 84 * 2 + 4 + 16 and all(
    chr(i) in HEPBURN for i in range(12784, 12800)
)

# Two-kana digraphs (katakana); hiragana versions are derived below.
HEPBURN.update({
    chr(12452) + chr(12455): 'je',  # イェ
    chr(12454) + chr(12451): 'wi',  # ウィ
    chr(12454) + chr(12455): 'we',  # ウェ
    chr(12454) + chr(12457): 'wo',  # ウォ
    chr(12461) + chr(12455): 'kʲe',  # キェ
    chr(12461) + chr(12515): 'kʲa',  # キャ
    chr(12461) + chr(12517): 'kʲɨ',  # キュ
    chr(12461) + chr(12519): 'kʲo',  # キョ
    chr(12462) + chr(12515): 'ɡʲa',  # ギャ
    chr(12462) + chr(12517): 'ɡʲɨ',  # ギュ
    chr(12462) + chr(12519): 'ɡʲo',  # ギョ
    chr(12463) + chr(12449): 'kʷa',  # クァ
    chr(12463) + chr(12451): 'kʷi',  # クィ
    chr(12463) + chr(12455): 'kʷe',  # クェ
    chr(12463) + chr(12457): 'kʷo',  # クォ
    chr(12464) + chr(12449): 'ɡʷa',  # グァ
    chr(12464) + chr(12451): 'ɡʷi',  # グィ
    chr(12464) + chr(12455): 'ɡʷe',  # グェ
    chr(12464) + chr(12457): 'ɡʷo',  # グォ
    chr(12471) + chr(12455): 'ɕe',  # シェ
    chr(12471) + chr(12515): 'ɕa',  # シャ
    chr(12471) + chr(12517): 'ɕɨ',  # シュ
    chr(12471) + chr(12519): 'ɕo',  # ショ
    chr(12472) + chr(12455): 'dʑe',  # ジェ
    chr(12472) + chr(12515): 'dʑa',  # ジャ
    chr(12472) + chr(12517): 'dʑɨ',  # ジュ
    chr(12472) + chr(12519): 'dʑo',  # ジョ
    chr(12481) + chr(12455): 'tɕe',  # チェ
    chr(12481) + chr(12515): 'tɕa',  # チャ
    chr(12481) + chr(12517): 'tɕɨ',  # チュ
    chr(12481) + chr(12519): 'tɕo',  # チョ
    chr(12482) + chr(12515): 'dʑa',  # ヂャ
    chr(12482) + chr(12517): 'dʑɨ',  # ヂュ
    chr(12482) + chr(12519): 'dʑo',  # ヂョ
    chr(12484) + chr(12449): 'tsa',  # ツァ
    chr(12484) + chr(12451): 'tsi',  # ツィ
    chr(12484) + chr(12455): 'tse',  # ツェ
    chr(12484) + chr(12457): 'tso',  # ツォ
    chr(12486) + chr(12451): 'ti',  # ティ
    chr(12486) + chr(12517): 'tʲɨ',  # テュ
    chr(12487) + chr(12451): 'di',  # ディ
    chr(12487) + chr(12517): 'dʲɨ',  # デュ
    chr(12488) + chr(12453): 'tɯ',  # トゥ
    chr(12489) + chr(12453): 'dɯ',  # ドゥ
    chr(12491) + chr(12455): 'ɲe',  # ニェ
    chr(12491) + chr(12515): 'ɲa',  # ニャ
    chr(12491) + chr(12517): 'ɲɨ',  # ニュ
    chr(12491) + chr(12519): 'ɲo',  # ニョ
    chr(12498) + chr(12455): 'çe',  # ヒェ
    chr(12498) + chr(12515): 'ça',  # ヒャ
    chr(12498) + chr(12517): 'çɨ',  # ヒュ
    chr(12498) + chr(12519): 'ço',  # ヒョ
    chr(12499) + chr(12515): 'bʲa',  # ビャ
    chr(12499) + chr(12517): 'bʲɨ',  # ビュ
    chr(12499) + chr(12519): 'bʲo',  # ビョ
    chr(12500) + chr(12515): 'pʲa',  # ピャ
    chr(12500) + chr(12517): 'pʲɨ',  # ピュ
    chr(12500) + chr(12519): 'pʲo',  # ピョ
    chr(12501) + chr(12449): 'ɸa',  # ファ
    chr(12501) + chr(12451): 'ɸi',  # フィ
    chr(12501) + chr(12455): 'ɸe',  # フェ
    chr(12501) + chr(12457): 'ɸo',  # フォ
    chr(12501) + chr(12517): 'ɸʲɨ',  # フュ
    chr(12501) + chr(12519): 'ɸʲo',  # フョ
    chr(12511) + chr(12515): 'mʲa',  # ミャ
    chr(12511) + chr(12517): 'mʲɨ',  # ミュ
    chr(12511) + chr(12519): 'mʲo',  # ミョ
    chr(12522) + chr(12515): 'rʲa',  # リャ
    chr(12522) + chr(12517): 'rʲɨ',  # リュ
    chr(12522) + chr(12519): 'rʲo',  # リョ
    chr(12532) + chr(12449): 'va',  # ヴァ
    chr(12532) + chr(12451): 'vi',  # ヴィ
    chr(12532) + chr(12455): 've',  # ヴェ
    chr(12532) + chr(12457): 'vo',  # ヴォ
    chr(12532) + chr(12517): 'vʲɨ',  # ヴュ
    chr(12532) + chr(12519): 'vʲo',  # ヴョ
})
assert len(HEPBURN) == 84 * 2 + 4 + 16 + 76

# Derive the hiragana digraphs from the katakana ones.
for k, v in list(HEPBURN.items()):
    if len(k) != 2:
        continue
    a, b = k
    assert a in HEPBURN and b in HEPBURN, (a, b)
    a = chr(ord(a) - 96)
    b = chr(ord(b) - 96)
    assert a in HEPBURN and b in HEPBURN, (a, b)
    HEPBURN[a + b] = v
assert len(HEPBURN) == 84 * 2 + 4 + 16 + 76 * 2

HEPBURN.update({
    # symbols
    # 'ー': '-',  # long-vowel mark (chōonpu), only used when repeated
    '。': '.',
    '、': ',',
    '?': '?',
    '!': '!',
    '「': '"',
    '」': '"',
    '『': '"',
    '』': '"',
    ':': ':',
    '(': '(',
    ')': ')',
    '《': '(',
    '》': ')',
    '【': '[',
    '】': ']',
    '・': ' ',  # '/',
    ',': ',',
    '~': '—',
    '〜': '—',
    '—': '—',
    '«': '«',
    '»': '»',

    # other
    '゚': '',  # combining handakuten by itself, just discard
    '゙': '',  # combining dakuten by itself
})

# Unvoiced kana and their voiced (dakuten) counterparts, position by position.
_DAKUTEN_PLAIN = (
    'カキクケコサシスセソタチツテトハヒフヘホ'
    'かきくけこさしすせそたちつてとはひふへほ'
)
_DAKUTEN_VOICED = (
    'ガギグゲゴザジズゼゾダヂヅデドバビブベボ'
    'がぎぐげござじずぜぞだぢづでどばびぶべぼ'
)
_DAKUTEN = dict(zip(_DAKUTEN_PLAIN, _DAKUTEN_VOICED))


def add_dakuten(kk):
    """Given a kana (single-character string), add a dakuten.

    Both katakana and hiragana are supported. Returns the voiced kana, or
    None when the input has no voiced counterpart (this is normal if the
    input is nonsense).
    """
    return _DAKUTEN.get(kk)


SUTEGANA = 'ャュョァィゥェォ'  # 'ゃゅょぁぃぅぇぉ'
PUNCT = '\'".!?(),;:-'
# Iteration marks: 々 (kanji), 〃 (ditto), ゝ/ゞ (hiragana), ヽ/ヾ (katakana).
ODORI = '々〃ゝゞヽヾ'


@dataclass
class Token:
    """One output unit: a romanized surface plus its trailing-space flag."""

    surface: str
    space: bool  # if a space should follow

    def __str__(self):
        sp = " " if self.space else ""
        return f"{self.surface}{sp}"


class Katsu:
    def __init__(self):
        """Create a Katsu object, which holds configuration as well as
        tokenizer state.

        Typical usage:

        ```python
        katsu = Katsu()
        roma = katsu.romaji("カツカレーを食べた")
        ```

        The result is a phonemic transcription built from the HEPBURN table.
        """
        self.tagger = Tagger()
        self.table = dict(HEPBURN)  # make a copy so we can modify it
        # surface form -> replacement string; checked first in _romaji_word
        self.exceptions = {}

    def romaji(self, text):
        """Build a complete string from input text.

        Empty input yields ''. Runs of whitespace in the result are collapsed
        to a single space and the ends are stripped.
        """
        if not text:
            return ''
        text = self._normalize_text(text)
        words = self.tagger(text)
        tokens = self._romaji_tokens(words)
        out = ''.join(map(str, tokens))
        return re.sub(r'\s+', ' ', out.strip())

    def phonemize(self, texts):  # espeak-ng API
        """Convert each string in `texts` with romaji(); returns a list."""
        return [self.romaji(text) for text in texts]

    def _normalize_text(self, text):
        """Given text, normalize variations in Japanese.

        This specifically removes variations that are meaningless for romaji
        conversion using the following steps:

        - A wave dash / tilde directly before a digit becomes から (range)
        - Unicode NFKC normalization
        - Full-width Latin to half-width
        - Half-width katakana to full-width
        - Each run of digits is replaced by the output of num2kana's Convert,
          preceded by a space
        """
        text = re.sub(r'[〜~](?=\d)', 'から', text)  # wave dash range
        # perform unicode normalization
        text = unicodedata.normalize('NFKC', text)
        # convert all full-width alphanum to half-width, since it can go out as-is
        text = mojimoji.zen_to_han(text, kana=False)
        # replace half-width katakana with full-width
        text = mojimoji.han_to_zen(text, digit=False, ascii=False)
        return ''.join(
            (' ' + Convert(t)) if t.isdigit() else t
            for t in re.findall(r'\d+|\D+', text)
        )

    def _romaji_tokens(self, words):
        """Build a list of tokens from input nodes."""
        out = []
        for word in words:
            po = out[-1] if out else None
            roma = self._romaji_word(word)
            tok = Token(roma, False)
            # handle punctuation with atypical spacing
            # NOTE(review): these are substring tests, so an empty `roma`
            # matches the first branch; kept as-is to preserve spacing.
            surface = word.surface
            if surface in '「『' or roma in '([':
                # opening bracket/quote: space before it, none after
                if po:
                    po.space = True
            elif surface in '」』' or roma in ']).,?!:':
                # closing bracket/quote/punctuation: attach to previous token
                if po:
                    po.space = False
                tok.space = True
            elif roma == ' ':
                tok.space = False
            else:
                tok.space = True
            out.append(tok)
        # remove any leftover sokuon; _get_single_mapping can leave either the
        # katakana (ッ) or the hiragana (っ) form when it cannot resolve it
        for tok in out:
            tok.surface = (
                tok.surface.replace(chr(12483), '').replace(chr(12387), '')
            )
        return out

    def _romaji_word(self, word):
        """Return the romaji for a single word (node)."""
        surface = word.surface
        if surface in self.exceptions:
            return self.exceptions[surface]
        # digit runs are expected to have been converted in _normalize_text
        assert not surface.isdigit(), surface
        if surface.isascii():
            return surface
        kana = word.feature.pron or word.feature.kana or surface
        if word.is_unk:
            if word.char_type == 7:  # katakana
                pass
            elif word.char_type == 3:  # symbol
                return ''.join(self.table.get(c, c) for c in surface)
            else:
                return ''  # TODO: silently fail
        last = len(kana) - 1
        parts = []
        for ki, char in enumerate(kana):
            nk = kana[ki + 1] if ki < last else None
            pk = kana[ki - 1] if ki > 0 else None
            parts.append(self._get_single_mapping(pk, char, nk))
        return ''.join(parts)

    def _get_single_mapping(self, pk, kk, nk):
        """Given a single kana and its neighbors, return the mapped romaji.

        `pk` and `nk` are the previous and next characters in the reading, or
        None at the edges. Characters with no mapping yield ''.
        """
        # handle odoriji
        # NOTE: This is very rarely useful at present because odoriji are not
        # left in readings for dictionary words, and we can't follow kana
        # across word boundaries.
        if kk in ODORI:
            if kk in 'ゝヽ':
                # plain repeat: emit the previous kana's romaji again
                # (invalid without a previous kana, but be nice)
                return self.table.get(pk, '') if pk else ''
            if kk in 'ゞヾ':  # repeat with voicing
                if not pk:
                    return ''
                vv = add_dakuten(pk)
                return self.table.get(vv, '') if vv else ''
            # remaining are 々 for kanji and 〃 for symbols, but we can't
            # infer their span reliably (or handle rendaku)
            return ''

        # handle digraphs: the second kana emits the pair, the first emits ''
        if pk and (pk + kk) in self.table:
            return self.table[pk + kk]
        if nk and (kk + nk) in self.table:
            return ''

        if nk and nk in SUTEGANA:
            if kk == 'ッ':
                return ''  # never valid, just ignore
            if kk in self.table:
                # unlisted pair: drop this kana's last character (its vowel)
                # and append the small kana's reading
                return self.table[kk][:-1] + self.table[nk]
            # kk has no table entry (e.g. ン or ー): fall through to the
            # dedicated handlers below instead of raising KeyError
        if kk in SUTEGANA:
            return ''

        if kk == 'ー':  # long-vowel mark
            return 'ː'

        if ord(kk) in {12387, 12483}:  # っ or ッ
            # geminate: repeat the first symbol of the following kana
            tnk = self.table.get(nk)
            if tnk and tnk[0] in 'bdɸɡhçijkmnɲoprstɯvwz':
                return tnk[0]
            # unresolved; stripped later in _romaji_tokens
            return kk

        if ord(kk) in {12435, 12531}:  # ん or ン
            # https://en.wikipedia.org/wiki/N_(kana)
            # m before m,p,b
            # ŋ before k,g
            # ɲ before ɲ,tɕ,dʑ
            # n before n,t,d,r,z
            # ɴ otherwise
            tnk = self.table.get(nk)
            if tnk:
                if tnk[0] in 'mpb':
                    return 'm'
                elif tnk[0] in 'kɡ':
                    return 'ŋ'
                elif any(tnk.startswith(p) for p in ('ɲ', 'tɕ', 'dʑ')):
                    return 'ɲ'
                elif tnk[0] in 'ntdrz':
                    return 'n'
            return 'ɴ'

        return self.table.get(kk, '')