""" dictionary.py Provides the Dictionary class which implements Reader using dictionary lookup. """ from difflib import ndiff import jaconv from chirptext import deko from speach import ttlig from speach.ttlig import RubyFrag, RubyToken from yomikata import utils from config.config import ASCII_SPACE_TOKEN from yomikata.reader import Reader class Dictionary(Reader): def __init__(self, tagger: str = "unidic") -> None: """Create a Dictionary object to apply furigana using Dictionary lookup Object holds configuration and tokenizer state. Typical usage: ```python reader = Dictionary() furi = Dictionary.furigana("お前はもう死んでいる") # "お{前/まえ}はもう{死/し}んでいる" ``` Args: tagger (str, optional): Tokenizing dictionary to be used。 Defaults to `unidic`. `juman`, `ipadic`, 'sudachi' also possible. """ if tagger == "unidic": import fugashi self.tagger = fugashi.Tagger() self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature.pos1 self.token_to_kana = ( lambda word: jaconv.kata2hira(str(word)) if (word.feature.kana == "*" or word.feature.kana is None) else jaconv.kata2hira(str(word.feature.kana)) ) elif tagger == "ipadic": import fugashi import ipadic self.tagger = fugashi.GenericTagger(ipadic.MECAB_ARGS) self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature[0] self.token_to_kana = ( lambda word: jaconv.kata2hira(str(word.feature[7])) if len(word.feature) >= 8 else jaconv.kata2hira(str(word.surface)) ) elif tagger == "juman": import fugashi import jumandic self.tagger = fugashi.GenericTagger(jumandic.MECAB_ARGS) self.token_to_surface = lambda word: word.surface self.token_to_pos = lambda word: word.feature[0] self.token_to_kana = ( lambda word: word.feature[5] if word.feature[5] != "*" else jaconv.kata2hira(str(word)) ) elif tagger == "sudachi": from sudachipy import dictionary as sudachidict from sudachipy import tokenizer as sudachitokenizer tokenizer_obj = sudachidict.Dictionary(dict="full").create() mode = sudachitokenizer.Tokenizer.SplitMode.C self.tagger = lambda s: tokenizer_obj.tokenize(s, mode) self.token_to_surface = lambda word: word.surface() self.token_to_pos = lambda word: word.part_of_speech()[0] self.token_to_kana = lambda word: jaconv.kata2hira( utils.standardize_text(str(word.reading_form())) ) def furigana(self, text: str) -> str: text = utils.standardize_text(text) text = text.replace(" ", ASCII_SPACE_TOKEN) rubytoken = utils.parse_furigana(text) output = "" for group in rubytoken.groups: if isinstance(group, ttlig.RubyFrag): output += f"{{{group.text}/{group.furi}}}" else: group = group.replace("{", "").replace("}", "") for word in self.tagger(group): kana = self.token_to_kana(word) surface = self.token_to_surface(word) pos = self.token_to_pos(word) if (surface == kana) or pos in ["記号", "補助記号", "特殊"]: output += surface else: output += Dictionary.furi_to_ruby(surface, kana).to_code() output = output.replace(ASCII_SPACE_TOKEN, " ") return output @staticmethod def furi_to_ruby(surface, kana): """Combine a surface string and a kana string to a RubyToken object with furigana. 
    @staticmethod
    def furi_to_ruby(surface, kana):
        """Combine a surface string and a kana string into a RubyToken with furigana.

        Args:
            surface (str): Surface string
            kana (str): Kana string

        Returns:
            RubyToken: RubyToken object with furigana

        This code is modified from the version in the speach library:
        https://github.com/neocl/speach/
        https://github.com/neocl/speach/blob/main/speach/ttlig.py
        :copyright: (c) 2018 Le Tuan Anh
        :license: MIT
        """

        def common_substring_from_right(string1, string2):
            i = -1  # start from the end of the strings
            while -i <= min(len(string1), len(string2)):
                if string1[i] != string2[i]:
                    # characters don't match, so stop
                    break
                i -= 1  # decrement i to move towards the start
            return string1[i + 1 :] if i != -1 else ""  # return the common suffix

        def assert_rubytoken_kana_match(ruby: RubyToken, kana: str) -> None:
            # The furi/plain parts of the token must concatenate back to the kana.
            assert (
                "".join(
                    [token.furi if isinstance(token, RubyFrag) else token for token in ruby.groups]
                )
                == kana
            )

        original_kana = kana

        # Split off the common suffix (typically okurigana) so it stays plain.
        final_text = common_substring_from_right(surface, kana)
        if final_text:
            surface = surface[: -len(final_text)]
            kana = kana[: -len(final_text)]

        ruby = RubyToken(surface=surface)

        if deko.is_kana(surface):
            # Pure-kana surface: no ruby fragments are needed.
            ruby.append(surface)
            if final_text:
                ruby.append(final_text)
            assert_rubytoken_kana_match(ruby, original_kana)
            return ruby

        # Walk the character diff, accumulating removed characters (kanji),
        # added characters (furi), and common characters (plain text).
        edit_seq = ndiff(surface, kana)
        kanji = ""
        text = ""
        furi = ""
        before = ""
        expected = ""
        for item in edit_seq:
            if item.startswith("- "):
                # flush text if needed
                if expected and kanji and furi:
                    ruby.append(RubyFrag(text=kanji, furi=furi))
                    kanji = ""
                    furi = ""
                if text:
                    ruby.append(text)
                    text = ""
                kanji += item[2:]
            elif item.startswith("+ "):
                if expected and item[2:] == expected:
                    if expected and kanji and furi:
                        ruby.append(RubyFrag(text=kanji, furi=furi))
                        kanji = ""
                        furi = ""
                    ruby.append(item[2:])
                    expected = ""
                else:
                    furi += item[2:]
            elif item.startswith("  "):
                if before == "-" and not furi:
                    # shifting happened: the common character actually belongs
                    # to the reading of the pending kanji
                    expected = item[2:]
                    furi += item[2:]
                else:
                    text += item[2:]
                    # flush if possible
                    if kanji and furi:
                        ruby.append(RubyFrag(text=kanji, furi=furi))
                        kanji = ""
                        furi = ""
                    else:
                        # possible error?
                        pass
            before = item[0]
        # end for

        if kanji:
            if furi:
                ruby.append(RubyFrag(text=kanji, furi=furi))
            else:
                ruby.append(kanji)
        elif text:
            ruby.append(text)

        if final_text:
            ruby.append(final_text)

        assert_rubytoken_kana_match(ruby, original_kana)

        return ruby
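
if __name__ == "__main__":
    # Minimal usage sketch; assumes `fugashi` with a UniDic dictionary is
    # installed and that the `yomikata`/`config` imports above resolve.
    reader = Dictionary()
    print(reader.furigana("お前はもう死んでいる"))
    # expected, per the class docstring: お{前/まえ}はもう{死/し}んでいる
    print(Dictionary.furi_to_ruby("言い換える", "いいかえる").to_code())
    # expected (hand-traced, see the comment above furi_to_ruby): {言/い}い{換/か}える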