# yomikata/dictionary.py — part of the yomikata-demo repository.
# Author: Sam Passaglia.
"""
dictionary.py
Provides the Dictionary class which implements Reader using dictionary lookup.
"""
import fugashi
import ipadic
import jaconv
import jumandic
from speach import ttlig
from sudachipy import dictionary as sudachidict
from sudachipy import tokenizer as sudachitokenizer
from config.config import ASCII_SPACE_TOKEN
from yomikata import utils
from yomikata.reader import Reader
# Sudachi tokenizer built from the "full" dictionary, using split mode C
# (the coarsest segmentation Sudachi offers).
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
mode = sudachitokenizer.Tokenizer.SplitMode.C

# Map each supported dictionary name to a callable that tokenizes a string.
taggers = {
    "ipadic": fugashi.GenericTagger(ipadic.MECAB_ARGS),
    "juman": fugashi.GenericTagger(jumandic.MECAB_ARGS),
    "unidic": fugashi.Tagger(),
    "sudachi": lambda s: tokenizer_obj.tokenize(s, mode),
}
def _kana_ipadic(word):
    """Hiragana reading from IPAdic feature field 7, falling back to the surface."""
    if len(word.feature) >= 8:
        return jaconv.kata2hira(str(word.feature[7]))
    return jaconv.kata2hira(str(word.surface))


def _kana_juman(word):
    """Reading from Jumandic feature field 5; "*" means no reading recorded."""
    if word.feature[5] != "*":
        return word.feature[5]
    return jaconv.kata2hira(str(word))


def _kana_unidic(word):
    """Hiragana reading from UniDic's kana field, falling back to the token text."""
    kana = word.feature.kana
    if kana == "*" or kana is None:
        return jaconv.kata2hira(str(word))
    return jaconv.kata2hira(str(kana))


def _kana_sudachi(word):
    """Hiragana reading from Sudachi's reading form, standardized first."""
    return jaconv.kata2hira(utils.standardize_text(str(word.reading_form())))


# Map dictionary name -> function extracting a hiragana reading from a token.
token_to_kana = {
    "ipadic": _kana_ipadic,
    "juman": _kana_juman,
    "unidic": _kana_unidic,
    "sudachi": _kana_sudachi,
}
# Map dictionary name -> function extracting the surface form of a token.
# The fugashi-based taggers expose the surface as an attribute;
# Sudachi exposes it as a method.
token_to_surface = {
    name: (lambda word: word.surface) for name in ("ipadic", "juman", "unidic")
}
token_to_surface["sudachi"] = lambda word: word.surface()
def _pos_from_feature(word):
    # IPAdic and Jumandic both put the part of speech in the first feature field.
    return word.feature[0]


# Map dictionary name -> function extracting the top-level part of speech.
token_to_pos = {
    "ipadic": _pos_from_feature,
    "juman": _pos_from_feature,
    "unidic": lambda word: word.feature.pos1,
    "sudachi": lambda word: word.part_of_speech()[0],
}
class Dictionary(Reader):
    """Reader that applies furigana via dictionary lookup with a morphological tokenizer."""

    def __init__(self, tagger: str = "unidic") -> None:
        """Create a Dictionary object to apply furigana using dictionary lookup.

        Object holds configuration and tokenizer state.

        Typical usage:
        ```python
        reader = Dictionary()
        furi = reader.furigana("お前はもう死んでいる")
        # "お{前/まえ}はもう{死/し}んでいる"
        ```

        Args:
            tagger (str, optional): Tokenizing dictionary to be used. Defaults to
                "unidic"; "juman", "ipadic", and "sudachi" are also possible.

        Raises:
            KeyError: If `tagger` is not one of the supported dictionary names.
        """
        self.tagger = taggers[tagger]
        self.token_to_kana = token_to_kana[tagger]
        self.token_to_surface = token_to_surface[tagger]
        self.token_to_pos = token_to_pos[tagger]

    def furigana(self, text: str) -> str:
        """Return `text` with furigana added in {surface/reading} notation.

        Spans of `text` already annotated as {surface/reading} are kept as-is;
        only the unannotated stretches are tokenized and given readings.

        Args:
            text (str): Input sentence, possibly already containing furigana.

        Returns:
            str: The sentence with dictionary readings attached to tokens.
        """
        text = utils.standardize_text(text)
        # Protect ASCII spaces from the tokenizer; restored before returning.
        text = text.replace(" ", ASCII_SPACE_TOKEN)
        rubytoken = utils.parse_furigana(text)
        # Accumulate pieces and join once instead of repeated string +=.
        parts = []
        for group in rubytoken.groups:
            if isinstance(group, ttlig.RubyFrag):
                # Already-annotated fragment: keep its existing reading.
                parts.append(f"{{{group.text}/{group.furi}}}")
            else:
                # Strip stray braces so they cannot corrupt the output markup.
                group = group.replace("{", "").replace("}", "")
                for word in self.tagger(group):
                    kana = self.token_to_kana(word)
                    surface = self.token_to_surface(word)
                    pos = self.token_to_pos(word)
                    # Skip annotation when the reading adds nothing (the token
                    # already equals its kana) or the token is punctuation/symbols.
                    if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                        parts.append(surface)
                    else:
                        parts.append(ttlig.RubyToken.from_furi(surface, kana).to_code())
        output = "".join(parts).replace(ASCII_SPACE_TOKEN, " ")
        return output