# yomikata/dictionary.py — part of the yomikata-demo repository.
# Author: Sam Passaglia.
"""
dictionary.py
Provides the Dictionary class which implements Reader using dictionary lookup.
"""
import fugashi
import ipadic
import jaconv
import jumandic
from speach import ttlig
from sudachipy import dictionary as sudachidict
from sudachipy import tokenizer as sudachitokenizer
from config.config import ASCII_SPACE_TOKEN
from yomikata import utils
from yomikata.reader import Reader
# Sudachi tokenizer built from the "full" dictionary, using split mode C
# (the coarsest segmentation Sudachi offers).
tokenizer_obj = sudachidict.Dictionary(dict="full").create()
mode = sudachitokenizer.Tokenizer.SplitMode.C

# Map each supported dictionary name to a callable that tokenizes a string.
taggers = {
    "ipadic": fugashi.GenericTagger(ipadic.MECAB_ARGS),
    "juman": fugashi.GenericTagger(jumandic.MECAB_ARGS),
    "unidic": fugashi.Tagger(),
    "sudachi": lambda s: tokenizer_obj.tokenize(s, mode),
}
def _kana_ipadic(word):
    """Hiragana reading from IPAdic feature field 7, falling back to the surface."""
    if len(word.feature) >= 8:
        return jaconv.kata2hira(str(word.feature[7]))
    return jaconv.kata2hira(str(word.surface))


def _kana_juman(word):
    """Reading from Jumandic feature field 5; "*" means no reading recorded."""
    if word.feature[5] != "*":
        return word.feature[5]
    return jaconv.kata2hira(str(word))


def _kana_unidic(word):
    """Hiragana reading from UniDic's kana field, falling back to the token text."""
    kana = word.feature.kana
    if kana == "*" or kana is None:
        return jaconv.kata2hira(str(word))
    return jaconv.kata2hira(str(kana))


def _kana_sudachi(word):
    """Hiragana reading from Sudachi's reading form, standardized first."""
    return jaconv.kata2hira(utils.standardize_text(str(word.reading_form())))


# Map dictionary name -> function extracting a hiragana reading from a token.
token_to_kana = {
    "ipadic": _kana_ipadic,
    "juman": _kana_juman,
    "unidic": _kana_unidic,
    "sudachi": _kana_sudachi,
}
# Map dictionary name -> function extracting the surface form of a token.
# The fugashi-based taggers expose the surface as an attribute;
# Sudachi exposes it as a method.
token_to_surface = {
    name: (lambda word: word.surface) for name in ("ipadic", "juman", "unidic")
}
token_to_surface["sudachi"] = lambda word: word.surface()
def _pos_from_feature(word):
    # IPAdic and Jumandic both put the part of speech in the first feature field.
    return word.feature[0]


# Map dictionary name -> function extracting the top-level part of speech.
token_to_pos = {
    "ipadic": _pos_from_feature,
    "juman": _pos_from_feature,
    "unidic": lambda word: word.feature.pos1,
    "sudachi": lambda word: word.part_of_speech()[0],
}
class Dictionary(Reader):
    """Reader that applies furigana via dictionary lookup with a morphological tokenizer."""

    def __init__(self, tagger: str = "unidic") -> None:
        """Create a Dictionary object to apply furigana using dictionary lookup.

        Object holds configuration and tokenizer state.

        Typical usage:
        ```python
        reader = Dictionary()
        furi = reader.furigana("お前はもう死んでいる")
        # "お{前/まえ}はもう{死/し}んでいる"
        ```

        Args:
            tagger (str, optional): Tokenizing dictionary to be used. Defaults to
                "unidic"; "juman", "ipadic", and "sudachi" are also possible.

        Raises:
            KeyError: If `tagger` is not one of the supported dictionary names.
        """
        self.tagger = taggers[tagger]
        self.token_to_kana = token_to_kana[tagger]
        self.token_to_surface = token_to_surface[tagger]
        self.token_to_pos = token_to_pos[tagger]

    def furigana(self, text: str) -> str:
        """Return `text` with furigana added in {surface/reading} notation.

        Spans of `text` already annotated as {surface/reading} are kept as-is;
        only the unannotated stretches are tokenized and given readings.

        Args:
            text (str): Input sentence, possibly already containing furigana.

        Returns:
            str: The sentence with dictionary readings attached to tokens.
        """
        text = utils.standardize_text(text)
        # Protect ASCII spaces from the tokenizer; restored before returning.
        text = text.replace(" ", ASCII_SPACE_TOKEN)
        rubytoken = utils.parse_furigana(text)
        # Accumulate pieces and join once instead of repeated string +=.
        parts = []
        for group in rubytoken.groups:
            if isinstance(group, ttlig.RubyFrag):
                # Already-annotated fragment: keep its existing reading.
                parts.append(f"{{{group.text}/{group.furi}}}")
            else:
                # Strip stray braces so they cannot corrupt the output markup.
                group = group.replace("{", "").replace("}", "")
                for word in self.tagger(group):
                    kana = self.token_to_kana(word)
                    surface = self.token_to_surface(word)
                    pos = self.token_to_pos(word)
                    # Skip annotation when the reading adds nothing (the token
                    # already equals its kana) or the token is punctuation/symbols.
                    if (surface == kana) or pos in ["記号", "補助記号", "特殊"]:
                        parts.append(surface)
                    else:
                        parts.append(ttlig.RubyToken.from_furi(surface, kana).to_code())
        output = "".join(parts).replace(ASCII_SPACE_TOKEN, " ")
        return output