Spaces:

Shakhovak
/

RU_ACCENT

Running

App Files Files Community

RU_ACCENT / app.py

Shakhovak

Create app.py

17bc303 10 months ago

raw

history blame

No virus

5.34 kB

	import json
	import os
	import re
	from os.path import join as join_path
	import gradio as gr

	from text_split import split_by_sentences


	class RUAccent:
	    """Dictionary-based accent placement for Russian text.

	    Two JSON dictionaries are read by :meth:`load` from
	    ``<workdir>/dictionaries``:

	    * ``accents.json`` -- maps a lower-cased word to the spelling that
	      replaces it in the output.
	    * ``omographs.json`` -- maps a lower-cased word to its variants; such
	      words are reported to the caller and left unchanged in the text.

	    NOTE(review): a ``+`` inside a token is presumably the stress marker used
	    by the dictionaries -- confirm against the dictionary files.
	    """

	    # Lower-case Russian vowels, used to count the vowels of a word.
	    vowels = "аеёиоуыэюя"

	    # Characters that must not be preceded by a space once tokens are joined
	    # back into a sentence ("+" and "-" are not part of this set).
	    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"

	    # A token is either a word (optionally carrying "+"-prefixed parts, e.g.
	    # "з+амок" or "+он") or a run of punctuation. No alternative can match
	    # the empty string, so the result needs no post-filtering.
	    _TOKEN_RE = re.compile(r"\w+(?:\+\w+)*|(?:\+\w+)+|[^\w\s]+")

	    def __init__(self):
	        self.omographs = None
	        self.accents = None
	        # Not loaded by load(); _process_yo treats None as "no replacements".
	        self.yo_words = None
	        self.workdir = os.getcwd()

	    @staticmethod
	    def _read_json(path):
	        """Return the parsed content of the UTF-8 JSON file at *path*."""
	        with open(path, encoding="utf-8") as handle:
	            return json.load(handle)

	    def load(self, custom_dict=None, custom_homographs=None):
	        """Load the dictionaries and merge user-supplied overrides.

	        Args:
	            custom_dict: optional mapping merged into ``self.accents`` after
	                ``accents.json`` is read (its entries win on conflict).
	            custom_homographs: optional mapping merged into
	                ``self.omographs`` after ``omographs.json`` is read.

	        Raises:
	            OSError: if a dictionary file cannot be opened.
	            json.JSONDecodeError: if a dictionary file is not valid JSON.
	        """
	        dictionaries_dir = join_path(self.workdir, "dictionaries")

	        self.omographs = self._read_json(
	            join_path(dictionaries_dir, "omographs.json")
	        )
	        if custom_homographs is not None:
	            self.omographs.update(custom_homographs)

	        self.accents = self._read_json(
	            join_path(dictionaries_dir, "accents.json")
	        )
	        if custom_dict is not None:
	            self.accents.update(custom_dict)

	    def split_by_words(self, string):
	        """Lower-case *string* and split it into word and punctuation tokens.

	        Whitespace is dropped; consecutive punctuation characters form a
	        single token.

	        Returns:
	            list[str]: the tokens in their original order.
	        """
	        return self._TOKEN_RE.findall(string.lower())

	    def process_all(self, text):
	        """Run the whole pipeline on *text*.

	        The text is split into sentences with ``split_by_sentences``; each
	        sentence is tokenised, checked for omographs, run through the accent
	        dictionary and joined back into a string.

	        Args:
	            text: the input string.

	        Returns:
	            tuple: ``(accented_sentence, omographs_list, unknown_list)`` where
	            ``accented_sentence`` is a list with one processed string per
	            sentence, ``omographs_list`` is a list of ``"word: variants"``
	            strings and ``unknown_list`` is a list of words reported by
	            :meth:`_process_accent` as missing from the dictionary.
	        """
	        accented_sentence = []
	        found_omographs = []
	        unknown_list = []

	        for sentence in split_by_sentences(text):
	            words = self.split_by_words(sentence)

	            sentence_omographs = self._process_omographs(words)
	            found_omographs.extend(sentence_omographs)

	            stressed_words, unknown_words = self._process_accent(
	                words, sentence_omographs
	            )
	            unknown_list.extend(unknown_words)

	            joined = " ".join(stressed_words)
	            accented_sentence.append(self.delete_spaces_before_punc(joined))

	        omographs_list = [
	            f"{key}: {value}"
	            for elem in found_omographs
	            for key, value in elem.items()
	        ]
	        return accented_sentence, omographs_list, unknown_list

	    def _process_yo(self, text):
	        """Replace tokens of *text* in place using ``self.yo_words``.

	        When no ``yo_words`` mapping is available the list is returned
	        unchanged.

	        Returns:
	            list[str]: the same list object that was passed in.
	        """
	        yo_words = getattr(self, "yo_words", None)
	        if not yo_words:
	            return text

	        for i, word in enumerate(text):
	            text[i] = yo_words.get(word, word)
	        return text

	    def _process_omographs(self, text):
	        """Collect the tokens of *text* that have an entry in ``omographs``.

	        Returns:
	            list[dict]: one single-key ``{word: variants}`` dict per
	            occurrence, in token order. Entries with empty variants are
	            skipped.
	        """
	        found = []
	        for word in text:
	            variants = self.omographs.get(word)
	            if variants:
	                found.append({word: variants})
	        return found

	    def _process_accent(self, text, founded_omographs):
	        """Replace tokens of *text* with their ``accents`` dictionary value.

	        A token is left untouched when it has no dictionary entry or when it
	        is one of the omographs found for the sentence. Tokens without an
	        entry that contain more than one vowel are reported as unknown.

	        Args:
	            text: list of tokens; modified in place.
	            founded_omographs: list of ``{word: variants}`` dicts as returned
	                by :meth:`_process_omographs`.

	        Returns:
	            tuple: ``(text, unknown_words)`` -- the same list object and the
	            list of unknown tokens.
	        """
	        # Built once per call instead of once per token; only the first key of
	        # each entry is considered, matching the single-key entry shape.
	        omograph_words = {
	            next(iter(entry)) for entry in founded_omographs if entry
	        }

	        unknown_words = []
	        for i, word in enumerate(text):
	            stressed_word = self.accents.get(word, word)
	            if stressed_word == word:
	                vowel_count = sum(word.count(vowel) for vowel in self.vowels)
	                if vowel_count > 1:
	                    unknown_words.append(word)
	            elif word not in omograph_words:
	                text[i] = stressed_word
	        return text, unknown_words

	    def delete_spaces_before_punc(self, text):
	        """Remove the single space that precedes each punctuation character."""
	        for char in self._PUNCTUATION:
	            text = text.replace(" " + char, char)
	        return text


	# Build the accent engine once at import time and read its dictionaries
	# from the "dictionaries" folder under the current working directory.
	ru_accent = RUAccent()
	ru_accent.load()

	title = "Демо для модели расстановки ударения на русском языке"


	description = "Для расстановки ударения те"

	# One output box per value returned by RUAccent.process_all, in order:
	# processed text, omographs, words missing from the dictionary.
	outputs = [
	    gr.Textbox(label="Обработанный текст"),
	    gr.Textbox(label="Омографы"),
	    gr.Textbox(label="Нет в словаре"),
	]

	# NOTE(review): `theme` is assigned but never passed to gr.Interface below.
	theme = "huggingface"

	interface = gr.Interface(
	    fn=ru_accent.process_all,
	    inputs=gr.Textbox(label="текст для расстановкит ударения"),
	    outputs=outputs,
	    title=title,
	    description=description,
	)

	# Start the web UI only when the file is executed as a script.
	if __name__ == "__main__":
	    interface.launch(debug=True, share=True)