import json
import os
import re
from os.path import join as join_path

import gradio as gr

from text_split import split_by_sentences


class RUAccent:
    """Dictionary-based stress placement for Russian text.

    Two JSON dictionaries are read from ``<workdir>/dictionaries``:

    * ``accents.json``   -- maps a lower-cased word to the form that replaces
      it in the output (presumably the same word with the stress marked;
      confirm against the dictionary file).
    * ``omographs.json`` -- maps a lower-cased word to its stress variants.
      Such words are reported to the caller and left untouched, because the
      dictionary alone cannot pick the right variant.
    """

    vowels = "аеёиоуыэюя"

    # A token is either a word (optionally containing "+"-joined parts) or a
    # run of punctuation. The first alternative can match the empty string,
    # so empty matches are filtered out in ``split_by_words``.
    _token_pattern = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

    def __init__(self):
        self.omographs = None
        self.accents = None
        # NOTE: loading of a "yo" dictionary is currently disabled, so this
        # stays None unless a caller assigns a mapping explicitly.
        self.yo_words = None
        self.workdir = os.getcwd()

    def _load_dictionary(self, filename):
        """Read one UTF-8 JSON file from ``<workdir>/dictionaries``."""
        path = join_path(self.workdir, "dictionaries", filename)
        # The context manager guarantees the handle is closed even if the
        # JSON is malformed and ``json.load`` raises.
        with open(path, encoding="utf-8") as dictionary_file:
            return json.load(dictionary_file)

    def load(self, custom_dict=None, custom_homographs=None):
        """Load the dictionaries and overlay optional user-supplied entries.

        Args:
            custom_dict: extra ``word -> replacement`` entries merged over
                ``accents.json`` (they win on key collisions).
            custom_homographs: extra ``word -> variants`` entries merged over
                ``omographs.json`` (they win on key collisions).

        Raises:
            OSError: if a dictionary file cannot be opened.
            json.JSONDecodeError: if a dictionary file is not valid JSON.
        """
        if custom_homographs is None:
            custom_homographs = {}
        if custom_dict is None:
            custom_dict = {}

        self.omographs = self._load_dictionary("omographs.json")
        self.omographs.update(custom_homographs)

        self.accents = self._load_dictionary("accents.json")
        self.accents.update(custom_dict)

    def split_by_words(self, string):
        """Lower-case ``string`` and split it into word and punctuation tokens."""
        tokens = self._token_pattern.findall(string.lower())
        return [token for token in tokens if token]

    def process_all(self, text):
        """Run the whole pipeline over ``text``.

        The text is split into sentences, each sentence is tokenised,
        homographs are collected, and the remaining dictionary words are
        replaced by their entries from the accent dictionary.

        Args:
            text: input string.

        Returns:
            A 3-tuple ``(accented_sentence, omographs_list, unknown_list)``:

            * ``accented_sentence`` -- list with one processed string per
              sentence;
            * ``omographs_list`` -- list of ``"word: variants"`` strings, one
              per homograph occurrence;
            * ``unknown_list`` -- list of words that have more than one vowel
              and were left unchanged by the accent dictionary lookup.
        """
        accented_sentence = []
        found_omographs = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            sentence_omographs = self._process_omographs(words)
            found_omographs.extend(sentence_omographs)

            processed_words, unknown_words = self._process_accent(
                words, sentence_omographs
            )
            unknown_list.extend(unknown_words)

            # Tokens were joined with single spaces; glue punctuation back
            # onto the preceding token.
            accented_sentence.append(
                self.delete_spaces_before_punc(" ".join(processed_words))
            )

        omographs_list = [
            f"{key}: {value}"
            for elem in found_omographs
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Replace tokens using ``self.yo_words``; modifies ``text`` in place.

        Returns ``text`` unchanged when no "yo" dictionary has been set,
        instead of failing on the missing mapping.
        """
        splitted_text = text
        if not self.yo_words:
            return splitted_text
        for i, word in enumerate(splitted_text):
            splitted_text[i] = self.yo_words.get(word, word)
        return splitted_text

    def _process_omographs(self, text):
        """Collect homographs found among the tokens of ``text``.

        Returns:
            A list of single-entry dicts ``{word: variants}``, one per
            occurrence, in token order. The tokens themselves are not changed.
        """
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append({word: variants})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Replace tokens with their accent-dictionary entries.

        ``text`` is modified in place and also returned.

        Args:
            text: list of lower-cased tokens.
            founded_omographs: list of ``{word: variants}`` dicts as produced
                by ``_process_omographs``; these words are left untouched.

        Returns:
            ``(text, unknown_words)`` where ``unknown_words`` lists every
            token with more than one vowel that the dictionary lookup left
            unchanged.
        """
        splitted_text = text
        unknown_words = []
        # Built once: a set gives O(1) membership tests instead of rebuilding
        # and scanning a list for every dictionary hit.
        omograph_words = {
            key for entry in founded_omographs for key in entry
        }

        for i, word in enumerate(splitted_text):
            stressed_word = self.accents.get(word, word)

            if stressed_word == word:
                # A word left unchanged is reported only if it has more than
                # one vowel.
                if sum(1 for char in word if char in self.vowels) > 1:
                    unknown_words.append(word)
                continue

            if word in omograph_words:
                # Ambiguous stress: keep the token as-is; the caller reports
                # it separately.
                continue

            splitted_text[i] = stressed_word

        return splitted_text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the single space that precedes each punctuation character."""
        # "+" and "-" are not in the list, so a space before them is kept.
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# Module-level wiring for the Gradio demo. The dictionaries are loaded at
# import time from the current working directory.
ru_accent = RUAccent()
ru_accent.load()

title = "Демо для модели расстановки ударения на русском языке"
description = "Для расстановки ударения те"

# One output box per element of the tuple returned by ``process_all``.
outputs = [
    gr.Textbox(label="Обработанный текст"),
    gr.Textbox(label="Омографы"),
    gr.Textbox(label="Нет в словаре"),
]

theme = "huggingface"

interface = gr.Interface(
    fn=ru_accent.process_all,
    inputs=gr.Textbox(label="текст для расстановкит ударения"),
    outputs=outputs,
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch(debug=True, share=True)