Spaces:

Shakhovak
/

RU_ACCENT

Sleeping

App Files Community

shakhovak committed on Dec 18, 2023

Commit

25e5c2b

1 Parent(s): d14b8fb

new dict

Browse files

Files changed (4) hide show

app.py +48 -16
dictionaries/file_norm.json +3 -0
dictionaries/file_omo.json +3 -0
requirements.txt +0 -10

app.py CHANGED Viewed

@@ -15,18 +15,27 @@ class RUAccent:
         self.workdir = os.getcwd()
     def load(self, custom_accent=None, custom_omographs=None):
         if custom_omographs is None:
             custom_omographs = {}
         if custom_accent is None:
             custom_accent = {}
-        self.omographs = json.load(open(join_path(self.workdir, "dictionaries", "file_omo.json"), encoding='utf-8'))
         self.omographs.update(custom_omographs)
-        self.accents = json.load(open(join_path(self.workdir, "dictionaries", "file_norm.json"), encoding='utf-8'))
         self.accents.update(custom_accent)
@@ -59,7 +68,9 @@ class RUAccent:
             founded_omographs = self._process_omographs(text)
             omographs_list.extend(founded_omographs)
-            processed_text, unknown_words = self._process_accent(text, founded_omographs)
             unknown_list.extend(unknown_words)
             processed_text = " ".join(processed_text)
@@ -67,7 +78,9 @@ class RUAccent:
             accented_sentence.append(processed_text)
-        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
         return accented_sentence, list(set(omographs_list)), list(set(unknown_list))
     def _process_yo(self, text):
@@ -84,9 +97,7 @@ class RUAccent:
         for i, word in enumerate(splitted_text):
             variants = self.omographs.get(word)
             if variants:
-                founded_omographs.append(
-                    {word: self.omographs[word]["acc_variants"]}
-                )
         return founded_omographs
@@ -115,11 +126,32 @@ class RUAccent:
             text = text.replace(" " + char, char)
         return text
-# # Example usage:
-# ru_accent = RUAccent()
-# ru_accent.load()
-#
-# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
-# processed_text = ru_accent.process_all(text_to_process)
-#
-# print(processed_text)

         self.workdir = os.getcwd()
     def load(self, custom_accent=None, custom_omographs=None):
         if custom_omographs is None:
             custom_omographs = {}
         if custom_accent is None:
             custom_accent = {}
+        self.omographs = json.load(
+            open(
+                join_path(self.workdir, "dictionaries", "file_omo.json"),
+                encoding="utf-8",
+            )
+        )
         self.omographs.update(custom_omographs)
+        self.accents = json.load(
+            open(
+                join_path(self.workdir, "dictionaries", "file_norm.json"),
+                encoding="utf-8",
+            )
+        )
         self.accents.update(custom_accent)
             founded_omographs = self._process_omographs(text)
             omographs_list.extend(founded_omographs)
+            processed_text, unknown_words = self._process_accent(
+                text, founded_omographs
+            )
             unknown_list.extend(unknown_words)
             processed_text = " ".join(processed_text)
             accented_sentence.append(processed_text)
+        omographs_list = [
+            f"{key}: {value}" for elem in omographs_list for key, value in elem.items()
+        ]
         return accented_sentence, list(set(omographs_list)), list(set(unknown_list))
     def _process_yo(self, text):
         for i, word in enumerate(splitted_text):
             variants = self.omographs.get(word)
             if variants:
+                founded_omographs.append({word: self.omographs[word]["acc_variants"]})
         return founded_omographs
             text = text.replace(" " + char, char)
         return text
+ru_accent = RUAccent()
+ru_accent.load()
+title = "Демо для модели расстановки ударения на русском языке"
+description = "Для расстановки ударения необходимо ввести текст в поле ниже. Алгоритм обработает текст и выдаст текст с ударениями, а также 2 списка: омографы, если они есть в тексте и слов, не найденных в словаре."
+examples = ["Я иду в замок повесить замок."]
+outputs = [
+    gr.Textbox(label="Обработанный текст"),
+    gr.Textbox(label="Омографы"),
+    gr.Textbox(label="Нет в словаре"),
+]
+theme = "huggingface"
+interface = gr.Interface(
+    fn=ru_accent.process_all,
+    inputs=gr.Textbox(label="текст для расстановки ударения"),
+    outputs=outputs,
+    examples=examples,
+    title=title,
+    description=description,
+)
+if __name__ == "__main__":
+    interface.launch(debug=True, share=True)

dictionaries/file_norm.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ebf4187d80e9702f94253d81a48fa3a14d484e2befaeb939fdca99eb6c42f1d5
+size 178087540

dictionaries/file_omo.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba98b20c885cee2f54da731bb068df53fa6960bd3c8ef36417d8f6ffc90acbff
+size 4240115

requirements.txt DELETED Viewed

@@ -1,10 +0,0 @@
-blinker==1.7.0
-click==8.1.7
-colorama==0.4.6
-Flask==3.0.0
-importlib-metadata==7.0.0
-itsdangerous==2.1.2
-Jinja2==3.1.2
-MarkupSafe==2.1.3
-Werkzeug==3.0.1
-zipp==3.17.0