adorkin commited on
Commit
33cb8c0
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Lexicon-enhanced lemmatization for Estonian
3
+ colorFrom: black
4
+ colorTo: blue
5
+ sdk: gradio
6
+ sdk_version: 2.9.0
7
+ app_file: app.py
8
+ python_version: 3.7
9
+ pinned: false
10
+ license: afl-3.0
11
+ ---
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import gradio as gr
4
+ from lexenlem.preprocessing.adhoc import AdHocLemmatizer
5
+
6
# Lemmatizer variants available in the demo, keyed by the label shown in the
# UI radio control. Both use Stanza-provided POS/morphological features; the
# second keeps Vabamorf's compound separator and derivation sign in output.
models: Dict[str, AdHocLemmatizer] = {}
models["Lemmatize"] = AdHocLemmatizer(
    path="vb_stanza_no_compound_no_deriv.pt",
    use_stanza=True,
)
models["Lemmatize with special symbols"] = AdHocLemmatizer(
    path="vb_stanza_symbols.pt",
    use_stanza=True,
    allow_compound_separator=True,
    allow_derivation_sign=True,
)
12
+
13
+
14
def predict(text: str, model_name: str) -> List[str]:
    """Lemmatize *text* with the lemmatizer selected by *model_name*.

    Raises RuntimeError if *model_name* is not one of the configured models.
    """
    lemmatizer = models.get(model_name)
    if lemmatizer is None:
        raise RuntimeError("Unknown model")
    return lemmatizer(text)
18
+
19
+
20
# Demo UI definition: wires predict() to a free-text input and a radio
# selector for the two lemmatization variants.
# Fix: the description previously said "the latter" twice; the model trained
# with `=`/`_` removed is the *former* (regular) variant.
gradio_ui = gr.Interface(
    fn=predict,
    title="Lexicon-enhanced lemmatization for Estonian",
    description="The purpose of this demo is to demonstrate the results of"
    " Lexicon-Enhanced Neural Lemmatization for Estonian developed by TartuNLP research group."
    " The idea is to utilize the input of an external resource"
    " (a `lexicon` — Vabamorf morphological analyzer in this particular case)"
    " as an additional input to improve the results of a neural lemmatizer model. Said additional input"
    " is a concatenation of one or more lemma candidates provided by Vabamorf. Morphological features and"
    " the part of speech are provided by Stanza in this demo, although it's possible to use native Vabamorf"
    " features as well (the results, however, are going to be slightly worse).\n\n"
    " The lexicon-enhanced lemmatizer itself is based on an older version of Stanza. The models were"
    " trained on the Estonian Dependency Treebank version 2.7.\n\n"
    " Two variants of lemmatization are provided in the demo: regular lemmatization and lemmatization with"
    " special symbols, which are `=` and `_`, denoting morphological derivation and separating parts of"
    " compound words respectively. The latter was trained on the original data with Vabamorf set to output"
    " these special symbols, while the former was trained with `=` and `_` removed from the data and"
    " Vabamorf output.",
    inputs=[
        gr.inputs.Textbox(lines=7, label="Input text in the box below", placeholder="Text to lemmatize"),
        gr.inputs.Radio(list(models.keys()), label="Lemmatization type")
    ],
    outputs=[
        gr.outputs.Textbox()
    ],
    examples=[
        [
            "Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
            "Lemmatize"
        ],
        [
            "Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
            "Lemmatize with special symbols"
        ],
        [
            "Kõik uuritavad võeti vastu TÜ üld- ja molekulaarpatoloogia instituudis inimesegeneetika uurimisrühmas.",
            "Lemmatize with special symbols"
        ],
        [
            "Peamiselt viimasele toetub ka järgnev artikkel.",
            "Lemmatize"
        ],
        [
            "Arutletakse selle üle, mida ülearuse rahaga peale hakata.",
            "Lemmatize"
        ],
        [
            "Väikesele poisile tuuakse apteegist söögiisu tõstmiseks kalamaksaõli.",
            "Lemmatize"
        ],
        [
            "Tulevased beebid olid justkui peegeldusena pilgu beebisinas ja veel mingi ähmane lubadus.",
            "Lemmatize"
        ],
    ],
    allow_screenshot=False,
    allow_flagging="never",
)
78
+
79
+
80
# Start the demo server; the request queue serializes concurrent predictions.
gradio_ui.launch(enable_queue=True, debug=False)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/slowwavesleep/lexicon-enhanced-lemmatization.git
2
+ gradio==2.9.0
vb_stanza_no_compound_no_deriv.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10317eb285db2e93c73f8a6c4484492db401d17d5600692e24845d88f24f29b9
3
+ size 3606526
vb_stanza_symbols.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd6ada415cf8e89034a4a3710f4ba069e982b6af239560310007bdd1ad69466
3
+ size 3606654