Spaces:
Runtime error
Runtime error
Commit
•
33cb8c0
0
Parent(s):
Initial commit
Browse files- .gitattributes +27 -0
- README.md +11 -0
- app.py +80 -0
- requirements.txt +2 -0
- vb_stanza_no_compound_no_deriv.pt +3 -0
- vb_stanza_symbols.pt +3 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
19 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Lexicon-enhanced lemmatization for Estonian
|
3 |
+
colorFrom: black
|
4 |
+
colorTo: blue
|
5 |
+
sdk: gradio
|
6 |
+
sdk_version: 2.9.0
|
7 |
+
app_file: app.py
|
8 |
+
python_version: 3.7
|
9 |
+
pinned: false
|
10 |
+
license: afl-3.0
|
11 |
+
---
|
app.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
from lexenlem.preprocessing.adhoc import AdHocLemmatizer
|
5 |
+
|
6 |
+
models: Dict[str, AdHocLemmatizer] = {
|
7 |
+
"Lemmatize": AdHocLemmatizer(path="vb_stanza_no_compound_no_deriv.pt", use_stanza=True),
|
8 |
+
"Lemmatize with special symbols": AdHocLemmatizer(
|
9 |
+
path="vb_stanza_symbols.pt", use_stanza=True, allow_compound_separator=True, allow_derivation_sign=True
|
10 |
+
)
|
11 |
+
}
|
12 |
+
|
13 |
+
|
14 |
+
def predict(text: str, model_name: str) -> List[str]:
|
15 |
+
if model_name not in models:
|
16 |
+
raise RuntimeError("Unknown model")
|
17 |
+
return models[model_name](text)
|
18 |
+
|
19 |
+
|
20 |
+
gradio_ui = gr.Interface(
|
21 |
+
fn=predict,
|
22 |
+
title="Lexicon-enhanced lemmatization for Estonian",
|
23 |
+
description="The purpose of this demo is to demonstrate the results of"
|
24 |
+
" Lexicon-Enhanced Neural Lemmatization for Estonian developed by TartuNLP research group."
|
25 |
+
" The idea is to utilize the input of an external resource"
|
26 |
+
" (a `lexicon` — Vabamorf morphological analyzer in this particular case)"
|
27 |
+
" as an additional input to improve the results of a neural lemmatizer model. Said additional input"
|
28 |
+
" is a concatenation of one or more lemma candidates provided by Vabamorf. Morphological features and"
|
29 |
+
" the part of speech are provided by Stanza in this demo, although it's possible to use native Vabamorf"
|
30 |
+
" features as well (the results, however, are going to be slightly worse).\n\n"
|
31 |
+
" The lexicon-enhanced lemmatizer itself is based on an older version of Stanza. The models were"
|
32 |
+
" trained on the Estonian Dependency Treebank version 2.7.\n\n"
|
33 |
+
" Two variants of lemmatization are provided in the demo: regular lemmatization and lemmatization with"
|
34 |
+
" special symbols, which are `=` and `_`, denoting morphological derivation and separating parts of"
|
35 |
+
" compound words respectively. The latter was trained on the original data with Vabamorf set to output"
|
36 |
+
" these special symbols, while the former was trained with `=` and `_` removed from the data and"
|
37 |
+
" Vabamorf output.",
|
38 |
+
inputs=[
|
39 |
+
gr.inputs.Textbox(lines=7, label="Input text in the box below", placeholder="Text to lemmatize"),
|
40 |
+
gr.inputs.Radio(list(models.keys()), label="Lemmatization type")
|
41 |
+
],
|
42 |
+
outputs=[
|
43 |
+
gr.outputs.Textbox()
|
44 |
+
],
|
45 |
+
examples=[
|
46 |
+
[
|
47 |
+
"Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
|
48 |
+
"Lemmatize"
|
49 |
+
],
|
50 |
+
[
|
51 |
+
"Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
|
52 |
+
"Lemmatize with special symbols"
|
53 |
+
],
|
54 |
+
[
|
55 |
+
"Kõik uuritavad võeti vastu TÜ üld- ja molekulaarpatoloogia instituudis inimesegeneetika uurimisrühmas.",
|
56 |
+
"Lemmatize with special symbols"
|
57 |
+
],
|
58 |
+
[
|
59 |
+
"Peamiselt viimasele toetub ka järgnev artikkel.",
|
60 |
+
"Lemmatize"
|
61 |
+
],
|
62 |
+
[
|
63 |
+
"Arutletakse selle üle, mida ülearuse rahaga peale hakata.",
|
64 |
+
"Lemmatize"
|
65 |
+
],
|
66 |
+
[
|
67 |
+
"Väikesele poisile tuuakse apteegist söögiisu tõstmiseks kalamaksaõli.",
|
68 |
+
"Lemmatize"
|
69 |
+
],
|
70 |
+
[
|
71 |
+
"Tulevased beebid olid justkui peegeldusena pilgu beebisinas ja veel mingi ähmane lubadus.",
|
72 |
+
"Lemmatize"
|
73 |
+
],
|
74 |
+
],
|
75 |
+
allow_screenshot=False,
|
76 |
+
allow_flagging="never",
|
77 |
+
)
|
78 |
+
|
79 |
+
|
80 |
+
gradio_ui.launch(debug=False, enable_queue=True)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
git+https://github.com/slowwavesleep/lexicon-enhanced-lemmatization.git
|
2 |
+
gradio==2.9.0
|
vb_stanza_no_compound_no_deriv.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10317eb285db2e93c73f8a6c4484492db401d17d5600692e24845d88f24f29b9
|
3 |
+
size 3606526
|
vb_stanza_symbols.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0dd6ada415cf8e89034a4a3710f4ba069e982b6af239560310007bdd1ad69466
|
3 |
+
size 3606654
|