adorkin commited on
Commit
33cb8c0
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Lexicon-enhanced lemmatization for Estonian
3
+ colorFrom: black
4
+ colorTo: blue
5
+ sdk: gradio
6
+ sdk_version: 2.9.0
7
+ app_file: app.py
8
+ python_version: 3.7
9
+ pinned: false
10
+ license: afl-3.0
11
+ ---
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import gradio as gr
4
+ from lexenlem.preprocessing.adhoc import AdHocLemmatizer
5
+
6
# Lemmatizer variants available in the demo, keyed by the label shown in the
# UI radio control. Both use Stanza-provided POS/morphological features; the
# second keeps Vabamorf's compound separator and derivation sign in output.
models: Dict[str, AdHocLemmatizer] = {}
models["Lemmatize"] = AdHocLemmatizer(
    path="vb_stanza_no_compound_no_deriv.pt",
    use_stanza=True,
)
models["Lemmatize with special symbols"] = AdHocLemmatizer(
    path="vb_stanza_symbols.pt",
    use_stanza=True,
    allow_compound_separator=True,
    allow_derivation_sign=True,
)
12
+
13
+
14
def predict(text: str, model_name: str) -> List[str]:
    """Lemmatize *text* with the lemmatizer selected by *model_name*.

    Raises RuntimeError if *model_name* is not one of the configured models.
    """
    lemmatizer = models.get(model_name)
    if lemmatizer is None:
        raise RuntimeError("Unknown model")
    return lemmatizer(text)
18
+
19
+
20
# Demo UI definition: wires predict() to a free-text input and a radio
# selector for the two lemmatization variants.
# Fix: the description previously said "the latter" twice; the model trained
# with `=`/`_` removed is the *former* (regular) variant.
gradio_ui = gr.Interface(
    fn=predict,
    title="Lexicon-enhanced lemmatization for Estonian",
    description="The purpose of this demo is to demonstrate the results of"
    " Lexicon-Enhanced Neural Lemmatization for Estonian developed by TartuNLP research group."
    " The idea is to utilize the input of an external resource"
    " (a `lexicon` — Vabamorf morphological analyzer in this particular case)"
    " as an additional input to improve the results of a neural lemmatizer model. Said additional input"
    " is a concatenation of one or more lemma candidates provided by Vabamorf. Morphological features and"
    " the part of speech are provided by Stanza in this demo, although it's possible to use native Vabamorf"
    " features as well (the results, however, are going to be slightly worse).\n\n"
    " The lexicon-enhanced lemmatizer itself is based on an older version of Stanza. The models were"
    " trained on the Estonian Dependency Treebank version 2.7.\n\n"
    " Two variants of lemmatization are provided in the demo: regular lemmatization and lemmatization with"
    " special symbols, which are `=` and `_`, denoting morphological derivation and separating parts of"
    " compound words respectively. The latter was trained on the original data with Vabamorf set to output"
    " these special symbols, while the former was trained with `=` and `_` removed from the data and"
    " Vabamorf output.",
    inputs=[
        gr.inputs.Textbox(lines=7, label="Input text in the box below", placeholder="Text to lemmatize"),
        gr.inputs.Radio(list(models.keys()), label="Lemmatization type")
    ],
    outputs=[
        gr.outputs.Textbox()
    ],
    examples=[
        [
            "Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
            "Lemmatize"
        ],
        [
            "Ekspositsioonid võiksid alata juba kunstihotellide fuajeedest.",
            "Lemmatize with special symbols"
        ],
        [
            "Kõik uuritavad võeti vastu TÜ üld- ja molekulaarpatoloogia instituudis inimesegeneetika uurimisrühmas.",
            "Lemmatize with special symbols"
        ],
        [
            "Peamiselt viimasele toetub ka järgnev artikkel.",
            "Lemmatize"
        ],
        [
            "Arutletakse selle üle, mida ülearuse rahaga peale hakata.",
            "Lemmatize"
        ],
        [
            "Väikesele poisile tuuakse apteegist söögiisu tõstmiseks kalamaksaõli.",
            "Lemmatize"
        ],
        [
            "Tulevased beebid olid justkui peegeldusena pilgu beebisinas ja veel mingi ähmane lubadus.",
            "Lemmatize"
        ],
    ],
    allow_screenshot=False,
    allow_flagging="never",
)
78
+
79
+
80
# Start the demo server; the request queue serializes concurrent predictions.
gradio_ui.launch(enable_queue=True, debug=False)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/slowwavesleep/lexicon-enhanced-lemmatization.git
2
+ gradio==2.9.0
vb_stanza_no_compound_no_deriv.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10317eb285db2e93c73f8a6c4484492db401d17d5600692e24845d88f24f29b9
3
+ size 3606526
vb_stanza_symbols.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd6ada415cf8e89034a4a3710f4ba069e982b6af239560310007bdd1ad69466
3
+ size 3606654