Spaces:
Build error
Build error
Sam Passaglia
committed on
Commit
•
d41e82b
1
Parent(s):
5be9747
minor
Browse files
- app.py +42 -21
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
"""app.py
|
2 |
streamlit demo of yomikata"""
|
|
|
|
|
3 |
import pandas as pd
|
4 |
import spacy
|
5 |
import streamlit as st
|
@@ -8,9 +10,9 @@ from speach import ttlig
|
|
8 |
from yomikata import utils
|
9 |
from yomikata.dictionary import Dictionary
|
10 |
from yomikata.utils import parse_furigana
|
11 |
-
from pathlib import Path
|
12 |
|
13 |
-
|
|
|
14 |
def add_border(html: str):
|
15 |
WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
|
16 |
html = html.replace("\n", " ")
|
@@ -23,46 +25,65 @@ def get_random_sentence():
|
|
23 |
df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
|
24 |
return df.sample(1).iloc[0].sentence
|
25 |
|
26 |
-
|
|
|
27 |
def get_dbert_prediction_and_heteronym_list(text):
|
28 |
from yomikata.dbert import dBert
|
29 |
|
30 |
reader = dBert()
|
31 |
return reader.furigana(text), reader.heteronyms
|
32 |
|
33 |
-
|
|
|
34 |
def get_stats():
|
35 |
from config import config
|
36 |
from yomikata.utils import load_dict
|
|
|
37 |
stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
|
38 |
|
39 |
-
global_accuracy = stats[
|
40 |
|
41 |
-
stats = stats[
|
42 |
heteronyms = stats.keys()
|
43 |
|
44 |
-
accuracy = [stats[heteronym][
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
|
48 |
-
|
49 |
|
50 |
-
df =
|
51 |
|
52 |
-
df = df[
|
53 |
|
54 |
-
df
|
55 |
-
|
56 |
-
df = df.rename(columns={'readings':'readings (test corr./total)'})
|
57 |
|
58 |
-
df= df.sort_values(
|
59 |
|
60 |
-
df.index += 1
|
61 |
|
62 |
return global_accuracy, df
|
63 |
|
64 |
|
65 |
-
@st.
|
66 |
def furigana_to_spacy(text_with_furigana):
|
67 |
tokens = parse_furigana(text_with_furigana)
|
68 |
ents = []
|
@@ -116,9 +137,7 @@ label_colors = {
|
|
116 |
reading: colors[i % len(colors)]
|
117 |
for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
|
118 |
}
|
119 |
-
html = spacy.displacy.render(
|
120 |
-
spacy_dict, style="ent", manual=True, options={"colors": label_colors}
|
121 |
-
)
|
122 |
|
123 |
if len(spacy_dict["ents"]) > 0:
|
124 |
st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
|
@@ -164,7 +183,9 @@ if st.button("🎲 Randomize the input sentence"):
|
|
164 |
# Stats section
|
165 |
global_accuracy, stats_df = get_stats()
|
166 |
|
167 |
-
st.subheader(
|
|
|
|
|
168 |
|
169 |
st.dataframe(stats_df)
|
170 |
|
|
|
1 |
"""app.py
|
2 |
streamlit demo of yomikata"""
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
import pandas as pd
|
6 |
import spacy
|
7 |
import streamlit as st
|
|
|
10 |
from yomikata import utils
|
11 |
from yomikata.dictionary import Dictionary
|
12 |
from yomikata.utils import parse_furigana
|
|
|
13 |
|
14 |
+
|
15 |
+
@st.cache
|
16 |
def add_border(html: str):
|
17 |
WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
|
18 |
html = html.replace("\n", " ")
|
|
|
25 |
df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
|
26 |
return df.sample(1).iloc[0].sentence
|
27 |
|
28 |
+
|
29 |
+
@st.cache
|
30 |
def get_dbert_prediction_and_heteronym_list(text):
|
31 |
from yomikata.dbert import dBert
|
32 |
|
33 |
reader = dBert()
|
34 |
return reader.furigana(text), reader.heteronyms
|
35 |
|
36 |
+
|
37 |
+
@st.cache
|
38 |
def get_stats():
|
39 |
from config import config
|
40 |
from yomikata.utils import load_dict
|
41 |
+
|
42 |
stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
|
43 |
|
44 |
+
global_accuracy = stats["test"]["accuracy"]
|
45 |
|
46 |
+
stats = stats["test"]["heteronym_performance"]
|
47 |
heteronyms = stats.keys()
|
48 |
|
49 |
+
accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
|
50 |
+
|
51 |
+
readings = [
|
52 |
+
"、".join(
|
53 |
+
[
|
54 |
+
"{reading} ({correct}/{n})".format(
|
55 |
+
reading=reading,
|
56 |
+
correct=stats[heteronym]["readings"][reading]["found"][reading],
|
57 |
+
n=stats[heteronym]["readings"][reading]["n"],
|
58 |
+
)
|
59 |
+
for reading in stats[heteronym]["readings"].keys()
|
60 |
+
if (
|
61 |
+
stats[heteronym]["readings"][reading]["found"][reading] != 0
|
62 |
+
or reading != "<OTHER>"
|
63 |
+
)
|
64 |
+
]
|
65 |
+
)
|
66 |
+
for heteronym in heteronyms
|
67 |
+
]
|
68 |
|
69 |
+
# if reading != '<OTHER>'
|
70 |
|
71 |
+
df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
|
72 |
|
73 |
+
df = df[df["readings"].str.contains("、")]
|
74 |
|
75 |
+
df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
|
76 |
|
77 |
+
df = df.rename(columns={"readings": "readings (test corr./total)"})
|
|
|
|
|
78 |
|
79 |
+
df = df.sort_values("accuracy", ascending=False, ignore_index=True)
|
80 |
|
81 |
+
df.index += 1
|
82 |
|
83 |
return global_accuracy, df
|
84 |
|
85 |
|
86 |
+
@st.cache
|
87 |
def furigana_to_spacy(text_with_furigana):
|
88 |
tokens = parse_furigana(text_with_furigana)
|
89 |
ents = []
|
|
|
137 |
reading: colors[i % len(colors)]
|
138 |
for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
|
139 |
}
|
140 |
+
html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
|
|
|
|
|
141 |
|
142 |
if len(spacy_dict["ents"]) > 0:
|
143 |
st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
|
|
|
183 |
# Stats section
|
184 |
global_accuracy, stats_df = get_stats()
|
185 |
|
186 |
+
st.subheader(
|
187 |
+
f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}"
|
188 |
+
)
|
189 |
|
190 |
st.dataframe(stats_df)
|
191 |
|
requirements.txt
CHANGED
@@ -15,5 +15,5 @@ transformers>=4.25.1
|
|
15 |
datasets>=2.7.1
|
16 |
pynvml==11.4.1
|
17 |
sentencepiece>=0.1.97
|
18 |
-
streamlit
|
19 |
rich
|
|
|
15 |
datasets>=2.7.1
|
16 |
pynvml==11.4.1
|
17 |
sentencepiece>=0.1.97
|
18 |
+
streamlit==1.17.0
|
19 |
rich
|