Sam Passaglia committed on
Commit
d41e82b
•
1 Parent(s): 5be9747
Files changed (2) hide show
  1. app.py +42 -21
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,5 +1,7 @@
1
  """app.py
2
  streamlit demo of yomikata"""
 
 
3
  import pandas as pd
4
  import spacy
5
  import streamlit as st
@@ -8,9 +10,9 @@ from speach import ttlig
8
  from yomikata import utils
9
  from yomikata.dictionary import Dictionary
10
  from yomikata.utils import parse_furigana
11
- from pathlib import Path
12
 
13
- @st.cache_data
 
14
  def add_border(html: str):
15
  WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
16
  html = html.replace("\n", " ")
@@ -23,46 +25,65 @@ def get_random_sentence():
23
  df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
24
  return df.sample(1).iloc[0].sentence
25
 
26
- @st.cache_data
 
27
  def get_dbert_prediction_and_heteronym_list(text):
28
  from yomikata.dbert import dBert
29
 
30
  reader = dBert()
31
  return reader.furigana(text), reader.heteronyms
32
 
33
- @st.cache_data
 
34
  def get_stats():
35
  from config import config
36
  from yomikata.utils import load_dict
 
37
  stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
38
 
39
- global_accuracy = stats['test']['accuracy']
40
 
41
- stats = stats['test']['heteronym_performance']
42
  heteronyms = stats.keys()
43
 
44
- accuracy = [stats[heteronym]['accuracy'] for heteronym in heteronyms]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- readings = [ "、".join(["{reading} ({correct}/{n})".format(reading=reading, correct=stats[heteronym]['readings'][reading]['found'][reading], n=stats[heteronym]['readings'][reading]['n']) for reading in stats[heteronym]['readings'].keys() if (stats[heteronym]['readings'][reading]['found'][reading] !=0 or reading != '<OTHER>')]) for heteronym in heteronyms ]
47
 
48
- #if reading != '<OTHER>'
49
 
50
- df = pd.DataFrame({'heteronym': heteronyms, 'accuracy': accuracy, 'readings': readings} )
51
 
52
- df = df[df['readings'].str.contains('、')]
53
 
54
- df['readings'] = df['readings'].str.replace('<OTHER>', 'Other')
55
-
56
- df = df.rename(columns={'readings':'readings (test corr./total)'})
57
 
58
- df= df.sort_values('accuracy', ascending=False, ignore_index=True)
59
 
60
- df.index += 1
61
 
62
  return global_accuracy, df
63
 
64
 
65
- @st.cache_data
66
  def furigana_to_spacy(text_with_furigana):
67
  tokens = parse_furigana(text_with_furigana)
68
  ents = []
@@ -116,9 +137,7 @@ label_colors = {
116
  reading: colors[i % len(colors)]
117
  for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
118
  }
119
- html = spacy.displacy.render(
120
- spacy_dict, style="ent", manual=True, options={"colors": label_colors}
121
- )
122
 
123
  if len(spacy_dict["ents"]) > 0:
124
  st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
@@ -164,7 +183,9 @@ if st.button("🎲 Randomize the input sentence"):
164
  # Stats section
165
  global_accuracy, stats_df = get_stats()
166
 
167
- st.subheader(f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}")
 
 
168
 
169
  st.dataframe(stats_df)
170
 
 
1
  """app.py
2
  streamlit demo of yomikata"""
3
+ from pathlib import Path
4
+
5
  import pandas as pd
6
  import spacy
7
  import streamlit as st
 
10
  from yomikata import utils
11
  from yomikata.dictionary import Dictionary
12
  from yomikata.utils import parse_furigana
 
13
 
14
+
15
+ @st.cache
16
  def add_border(html: str):
17
  WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
18
  html = html.replace("\n", " ")
 
25
  df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
26
  return df.sample(1).iloc[0].sentence
27
 
28
+
29
+ @st.cache
30
  def get_dbert_prediction_and_heteronym_list(text):
31
  from yomikata.dbert import dBert
32
 
33
  reader = dBert()
34
  return reader.furigana(text), reader.heteronyms
35
 
36
+
37
+ @st.cache
38
  def get_stats():
39
  from config import config
40
  from yomikata.utils import load_dict
41
+
42
  stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
43
 
44
+ global_accuracy = stats["test"]["accuracy"]
45
 
46
+ stats = stats["test"]["heteronym_performance"]
47
  heteronyms = stats.keys()
48
 
49
+ accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
50
+
51
+ readings = [
52
+ "、".join(
53
+ [
54
+ "{reading} ({correct}/{n})".format(
55
+ reading=reading,
56
+ correct=stats[heteronym]["readings"][reading]["found"][reading],
57
+ n=stats[heteronym]["readings"][reading]["n"],
58
+ )
59
+ for reading in stats[heteronym]["readings"].keys()
60
+ if (
61
+ stats[heteronym]["readings"][reading]["found"][reading] != 0
62
+ or reading != "<OTHER>"
63
+ )
64
+ ]
65
+ )
66
+ for heteronym in heteronyms
67
+ ]
68
 
69
+ # if reading != '<OTHER>'
70
 
71
+ df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
72
 
73
+ df = df[df["readings"].str.contains("、")]
74
 
75
+ df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
76
 
77
+ df = df.rename(columns={"readings": "readings (test corr./total)"})
 
 
78
 
79
+ df = df.sort_values("accuracy", ascending=False, ignore_index=True)
80
 
81
+ df.index += 1
82
 
83
  return global_accuracy, df
84
 
85
 
86
+ @st.cache
87
  def furigana_to_spacy(text_with_furigana):
88
  tokens = parse_furigana(text_with_furigana)
89
  ents = []
 
137
  reading: colors[i % len(colors)]
138
  for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
139
  }
140
+ html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
 
 
141
 
142
  if len(spacy_dict["ents"]) > 0:
143
  st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
 
183
  # Stats section
184
  global_accuracy, stats_df = get_stats()
185
 
186
+ st.subheader(
187
+ f"{len(stats_df)} heteronyms supported, with a global accuracy of {global_accuracy:.0%}"
188
+ )
189
 
190
  st.dataframe(stats_df)
191
 
requirements.txt CHANGED
@@ -15,5 +15,5 @@ transformers>=4.25.1
15
  datasets>=2.7.1
16
  pynvml==11.4.1
17
  sentencepiece>=0.1.97
18
- streamlit>=1.18.1
19
  rich
 
15
  datasets>=2.7.1
16
  pynvml==11.4.1
17
  sentencepiece>=0.1.97
18
+ streamlit==1.17.0
19
  rich