kargaranamir committed on
Commit
715fd06
1 Parent(s): f462b08

Update GlotLID

Browse files
Files changed (5) hide show
  1. README.md +4 -4
  2. app.py +118 -43
  3. assets/GlotLID_logo.svg +0 -0
  4. assets/language_names.json +0 -0
  5. constants.py +1 -1
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: GlotLID
3
- emoji:
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
 
1
  ---
2
+ title: GlotLID Space
3
+ emoji: 📐
4
+ colorFrom: yellow
5
+ colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.27.2
8
  app_file: app.py
app.py CHANGED
@@ -17,7 +17,7 @@ import fasttext
17
  import altair as alt
18
  from altair import X, Y, Scale
19
  import base64
20
-
21
 
22
  @st.cache_resource
23
  def load_sp():
@@ -28,16 +28,39 @@ def load_sp():
28
  sp = load_sp()
29
 
30
  def get_script(text):
31
- """Get the writing system of given text.
32
 
33
  Args:
34
  text: The text to be preprocessed.
35
 
36
  Returns:
37
- The writing system of text.
38
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- return sp(text)[0]
41
 
42
  @st.cache_data
43
  def render_svg(svg):
@@ -55,17 +78,45 @@ def convert_df(df):
55
 
56
 
57
  @st.cache_resource
58
- def load_model(model_name):
59
- model_path = hf_hub_download(repo_id=model_name, filename="model.bin")
60
  model = fasttext.load_model(model_path)
61
  return model
62
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- model = load_model(constants.MODEL_NAME)
 
 
 
65
 
 
 
 
66
 
67
- def compute(sentences):
68
- """Computes the language labels for the given sentences.
 
 
 
 
 
 
 
 
 
 
69
 
70
  Args:
71
  sentences: A list of sentences.
@@ -74,81 +125,105 @@ def compute(sentences):
74
  A list of language probablities and labels for the given sentences.
75
  """
76
  progress_text = "Computing Language..."
 
77
  my_bar = st.progress(0, text=progress_text)
78
 
79
- BATCH_SIZE = 1
80
  probs = []
81
  labels = []
82
- preprocessed_sentences = sentences
83
 
84
- for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
85
 
86
- outputs = model.predict(preprocessed_sentences[first_index : first_index + BATCH_SIZE])
 
 
 
 
87
 
88
- # BATCH_SIZE = 1
89
- outputs_labels = outputs[0][0]
90
- outputs_probs = outputs[1][0]
 
91
 
92
- probs = probs + [max(min(o, 1), 0) for o in outputs_probs]
93
- labels = labels + outputs_labels
 
 
 
 
 
 
94
 
95
  my_bar.progress(
96
- min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
97
  text=progress_text,
98
  )
99
  my_bar.empty()
100
  return probs, labels
101
 
 
102
 
103
  render_svg(open("assets/GlotLID_logo.svg").read())
104
 
105
  tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
106
 
107
  with tab1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  sent = st.text_input(
109
  "Sentence:", placeholder="Enter a sentence.", on_change=None
110
  )
111
 
112
  # TODO: Check if this is needed!
 
113
  clicked = st.button("Submit")
114
 
115
  if sent:
116
- probs, labels = compute([sent])
 
 
117
  prob = probs[0]
118
  label = labels[0]
119
 
120
- ORANGE_COLOR = "#FF8000"
121
- fig, ax = plt.subplots(figsize=(8, 1))
122
- fig.patch.set_facecolor("none")
123
- ax.set_facecolor("none")
124
-
125
- ax.spines["left"].set_color(ORANGE_COLOR)
126
- ax.spines["bottom"].set_color(ORANGE_COLOR)
127
- ax.tick_params(axis="x", colors=ORANGE_COLOR)
128
-
129
- ax.spines[["right", "top"]].set_visible(False)
130
-
131
- ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
132
- ax.set_xlim(0, 1)
133
- ax.set_ylim(-1, 1)
134
- ax.set_title(f"Langauge is: {label}", color=ORANGE_COLOR)
135
- ax.get_yaxis().set_visible(False)
136
- ax.set_xlabel("Confidence", color=ORANGE_COLOR)
137
- st.pyplot(fig)
138
-
139
  print(sent)
140
  with open("logs.txt", "a") as f:
141
  f.write(sent + "\n")
142
-
143
  with tab2:
 
 
 
 
 
 
 
 
 
 
144
  file = st.file_uploader("Upload a file", type=["txt"])
145
  if file is not None:
146
- df = pd.read_csv(file, sep="\t", header=None)
147
  df.columns = ["Sentence"]
148
  df.reset_index(drop=True, inplace=True)
149
 
150
  # TODO: Run the model
151
- df['Probs'], df["Language"] = compute(df["Sentence"].tolist())
 
152
 
153
  # A horizontal rule
154
  st.markdown("""---""")
@@ -158,7 +233,7 @@ with tab2:
158
  .mark_area(color="darkorange", opacity=0.5)
159
  .encode(
160
  x=X(field="index", title="Sentence Index"),
161
- y=Y("Probs", scale=Scale(domain=[0, 1])),
162
  )
163
  )
164
  st.altair_chart(chart.interactive(), use_container_width=True)
 
17
  import altair as alt
18
  from altair import X, Y, Scale
19
  import base64
20
+ import json
21
 
22
  @st.cache_resource
23
  def load_sp():
 
28
  sp = load_sp()
29
 
30
  def get_script(text):
31
+ """Get the writing systems of given text.
32
 
33
  Args:
34
  text: The text to be preprocessed.
35
 
36
  Returns:
37
+ The main script and list of all scripts.
38
  """
39
+ res = sp(text)
40
+ main_script = res[0] if res[0] else 'Zyyy'
41
+ all_scripts_dict = res[2]['details']
42
+ if all_scripts_dict:
43
+ all_scripts = list(all_scripts_dict.keys())
44
+ else:
45
+ all_scripts = 'Zyyy'
46
+
47
+ return main_script, all_scripts
48
+
49
+
50
+ @st.cache_data
51
+ def language_names(json_path):
52
+ with open(json_path, 'r') as json_file:
53
+ data = json.load(json_file)
54
+ return data
55
+
56
+ label2name = language_names("assets/language_names.json")
57
+
58
+ def get_name(label):
59
+ """Get the name of language from label"""
60
+ iso_3 = label.split('_')[0]
61
+ name = label2name[iso_3]
62
+ return name
63
 
 
64
 
65
  @st.cache_data
66
  def render_svg(svg):
 
78
 
79
 
80
  @st.cache_resource
81
+ def load_GlotLID_v1(model_name, file_name):
82
+ model_path = hf_hub_download(repo_id=model_name, filename=file_name)
83
  model = fasttext.load_model(model_path)
84
  return model
85
 
86
+ @st.cache_resource
87
+ def load_GlotLID_v2(model_name, file_name):
88
+ model_path = hf_hub_download(repo_id=model_name, filename=file_name)
89
+ model = fasttext.load_model(model_path)
90
+ return model
91
+
92
+
93
+ model_1 = load_GlotLID_v1(constants.MODEL_NAME, "model_v1.bin")
94
+ model_2 = load_GlotLID_v2(constants.MODEL_NAME, "model_v2.bin")
95
+
96
+ @st.cache_resource
97
+ def plot(label, prob):
98
 
99
+ ORANGE_COLOR = "#FF8000"
100
+ fig, ax = plt.subplots(figsize=(8, 1))
101
+ fig.patch.set_facecolor("none")
102
+ ax.set_facecolor("none")
103
 
104
+ ax.spines["left"].set_color(ORANGE_COLOR)
105
+ ax.spines["bottom"].set_color(ORANGE_COLOR)
106
+ ax.tick_params(axis="x", colors=ORANGE_COLOR)
107
 
108
+ ax.spines[["right", "top"]].set_visible(False)
109
+
110
+ ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
111
+ ax.set_xlim(0, 1)
112
+ ax.set_ylim(-1, 1)
113
+ ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=ORANGE_COLOR)
114
+ ax.get_yaxis().set_visible(False)
115
+ ax.set_xlabel("Confidence", color=ORANGE_COLOR)
116
+ st.pyplot(fig)
117
+
118
+ def compute(sentences, version = 'v2'):
119
+ """Computes the language probablities and labels for the given sentences.
120
 
121
  Args:
122
  sentences: A list of sentences.
 
125
  A list of language probablities and labels for the given sentences.
126
  """
127
  progress_text = "Computing Language..."
128
+ model_choice = model_2 if version == 'v2' else model_1
129
  my_bar = st.progress(0, text=progress_text)
130
 
 
131
  probs = []
132
  labels = []
 
133
 
134
+ for index, sent in enumerate(sentences):
135
 
136
+ output = model_choice.predict(sent)
137
+
138
+ output_label = output[0][0].split('__')[-1]
139
+ output_prob = max(min(output[1][0], 1), 0)
140
+ output_label_language = output_label.split('_')[0]
141
 
142
+ # script control
143
+ if version in ['v2'] and output_label_language!= 'zxx':
144
+ main_script, all_scripts = get_script(sent)
145
+ output_label_script = output_label.split('_')[1]
146
 
147
+ if output_label_script not in all_scripts:
148
+ output_label_script = main_script
149
+ output_label = f"und_{output_label_script}"
150
+ output_prob = 0
151
+
152
+
153
+ labels = labels + [output_label]
154
+ probs = probs + [output_prob]
155
 
156
  my_bar.progress(
157
+ min((index) / len(sentences), 1),
158
  text=progress_text,
159
  )
160
  my_bar.empty()
161
  return probs, labels
162
 
163
+ st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")
164
 
165
  render_svg(open("assets/GlotLID_logo.svg").read())
166
 
167
  tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
168
 
169
  with tab1:
170
+
171
+ # choice = st.radio(
172
+ # "Set granularity level",
173
+ # ["default", "merge", "individual"],
174
+ # captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual langauges"],
175
+ # )
176
+
177
+ version = st.radio(
178
+ "Choose model",
179
+ ["v1", "v2"],
180
+ captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
181
+ index = 1,
182
+ key = 'version_tab1',
183
+ horizontal = True
184
+ )
185
+
186
  sent = st.text_input(
187
  "Sentence:", placeholder="Enter a sentence.", on_change=None
188
  )
189
 
190
  # TODO: Check if this is needed!
191
+
192
  clicked = st.button("Submit")
193
 
194
  if sent:
195
+ sent = sent.replace('\n', '')
196
+
197
+ probs, labels = compute([sent], version=version)
198
  prob = probs[0]
199
  label = labels[0]
200
 
201
+ # plot
202
+ plot(label, prob)
203
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  print(sent)
205
  with open("logs.txt", "a") as f:
206
  f.write(sent + "\n")
 
207
  with tab2:
208
+
209
+ version = st.radio(
210
+ "Choose model",
211
+ ["v1", "v2"],
212
+ captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
213
+ index = 1,
214
+ key = 'version_tab2',
215
+ horizontal = True
216
+ )
217
+
218
  file = st.file_uploader("Upload a file", type=["txt"])
219
  if file is not None:
220
+ df = pd.read_csv(file, sep="¦\t¦", header=None)
221
  df.columns = ["Sentence"]
222
  df.reset_index(drop=True, inplace=True)
223
 
224
  # TODO: Run the model
225
+ df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version= version)
226
+ df['Language'] = df["Label"].apply(get_name)
227
 
228
  # A horizontal rule
229
  st.markdown("""---""")
 
233
  .mark_area(color="darkorange", opacity=0.5)
234
  .encode(
235
  x=X(field="index", title="Sentence Index"),
236
+ y=Y("Prob", scale=Scale(domain=[0, 1])),
237
  )
238
  )
239
  st.altair_chart(chart.interactive(), use_container_width=True)
assets/GlotLID_logo.svg CHANGED
assets/language_names.json ADDED
The diff for this file is too large to render. See raw diff
 
constants.py CHANGED
@@ -1,4 +1,4 @@
1
  CHOICE_TEXT = "Input Text"
2
  CHOICE_FILE = "Upload File"
3
  TITLE = "GlotLID: Language Identification for Around 2000 Languages"
4
- MODEL_NAME = "cis-lmu/GlotLID"
 
1
  CHOICE_TEXT = "Input Text"
2
  CHOICE_FILE = "Upload File"
3
  TITLE = "GlotLID: Language Identification for Around 2000 Languages"
4
+ MODEL_NAME = "cis-lmu/glotlid"