chrisjay committed on
Commit
db3a1a4
1 Parent(s): 6b9259e

plot per language

Browse files
Files changed (2) hide show
  1. app.py +65 -23
  2. data +1 -1
app.py CHANGED
@@ -1,4 +1,5 @@
1
  from ctypes.wintypes import LANGID
 
2
  from email.policy import default
3
  import pycountry
4
  import os
@@ -179,7 +180,16 @@ def get_metadata_json(path):
179
  except Exception:
180
  return []
181
 
182
- def show_records():
 
 
 
 
 
 
 
 
 
183
  repo.git_pull()
184
  REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
185
  repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
@@ -188,29 +198,8 @@ def show_records():
188
  audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
189
  metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
190
  metadata_all = [m for m in metadata_all if m!=[]]
191
- audios_all = audio_repo
192
-
193
- langs=[m['language_name'] for m in metadata_all]
194
- lang_dict = Counter(langs)
195
- lang_dict.update({'All others':0})
196
- all_langs = list(lang_dict.keys())
197
- langs_count = [lang_dict[k] for k in all_langs]
198
- y_pos = np.arange(len(all_langs))
199
- plt.barh(all_langs, langs_count)
200
- plt.ylabel("Language")
201
- plt.xlabel('Number of audio samples')
202
- plt.title('Distribution of audio samples over languages')
203
-
204
- #audios = [a for a in audios_all]
205
- #texts = [m['text'] for m in metadata_all]
206
- #numbers = [m['number'] for m in metadata_all]
207
 
208
- html = f"""<div class="infoPoint">
209
- <h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
210
- """
211
-
212
- return html,plt
213
-
214
 
215
 
216
  def display_records():
@@ -315,9 +304,62 @@ with block:
315
  </div>
316
  """)
317
  plot = gr.Plot(type="matplotlib")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  #listen = gr.Button("Listen")
320
  listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
 
 
 
 
 
 
321
  gr.Markdown(ARTICLE)
322
 
323
  block.launch()
1
  from ctypes.wintypes import LANGID
2
+ from curses import meta
3
  from email.policy import default
4
  import pycountry
5
  import os
180
  except Exception:
181
  return []
182
 
183
+
184
+ def plot_bar(value,name,x_name,y_name,title):
185
+
186
+ plt.barh(name, value)
187
+ plt.ylabel(y_name)
188
+ plt.xlabel(x_name)
189
+ plt.title(title)
190
+ return plt
191
+
192
+ def get_metadata_of_dataset():
193
  repo.git_pull()
194
  REPOSITORY_DATA_DIR = os.path.join(REPOSITORY_DIR,'data')
195
  repo_recordings = [os.path.join(REPOSITORY_DATA_DIR,f.name) for f in os.scandir(REPOSITORY_DATA_DIR)] if os.path.isdir(REPOSITORY_DATA_DIR) else []
198
  audio_repo = [a.replace('data/data/','https://huggingface.co/datasets/chrisjay/crowd-speech-africa/resolve/main/data/') for a in audio_repo]
199
  metadata_all = [get_metadata_json(os.path.join(f,'metadata.jsonl')) for f in repo_recordings]
200
  metadata_all = [m for m in metadata_all if m!=[]]
201
+ return metadata_all
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
 
 
 
 
 
 
203
 
204
 
205
  def display_records():
304
  </div>
305
  """)
306
  plot = gr.Plot(type="matplotlib")
307
+ metadata_all = get_metadata_of_dataset()
308
+
309
+ def show_records():
310
+ langs=[m['language_name'] for m in metadata_all]
311
+ all_genders = [m['gender'] for m in metadata_all
312
+ ]
313
+ lang_dict = Counter(langs)
314
+ lang_dict.update({'All others':0})
315
+ all_langs = list(lang_dict.keys())
316
+ langs_count = [lang_dict[k] for k in all_langs]
317
+ plt_ = plot_bar(langs_count,all_langs,'Number of audio samples',"Language",'Distribution of audio samples over languages')
318
+ html = f"""<div class="infoPoint">
319
+ <h1> Hooray! We have collected {len(metadata_all)} samples!</h1>
320
+ """
321
+
322
+ return html,plt_
323
+
324
+
325
+
326
+ languages = list(Counter([m['language_name'] for m in metadata_all]).keys())
327
+ for language in languages:
328
+ with gr.Row() as row_lang:
329
+ metadata_for_language = [m for m in metadata_all if m['language_name']==language]
330
+ gender_for_language = [m['gender'] for m in metadata_for_language]
331
+ digits_for_language = [m['number'] for m in metadata_for_language]
332
+ gender_for_language = [g if g!="" else 'Not given' for g in gender_for_language]
333
+
334
+ digits_dict = Counter(digits_for_language)
335
+ gender_dict = Counter(gender_for_language)
336
+
337
+ digits_name_for_language = list(digits_dict.keys())
338
+ digits_count_for_language = [digits_dict[k] for k in digits_name_for_language]
339
+
340
+ gender_name_for_language = list(gender_dict.keys())
341
+ gender_count_for_language = [gender_dict[k] for k in gender_name_for_language]
342
 
343
+ plot_digits = gr.Plot(type="matplotlib")
344
+ plot_gender = gr.Plot(type="matplotlib")
345
+
346
+ def plot_metadata_for_language():
347
+ plt_digits = plot_bar(digits_count_for_language,digits_name_for_language,'Number of audio samples',"Digit",f"Distribution of audio samples over digits for {language.upper()} ")
348
+ plt_gender = plot_bar(gender_count_for_language,gender_name_for_language,'Number of audio samples',"Gender",f"Distribution of audio samples over digits for {language.upper()}")
349
+ return plt_digits, plt_gender
350
+
351
+
352
+ row_lang.select(plot_metadata_for_language,inputs=[],outputs=[plot_digits,plot_gender])
353
+
354
+
355
  #listen = gr.Button("Listen")
356
  listen_tab.select(show_records,inputs=[],outputs=[display_html,plot])
357
+
358
+
359
+ # Have a list of the languages. lang
360
+ # We want digits per language and gender per language
361
+ # for l in range(len(lang),step =4)
362
+ # with Row().... d
363
  gr.Markdown(ARTICLE)
364
 
365
  block.launch()
data CHANGED
@@ -1 +1 @@
1
- Subproject commit 7fa5d2a2751934a30d76f6dbd20b3295766ba39e
1
+ Subproject commit af4ec56533825ccc0877c32d8ad73301181e8e98