chrisjay committed on
Commit
73257d5
1 Parent(s): af6c493

emails and country included

Browse files
Files changed (4) hide show
  1. app.py +49 -27
  2. app3.py +0 -39
  3. article.py +4 -5
  4. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from ctypes.wintypes import LANGID
2
  from email.policy import default
 
3
  import os
4
  import csv
5
  import random
@@ -18,8 +19,10 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
18
  NUMBER_DIR = './number'
19
  number_files = [f.name for f in os.scandir(NUMBER_DIR)]
20
 
 
21
 
22
  DATASET_REPO_URL = "https://huggingface.co/datasets/chrisjay/crowd-speech-africa"
 
23
  REPOSITORY_DIR = "data"
24
  LOCAL_DIR = 'data_local'
25
  os.makedirs(LOCAL_DIR,exist_ok=True)
@@ -48,7 +51,7 @@ with open('app.css','r') as f:
48
 
49
 
50
 
51
- def save_record(language,text,record,number,age,gender,accent,number_history,current_number,done_recording):
52
  number_history = number_history or [0]
53
 
54
  # Save text and its corresponding record to flag
@@ -56,7 +59,6 @@ def save_record(language,text,record,number,age,gender,accent,number_history,cur
56
  speaker_metadata['gender'] = gender if gender!=GENDER[0] else ''
57
  speaker_metadata['age'] = age if age !='' else ''
58
  speaker_metadata['accent'] = accent if accent!='' else ''
59
- import pdb;pdb.set_trace()
60
  default_record = None
61
  if not done_recording:
62
  if language!=None and language!='Choose language' and record is not None and number is not None:
@@ -77,7 +79,8 @@ def save_record(language,text,record,number,age,gender,accent,number_history,cur
77
  'language_name':language,'language_id':lang_id,
78
  'number':current_number, 'text':text,'frequency':record[0],
79
  'age': speaker_metadata['age'],'gender': speaker_metadata['gender'],
80
- 'accent': speaker_metadata['accent']
 
81
  }
82
 
83
  dump_json(metadata,json_file_path)
@@ -102,7 +105,7 @@ def save_record(language,text,record,number,age,gender,accent,number_history,cur
102
  token=HF_TOKEN
103
  )
104
 
105
- output = f'Recording successfully saved!'
106
 
107
  # Choose the next number
108
  number_history.append(current_number)
@@ -112,9 +115,34 @@ def save_record(language,text,record,number,age,gender,accent,number_history,cur
112
 
113
  next_number_image = f'number/{next_number}.jpg'
114
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  done_recording=True
116
  next_number = 0 # the default number
117
- next_number_image = f'number/best.gif'
 
118
  output_string = "<html> <body> <div class='output' style='color:green; font-size:13px'>"+output+"</div> </body> </html>"
119
  return output_string,next_number_image,number_history,next_number,done_recording,default_record
120
 
@@ -129,6 +157,7 @@ def save_record(language,text,record,number,age,gender,accent,number_history,cur
129
  # return output_string, previous image and state
130
  return output_string, number,number_history,current_number,done_recording,default_record
131
  else:
 
132
  # Stop submitting recording (best.gif is displaying)
133
  output = '🙌 You have finished all recording! Thank You. You can reload to start again (maybe in another language).'
134
  output_string = "<div class='finished'>"+output+"</div>"
@@ -187,40 +216,33 @@ markdown="""
187
 
188
  > Record numbers 0-9 in your African language.
189
 
190
- 1. Choose your African language
191
- 2. Fill in the speaker metadata (age, gender, accent). This is optional but important to build better speech models.
192
- 3. You will see the image of a number __(this is the number you will record)__.
193
- 4. Fill in the word of that number (optional)
194
- 5. Click record and say the number in your African language.
195
- 6. Click ‘Submit’. It will save your record and go to the next number.
196
- 7. Repeat 4-7
197
- 8. Leave a ❤ in the Space, if you found it fun.
198
- """
199
- SORTED_LANGUAGES = sorted([lang_.title() for lang_ in list(DEFAULT_LANGS.keys())])
200
- LANGAUGES_CHOOSE = """
201
- <label for="langs"> Choose your language </label>
202
- <input type="text" id="langs" name="AfricanLanguages" list="languagesList">
203
-
204
- <datalist id='languagesList'>
205
  """
206
- for lang in SORTED_LANGUAGES:
207
- LANGAUGES_CHOOSE+= f"<option> {lang} </option> \n"
208
- LANGAUGES_CHOOSE+="</datalist>"
209
 
210
 
211
  # Interface design begins
212
  block = gr.Blocks(css=BLOCK_CSS)
213
  with block:
214
  gr.Markdown(markdown)
 
215
  with gr.Tabs():
216
 
217
  with gr.TabItem('Record'):
218
  with gr.Row():
219
- language = gr.HTML(LANGAUGES_CHOOSE)
220
- #language = gr.inputs.Dropdown(choices = sorted([lang_.title() for lang_ in list(DEFAULT_LANGS.keys())]),label="Choose language",default="Choose language")
221
  age = gr.inputs.Textbox(placeholder='e.g. 21',label="Your age (optional)",default='')
222
  gender = gr.inputs.Dropdown(choices=GENDER, type="value", default=None, label="Gender (optional)")
223
- accent = gr.inputs.Textbox(label="Accent (optional)",default='')
 
224
 
225
  number = gr.Image('number/0.jpg',image_mode="L")
226
  text = gr.inputs.Textbox(placeholder='e.g. `one` is `otu` in Igbo or `ọkan` in Yoruba',label="How is the number called in your language (optional)")
@@ -233,7 +255,7 @@ with block:
233
  save = gr.Button("Submit")
234
 
235
 
236
- save.click(save_record, inputs=[language,text,record,number,age,gender,accent,state,current_number,done_recording],outputs=[output_result,number,state,current_number,done_recording,record])
237
 
238
  with gr.TabItem('Listen') as listen_tab:
239
  gr.Markdown("Listen to the recordings contributed. You can find them <a href='https://huggingface.co/datasets/chrisjay/crowd-speech-africa' target='blank'>here</a>.")
 
1
  from ctypes.wintypes import LANGID
2
  from email.policy import default
3
+ import pycountry
4
  import os
5
  import csv
6
  import random
 
19
  NUMBER_DIR = './number'
20
  number_files = [f.name for f in os.scandir(NUMBER_DIR)]
21
 
22
+ DEFAULT_LIST_OF_COUNTRIES = [country.name for country in pycountry.countries]
23
 
24
  DATASET_REPO_URL = "https://huggingface.co/datasets/chrisjay/crowd-speech-africa"
25
+ EMAILS_REPO_URL="https://huggingface.co/datasets/chrisjay/african-digits-recording-sprint-email"
26
  REPOSITORY_DIR = "data"
27
  LOCAL_DIR = 'data_local'
28
  os.makedirs(LOCAL_DIR,exist_ok=True)
 
51
 
52
 
53
 
54
+ def save_record(language,text,record,number,age,gender,accent,number_history,current_number,country,email,done_recording):
55
  number_history = number_history or [0]
56
 
57
  # Save text and its corresponding record to flag
 
59
  speaker_metadata['gender'] = gender if gender!=GENDER[0] else ''
60
  speaker_metadata['age'] = age if age !='' else ''
61
  speaker_metadata['accent'] = accent if accent!='' else ''
 
62
  default_record = None
63
  if not done_recording:
64
  if language!=None and language!='Choose language' and record is not None and number is not None:
 
79
  'language_name':language,'language_id':lang_id,
80
  'number':current_number, 'text':text,'frequency':record[0],
81
  'age': speaker_metadata['age'],'gender': speaker_metadata['gender'],
82
+ 'accent': speaker_metadata['accent'],
83
+ 'country':country
84
  }
85
 
86
  dump_json(metadata,json_file_path)
 
105
  token=HF_TOKEN
106
  )
107
 
108
+ output = f'Recording successfully saved! On to the next one...'
109
 
110
  # Choose the next number
111
  number_history.append(current_number)
 
115
 
116
  next_number_image = f'number/{next_number}.jpg'
117
  else:
118
+ email_metadata_name = get_unique_name()
119
+ EMAIL_SAVE_FILE = os.path.join(LOCAL_DIR,f"{email_metadata_name}.json")
120
+ # Write metadata.json to file
121
+ email_metadata = {'id':email_metadata_name,'email':email,
122
+ 'language_name':language,'language_id':lang_id,
123
+ 'age': speaker_metadata['age'],'gender': speaker_metadata['gender'],
124
+ 'accent': speaker_metadata['accent'],
125
+ 'country':country
126
+ }
127
+
128
+ dump_json(email_metadata,EMAIL_SAVE_FILE)
129
+
130
+ # Upload the metadata
131
+ repo_json_path = os.path.join('emails',f"{email_metadata_name}.json")
132
+ _ = upload_file(path_or_fileobj = EMAIL_SAVE_FILE,
133
+ path_in_repo =repo_json_path,
134
+ repo_id='chrisjay/african-digits-recording-sprint-email',
135
+ repo_type='dataset',
136
+ token=HF_TOKEN
137
+ )
138
+ # Delete the email from local repo
139
+ if os.path.exists(EMAIL_SAVE_FILE):
140
+ os.remove(EMAIL_SAVE_FILE)
141
+ #-------------------
142
  done_recording=True
143
  next_number = 0 # the default number
144
+ next_number_image = f'number/best.gif'
145
+ output = "You have finished all recording! You can reload to start again."
146
  output_string = "<html> <body> <div class='output' style='color:green; font-size:13px'>"+output+"</div> </body> </html>"
147
  return output_string,next_number_image,number_history,next_number,done_recording,default_record
148
 
 
157
  # return output_string, previous image and state
158
  return output_string, number,number_history,current_number,done_recording,default_record
159
  else:
160
+
161
  # Stop submitting recording (best.gif is displaying)
162
  output = '🙌 You have finished all recording! Thank You. You can reload to start again (maybe in another language).'
163
  output_string = "<div class='finished'>"+output+"</div>"
 
216
 
217
  > Record numbers 0-9 in your African language.
218
 
219
+ 1. Fill in your email. This is completely optional. We need this to track your progress for the prize.
220
+ 2. Choose your African language
221
+ 3. Fill in the speaker metadata (age, gender, accent). This is optional but important to build better speech models.
222
+ 4. You will see the image of a number __(this is the number you will record)__.
223
+ 5. Fill in the word of that number (optional)
224
+ 6. Click record and say the number in your African language.
225
+ 7. Click ‘Submit’. It will save your record and go to the next number.
226
+ 8. Repeat 4-7
227
+ 9. Leave a ❤ in the Space, if you found it fun.
 
 
 
 
 
 
228
  """
 
 
 
229
 
230
 
231
  # Interface design begins
232
  block = gr.Blocks(css=BLOCK_CSS)
233
  with block:
234
  gr.Markdown(markdown)
235
+ email = gr.inputs.Textbox(placeholder='your email',label="Email (if you want join the sprint)",default='')
236
  with gr.Tabs():
237
 
238
  with gr.TabItem('Record'):
239
  with gr.Row():
240
+
241
+ language = gr.inputs.Dropdown(choices = sorted([lang_.title() for lang_ in list(DEFAULT_LANGS.keys())]),label="Choose language",default="Choose language")
242
  age = gr.inputs.Textbox(placeholder='e.g. 21',label="Your age (optional)",default='')
243
  gender = gr.inputs.Dropdown(choices=GENDER, type="value", default=None, label="Gender (optional)")
244
+ accent = gr.inputs.Textbox(label="Accent (optional)",default='')
245
+ country = gr.Dropdown(choices=[''] + sorted(DEFAULT_LIST_OF_COUNTRIES),type='value',default=None,label="Country you are recording from (optional)")
246
 
247
  number = gr.Image('number/0.jpg',image_mode="L")
248
  text = gr.inputs.Textbox(placeholder='e.g. `one` is `otu` in Igbo or `ọkan` in Yoruba',label="How is the number called in your language (optional)")
 
255
  save = gr.Button("Submit")
256
 
257
 
258
+ save.click(save_record, inputs=[language,text,record,number,age,gender,accent,state,current_number,country,email,done_recording],outputs=[output_result,number,state,current_number,done_recording,record])
259
 
260
  with gr.TabItem('Listen') as listen_tab:
261
  gr.Markdown("Listen to the recordings contributed. You can find them <a href='https://huggingface.co/datasets/chrisjay/crowd-speech-africa' target='blank'>here</a>.")
app3.py DELETED
@@ -1,39 +0,0 @@
1
- import os
2
- import gradio as gr
3
-
4
-
5
- #HF_TOKEN = os.environ.get("HF_TOKEN")
6
- #print("is none?", HF_TOKEN is None)
7
-
8
- def get_record(language,text,record):
9
- # Save text and its corresponding record to flag
10
-
11
- text =text.strip()
12
-
13
- #output_string = "<html> <body> <div class='output'>"+f'Record for text {text} successfully saved to dataset! Thank You.'+"</div> </body> </html>"
14
- output_string = f'Record for text - {text} - successfully saved to dataset! Thank You.'
15
- return output_string
16
-
17
- title = 'African Crowdsource Speech'
18
- description = 'A platform to contribute to your African language by recording your voice'
19
-
20
- markdown = """# African Crowdsource Speech
21
-
22
- A platform to contribute to your African language by recording your voice
23
- """
24
-
25
- # Get a dropdown of all African languages
26
-
27
- # Interface design begins
28
- #import pdb; pdb.set_trace()
29
- iface = gr.Interface(fn=get_record,
30
- inputs=[gr.inputs.Textbox(placeholder='Choose your language'),
31
- gr.inputs.Textbox(placeholder='Write your text'),
32
- gr.inputs.Audio(source="microphone",label='Record your voice')
33
- ],
34
- outputs = "text",
35
- title=title,
36
- description=description,
37
- theme='huggingface'
38
- )
39
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
article.py CHANGED
@@ -14,11 +14,10 @@ This dataset will boost speech technologies (like speech-to-text, text-to-speech
14
 
15
  **About the dataset**
16
 
17
- The data (metadat,text, and audio recording) are uploaded to [a public Hugging Face dataset](https://huggingface.co/datasets/chrisjay/crowd-speech-africa).
18
-
19
- We do not collect your name, address or other sensitive information.
20
-
21
- If for some reason you want to remove your entry, please reach out by email.
22
 
23
  **Contact**
24
 
 
14
 
15
  **About the dataset**
16
 
17
+ - The data (metadata, text, and audio recording) are uploaded to [a public Hugging Face dataset](https://huggingface.co/datasets/chrisjay/crowd-speech-africa).
18
+ - We do not collect your name, address or other sensitive information.
19
+ - If for some reason you want to remove your entry, please reach out by email.
20
+ - Your email, if given, is used only to keep track of your progress in order to give the prizes to the top scorers. Emails are temporarily stored in [this private dataset](https://huggingface.co/datasets/chrisjay/african-digits-recording-sprint-email) and deleted immediately after the sprint.
 
21
 
22
  **Contact**
23
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  pandas
2
- scipy
 
 
1
  pandas
2
+ scipy
3
+ pycountry