mj-new committed on
Commit
56bfb5d
1 Parent(s): 40b66e2

Batch recording alpha (missing prompts generation/retrieval)

Browse files
Files changed (4) hide show
  1. __pycache__/helpers.cpython-310.pyc +0 -0
  2. app.py +175 -58
  3. helpers.py +47 -0
  4. requirements.txt +1 -1
__pycache__/helpers.cpython-310.pyc ADDED
Binary file (625 Bytes). View file
 
app.py CHANGED
@@ -12,6 +12,7 @@ import uuid
12
  from datetime import date,datetime
13
  from huggingface_hub import Repository, upload_file
14
  import shutil
 
15
 
16
  HF_TOKEN_WRITE = os.environ.get("HF_TOKEN_WRITE")
17
  print("HF_TOKEN_WRITE", HF_TOKEN_WRITE)
@@ -43,17 +44,23 @@ def get_unique_name():
43
  return ''.join([random.choice(string.ascii_letters
44
  + string.digits) for n in range(32)])
45
 
46
- def save_recording_and_meta(project_name, recording, transcript, language):
 
 
 
 
 
 
47
  #, name, age, gender):
48
  # TODO save user data in the next version
49
 
50
  speaker_metadata={}
51
- speaker_metadata['gender'] = "test" #gender if gender!=GENDER[0] else ''
52
- speaker_metadata['age'] = "test" #age if age !='' else ''
53
- speaker_metadata['accent'] = "test" #accent if accent!='' else ''
 
 
54
 
55
- lang_id = language.lower()
56
-
57
  # TODO get ISO-639-1 codes
58
  transcript =transcript.strip()
59
 
@@ -81,9 +88,10 @@ def save_recording_and_meta(project_name, recording, transcript, language):
81
  now = datetime.now()
82
  timestamp_str = now.strftime("%d/%m/%Y %H:%M:%S")
83
  metadata= {'id':uuid_name,'audio_file': audio_fn,
84
- 'language_name':language,'language_id':lang_id,
85
  'transcript':transcript,'age': speaker_metadata['age'],
86
  'gender': speaker_metadata['gender'],'accent': speaker_metadata['accent'],
 
87
  "date":today_ymd, "timestamp": timestamp_str }
88
 
89
  dump_json(metadata, json_file_path)
@@ -109,7 +117,9 @@ def save_recording_and_meta(project_name, recording, transcript, language):
109
  )
110
 
111
  output = print(f"Recording {audio_fn} and meta file {meta_fn} successfully saved to repo!")
112
- return
 
 
113
 
114
  def whisper_model_change(radio_whisper_model):
115
  whisper_model = whisper.load_model(radio_whisper_model)
@@ -147,17 +157,17 @@ def process_pipeline(audio):
147
  tts_out = synthesize_speech(gpt_out)
148
  return(tts_out)
149
 
150
- def transcribe(audio, language, whisper_model, whisper_model_type):
151
  if not whisper_model:
152
  whisper_model=init_whisper_model(whisper_model_type)
153
 
154
- print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
155
  audio = whisper.load_audio(audio)
156
  audio = whisper.pad_or_trim(audio)
157
 
158
  mel = whisper.log_mel_spectrogram(audio)
159
 
160
- options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
161
  result = whisper.decode(whisper_model, mel, options)
162
  result_text = result.text
163
  return result_text
@@ -168,9 +178,9 @@ def init_whisper_model(whisper_model_type):
168
  whisper_model = whisper.load_model(whisper_model_type)
169
  return whisper_model
170
 
171
- def synthesize_speech(text, language):
172
  audioobj = gTTS(text = text,
173
- lang = language,
174
  slow = False)
175
 
176
  audioobj.save("Temp.mp3")
@@ -180,26 +190,87 @@ block = gr.Blocks(css=css_file)
180
  with block:
181
 
182
  #state variables
183
- language = gr.State("en")
 
 
 
 
 
 
 
 
 
184
  temperature = gr.State(0)
185
  whisper_model_type = gr.State("base")
186
  whisper_model = gr.State()
187
- api_key = gr.State()
 
 
188
  project_name = gr.State("voicebot") # TODO add list of projects to organize saved data
 
 
 
 
 
 
 
 
189
 
190
  # state handling functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def change_language(choice):
192
  if choice == "Polish":
193
- language="pl"
194
  print("Switching to Polish")
195
- print("language")
196
- print(language)
197
  elif choice == "English":
198
- language="en"
199
  print("Switching to English")
200
- print("language")
201
- print(language)
202
- return(language)
203
 
204
  def change_whisper_model(choice):
205
  whisper_model_type = choice
@@ -211,46 +282,92 @@ with block:
211
  gr.Markdown(markdown)
212
 
213
  with gr.Tabs():
214
- with gr.Row():
215
- with gr.TabItem('Voicebot playground'):
216
- with gr.Accordion(label="Settings"):
217
- gr.HTML("<p class=\"apikey\">Open AI API Key:</p>")
218
- # API key textbox (password-style)
219
- api_key = gr.Textbox(label="", elem_id="pw")
220
- slider_temp = gr.Slider(minimum=0, maximum= 2, step=0.2, label="ChatGPT temperature")
221
- radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
222
- #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
223
- #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
224
- radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are more accurate, but slower. Default - base")
225
- with gr.Box():
226
- with gr.Row():
227
- mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
228
-
229
- button_transcribe = gr.Button("Transcribe speech")
230
-
231
- button_save_audio_and_trans = gr.Button("Save recording and meta")
232
-
233
- out_asr = gr.Textbox(placeholder="ASR output",
234
- lines=2,
235
- max_lines=5,
236
- show_label=False)
237
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  button_prompt_gpt = gr.Button("Prompt ChatGPT")
 
239
 
240
- out_gpt = gr.Textbox(placeholder="ChatGPT output",
241
- lines=4,
242
- max_lines=10,
243
- show_label=False)
 
244
  button_synth_speech = gr.Button("Synthesize speech")
245
- synth_recording = gr.Audio()
 
 
246
 
247
- # Events actions
248
- button_save_audio_and_trans.click(save_recording_and_meta, inputs=[project_name, mic_recording, out_asr, language], outputs=[])
249
- button_transcribe.click(transcribe, inputs=[mic_recording, language, whisper_model,whisper_model_type], outputs=out_asr)
250
- button_prompt_gpt.click(prompt_gpt, inputs=[out_asr, api_key, slider_temp], outputs=out_gpt)
251
- button_synth_speech.click(synthesize_speech, inputs=[out_gpt, language], outputs=synth_recording)
252
 
253
- radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
254
- radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  block.launch()
 
12
  from datetime import date,datetime
13
  from huggingface_hub import Repository, upload_file
14
  import shutil
15
+ from helpers import dict_origin
16
 
17
  HF_TOKEN_WRITE = os.environ.get("HF_TOKEN_WRITE")
18
  print("HF_TOKEN_WRITE", HF_TOKEN_WRITE)
 
44
  return ''.join([random.choice(string.ascii_letters
45
  + string.digits) for n in range(32)])
46
 
47
+ def get_prompts(domain, type, size, language_code):
48
+ print(f"Retrieving prompts for domain {domain} with method: {type} for language_code {language_code} of size {size}")
49
+ promptset = ["test1", "test2"]
50
+ first_prompt = promptset[0]
51
+ return(promptset, first_prompt)
52
+
53
+ def save_recording_and_meta(project_name, recording, transcript, language_code, spk_age, spk_accent, spk_city, spk_gender, spk_nativity, promptset, prompt_number):
54
  #, name, age, gender):
55
  # TODO save user data in the next version
56
 
57
  speaker_metadata={}
58
+ speaker_metadata['gender'] = spk_gender if spk_gender !='' else 'unknown'
59
+ speaker_metadata['age'] = spk_age if spk_age !='' else 'unknown'
60
+ speaker_metadata['accent'] = spk_accent if spk_accent !='' else 'unknown'
61
+ speaker_metadata['city'] = spk_city if spk_city !='' else 'unknown'
62
+ speaker_metadata['nativity'] = spk_nativity if spk_nativity !='' else 'unknown'
63
 
 
 
64
  # TODO get ISO-639-1 codes
65
  transcript =transcript.strip()
66
 
 
88
  now = datetime.now()
89
  timestamp_str = now.strftime("%d/%m/%Y %H:%M:%S")
90
  metadata= {'id':uuid_name,'audio_file': audio_fn,
91
+ 'language_code':language_code,
92
  'transcript':transcript,'age': speaker_metadata['age'],
93
  'gender': speaker_metadata['gender'],'accent': speaker_metadata['accent'],
94
+ 'nativity': speaker_metadata['nativity'],'city': speaker_metadata['city'],
95
  "date":today_ymd, "timestamp": timestamp_str }
96
 
97
  dump_json(metadata, json_file_path)
 
117
  )
118
 
119
  output = print(f"Recording {audio_fn} and meta file {meta_fn} successfully saved to repo!")
120
+ # None resets the audio component
121
+ return ["Next prompt", 1, None]
122
+
123
 
124
  def whisper_model_change(radio_whisper_model):
125
  whisper_model = whisper.load_model(radio_whisper_model)
 
157
  tts_out = synthesize_speech(gpt_out)
158
  return(tts_out)
159
 
160
+ def transcribe(audio, language_code, whisper_model, whisper_model_type):
161
  if not whisper_model:
162
  whisper_model=init_whisper_model(whisper_model_type)
163
 
164
+ print(f"Transcribing {audio} for language_code {language_code} and model {whisper_model_type}")
165
  audio = whisper.load_audio(audio)
166
  audio = whisper.pad_or_trim(audio)
167
 
168
  mel = whisper.log_mel_spectrogram(audio)
169
 
170
+ options = whisper.DecodingOptions(language=language_code, without_timestamps=True, fp16=False)
171
  result = whisper.decode(whisper_model, mel, options)
172
  result_text = result.text
173
  return result_text
 
178
  whisper_model = whisper.load_model(whisper_model_type)
179
  return whisper_model
180
 
181
+ def synthesize_speech(text, language_code):
182
  audioobj = gTTS(text = text,
183
+ lang = language_code,
184
  slow = False)
185
 
186
  audioobj.save("Temp.mp3")
 
190
  with block:
191
 
192
  #state variables
193
+ language_code = gr.State("pl")
194
+ domain = gr.State()
195
+ prompts_type = gr.State()
196
+ promptset = gr.State("test.prompts.txt")
197
+ prompt_history = gr.State()
198
+ current_prompt = gr.State()
199
+ prompt_number = gr.State()
200
+ finished_recording = gr.State()
201
+
202
+
203
  temperature = gr.State(0)
204
  whisper_model_type = gr.State("base")
205
  whisper_model = gr.State()
206
+ openai_api_key = gr.State()
207
+ google_api_key = gr.State()
208
+ azure_api_key = gr.State()
209
  project_name = gr.State("voicebot") # TODO add list of projects to organize saved data
210
+
211
+ spk_age = gr.State("unknown")
212
+ spk_accent = gr.State("unknown")
213
+ spk_city = gr.State("unknown")
214
+ spk_gender = gr.State("unknown")
215
+ spk_nativity = gr.State("unknown")
216
+ cities = sorted(dict_origin["Poland"]["cities"])
217
+
218
 
219
  # state handling functions
220
+ def change_domain(choice):
221
+ print("Changing promptset domain to")
222
+ print(choice)
223
+ domain=choice
224
+ return(domain)
225
+
226
+ def change_prompts_type(choice):
227
+ print("Changing promptset type to")
228
+ print(choice)
229
+ prompts_type=choice
230
+ return(prompts_type)
231
+
232
+ def change_nativity(choice):
233
+ print("Changing speaker nativity to")
234
+ print(choice)
235
+ spk_nativity=choice
236
+ return(spk_nativity)
237
+
238
+ def change_accent(choice):
239
+ print("Changing speaker accent to")
240
+ print(choice)
241
+ spk_accent=choice
242
+ return(spk_accent)
243
+
244
+ def change_age(choice):
245
+ print("Changing speaker age to")
246
+ print(choice)
247
+ spk_age=choice
248
+ return(spk_age)
249
+
250
+ def change_city(choice):
251
+ print("Changing speaker city to")
252
+ print(choice)
253
+ spk_city=choice
254
+ return(spk_city)
255
+
256
+ def change_gender(choice):
257
+ print("Changing speaker gender to")
258
+ print(choice)
259
+ spk_gender=choice
260
+ return(spk_gender)
261
+
262
  def change_language(choice):
263
  if choice == "Polish":
264
+ language_code="pl"
265
  print("Switching to Polish")
266
+ print("language_code")
267
+ print(language_code)
268
  elif choice == "English":
269
+ language_code="en"
270
  print("Switching to English")
271
+ print("language_code")
272
+ print(language_code)
273
+ return(language_code)
274
 
275
  def change_whisper_model(choice):
276
  whisper_model_type = choice
 
282
  gr.Markdown(markdown)
283
 
284
  with gr.Tabs():
285
+ with gr.TabItem('General settings'):
286
+ radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none is selected, Polish is used")
287
+ radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
288
+ with gr.Accordion(label="Local ASR settings", open=False):
289
+ #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
290
+ #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
291
+ radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are more accurate, but slower. Default - base")
292
+ with gr.Accordion(label="Cloud ASR settings", open=False):
293
+ radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
294
+ with gr.Accordion(label="Cloud API Keys",open=False):
295
+ gr.HTML("<p class=\"apikey\">Open AI API Key:</p>")
296
+ # API key textbox (password-style)
297
+ openai_api_key = gr.Textbox(label="", elem_id="pw")
298
+ gr.HTML("<p class=\"apikey\">Google Cloud API Key:</p>")
299
+ # API key textbox (password-style)
300
+ google_api_key = gr.Textbox(label="", elem_id="pw")
301
+ gr.HTML("<p class=\"apikey\">Azure Cloud API Key:</p>")
302
+ # API key textbox (password-style)
303
+ azure_api_key = gr.Textbox(label="", elem_id="pw")
304
+ with gr.Accordion(label="Chat GPT settings",open=False):
305
+ slider_temp = gr.Slider(minimum=0, maximum= 2, step=0.2, label="ChatGPT temperature")
306
+
307
+ with gr.TabItem('Speaker information'):
308
+ with gr.Row():
309
+ dropdown_spk_nativity = gr.Dropdown(["Polish", "Other"], label="Your native language", info="")
310
+ dropdown_spk_gender = gr.Dropdown(["Male", "Female", "Other", "Prefer not to say"], label="Your gender", info="")
311
+ dropdown_spk_age = gr.Dropdown(["under 20", "20-29", "30-39", "40-49", "50-59", "over 60"], label="Your age range", info="")
312
+ dropdown_spk_origin_city = gr.Dropdown(cities, label="Your home city", visible=True, info="Specify the closest city your place of birth and upbringing")
313
+ #radio_gdpr_consent = gr.Radio(["Yes", "No"], label="Personal data processing consent", info="Do you agree for your personal data processing according to the policy (link)")
314
+ dropdown_spk_nativity.change(fn=change_nativity, inputs=dropdown_spk_nativity, outputs=spk_age)
315
+ dropdown_spk_gender.change(fn=change_gender, inputs=dropdown_spk_gender, outputs=spk_gender)
316
+ dropdown_spk_age.change(fn=change_age, inputs=dropdown_spk_age, outputs=spk_age)
317
+ dropdown_spk_origin_city.change(fn=change_city, inputs=dropdown_spk_origin_city, outputs=spk_city)
318
+
319
+ with gr.TabItem('Voicebot playground'):
320
+ mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
321
+ with gr.Row():
322
+ button_transcribe = gr.Button("Transcribe speech")
323
+
324
+ button_save_audio_and_trans = gr.Button("Save audio recording and transcription")
325
+
326
+ out_asr = gr.Textbox(placeholder="ASR output",
327
+ lines=2,
328
+ max_lines=5,
329
+ show_label=False)
330
+
331
+ with gr.Row():
332
  button_prompt_gpt = gr.Button("Prompt ChatGPT")
333
+ button_save_gpt_response = gr.Button("Save ChatGPT response")
334
 
335
+ out_gpt = gr.Textbox(placeholder="ChatGPT output",
336
+ lines=4,
337
+ max_lines=10,
338
+ show_label=False)
339
+ with gr.Row():
340
  button_synth_speech = gr.Button("Synthesize speech")
341
+ button_save_synth_audio = gr.Button("Save synthetic audio")
342
+
343
+ synth_recording = gr.Audio()
344
 
345
+ # Events actions
346
+ button_save_audio_and_trans.click(save_recording_and_meta, inputs=[project_name, mic_recording, out_asr, language_code, spk_age, spk_accent, spk_city, spk_gender, spk_nativity], outputs=[])
347
+ button_transcribe.click(transcribe, inputs=[mic_recording, language_code, whisper_model,whisper_model_type], outputs=out_asr)
348
+ button_prompt_gpt.click(prompt_gpt, inputs=[out_asr, openai_api_key, slider_temp], outputs=out_gpt)
349
+ button_synth_speech.click(synthesize_speech, inputs=[out_gpt, language_code], outputs=synth_recording)
350
 
351
+ radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language_code)
352
+ radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
353
+ with gr.TabItem('Batch audio collection'):
354
 
355
+
356
+ with gr.Accordion(label="Promptset settings"):
357
+ radio_prompts_domain = gr.Dropdown(["Bridge"], label="Select promptset domain", info="")
358
+ radio_promptset_type = gr.Radio(["New promptset generation", "Existing promptset use"], label="Language", value ="Existing promptset use", info="New promptset is generated using ChatGPT")
359
+ var_promptset_size = gr.Textbox(label="Specify number of prompts (min 10, max 200)")
360
+ button_get_prompts = gr.Button("Save settings and get first prompt to record")
361
+
362
+ prompt_text = gr.Textbox(placeholder='Prompt to be recorded',label="Prompt to be read during recording")
363
+ speech_recording = gr.Audio(source="microphone",label="Select 'record from microphone' and read prompt displayed above", type="filepath")
364
+
365
+ radio_prompts_domain.change(fn=change_domain, inputs=radio_prompts_domain, outputs=domain)
366
+ radio_promptset_type.change(fn=change_prompts_type, inputs=radio_promptset_type, outputs=prompts_type)
367
+
368
+ button_save_and_next = gr.Button("Save audio recording and move to the next prompt")
369
+ button_get_prompts.click(get_prompts, inputs=[radio_prompts_domain, radio_promptset_type, var_promptset_size, language_code], outputs = [promptset, prompt_text])
370
+
371
+ button_save_and_next.click(save_recording_and_meta, inputs=[project_name, speech_recording, prompt_text, language_code, spk_age, spk_accent, spk_city, spk_gender, spk_nativity, promptset, prompt_number], outputs=[prompt_text, prompt_number, speech_recording])
372
+
373
  block.launch()
helpers.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dict_origin = {
2
+ "Poland": {
3
+ "cities": [
4
+ "Warsaw",
5
+ "Białystok",
6
+ "Bydgoszcz",
7
+ "Gdańsk",
8
+ "Gorzów Wielkopolski",
9
+ "Katowice",
10
+ "Kielce",
11
+ "Kraków",
12
+ "Lublin",
13
+ "Łódź",
14
+ "Olsztyn",
15
+ "Opole",
16
+ "Poznań",
17
+ "Rzeszów",
18
+ "Szczecin",
19
+ "Toruń",
20
+ "Wrocław"
21
+ ]
22
+ },
23
+ "Ukraine": {
24
+ "cities": [
25
+ "Kyiv",
26
+ "Kharkiv",
27
+ "Dnipro",
28
+ "Odessa",
29
+ "Lviv",
30
+ "Zaporizhzhia",
31
+ "Kryvyi Rih",
32
+ "Mykolaiv",
33
+ "Mariupol",
34
+ "Luhansk",
35
+ "Makiivka",
36
+ "Vinnytsia",
37
+ "Simferopol",
38
+ "Kherson",
39
+ "Poltava",
40
+ "Chernihiv",
41
+ "Cherkasy",
42
+ "Sumy",
43
+ "Zhytomyr",
44
+ "Horlivka"
45
+ ]
46
+ }
47
+ }
requirements.txt CHANGED
@@ -19,4 +19,4 @@ jiwer
19
  pytest
20
  pandera
21
  gradio
22
- gtts
 
19
  pytest
20
  pandera
21
  gradio
22
+ gtts