mj-new committed on
Commit 0147fc2
Parent: 0587641

working audio file saving

Files changed (3)
  1. .gitignore +2 -0
  2. Temp.mp3 +0 -0
  3. app.py +159 -34
.gitignore CHANGED
@@ -1 +1,3 @@
 .python-version
+data_local
+run.sh
Temp.mp3 ADDED
Binary file (39.6 kB).
 
app.py CHANGED
@@ -2,6 +2,21 @@ import gradio as gr
 import whisper
 import numpy as np
 import openai
+import os
+from gtts import gTTS
+import json
+import hashlib
+import random
+import string
+import uuid
+from datetime import date, datetime
+from huggingface_hub import Repository, upload_file
+import shutil
+
+HF_TOKEN_WRITE = os.environ.get("HF_TOKEN_WRITE")
+print("HF_TOKEN_WRITE", HF_TOKEN_WRITE)
+today = date.today()
+today_ymd = today.strftime("%Y%m%d")
 
 def greet(name):
     return "Hello " + name + "!!"
@@ -12,23 +27,118 @@ with open('app.css','r') as f:
 markdown="""
 # Polish ASR BIGOS workspace
 """
+
+# TODO move to config
+WORKING_DATASET_REPO_URL = "https://huggingface.co/datasets/goodmike31/working-db"
+REPO_NAME = "goodmike31/working-db"
+REPOSITORY_DIR = "data"
+LOCAL_DIR = "data_local"
+os.makedirs(LOCAL_DIR, exist_ok=True)
+
+def dump_json(thing, file):
+    with open(file, 'w+', encoding="utf8") as f:
+        json.dump(thing, f)
+
+def get_unique_name():
+    return ''.join([random.choice(string.ascii_letters
+                    + string.digits) for n in range(32)])
+
+def save_recording_and_meta(project_name, recording, transcript, language):
+    #, name, age, gender):
+    # TODO save user data in the next version
+
+    speaker_metadata = {}
+    speaker_metadata['gender'] = "test"  # gender if gender!=GENDER[0] else ''
+    speaker_metadata['age'] = "test"  # age if age !='' else ''
+    speaker_metadata['accent'] = "test"  # accent if accent!='' else ''
+
+    lang_id = language.lower()
+
+    # TODO get ISO 639-1 codes
+    transcript = transcript.strip()
+
+    SAVE_ROOT_DIR = os.path.join(LOCAL_DIR, project_name, today_ymd)
+
+    SAVE_DIR_AUDIO = os.path.join(SAVE_ROOT_DIR, "audio")
+    SAVE_DIR_META = os.path.join(SAVE_ROOT_DIR, "meta")
+    os.makedirs(SAVE_DIR_AUDIO, exist_ok=True)
+    os.makedirs(SAVE_DIR_META, exist_ok=True)
+
+    # Write audio to file
+    #audio_name = get_unique_name()
+
+    uuid_name = str(uuid.uuid4())
+    audio_fn = uuid_name + ".wav"
+    audio_output_fp = os.path.join(SAVE_DIR_AUDIO, audio_fn)
+
+    print(f"Saving {recording} as {audio_output_fp}")
+    shutil.copy2(recording, audio_output_fp)
+
+    # Write metadata.json to file
+    meta_fn = uuid_name + 'metadata.jsonl'
+    json_file_path = os.path.join(SAVE_DIR_META, meta_fn)
+
+    now = datetime.now()
+    timestamp_str = now.strftime("%d/%m/%Y %H:%M:%S")
+    metadata = {'id': uuid_name, 'audio_file': audio_fn,
+                'language_name': language, 'language_id': lang_id,
+                'transcript': transcript, 'age': speaker_metadata['age'],
+                'gender': speaker_metadata['gender'], 'accent': speaker_metadata['accent'],
+                "date": today_ymd, "timestamp": timestamp_str}
+
+    dump_json(metadata, json_file_path)
+
+    # Upload the audio file and metadata using the hub's upload_file
+    # Upload the audio
+    repo_audio_path = os.path.join(REPOSITORY_DIR, project_name, today_ymd, "audio", audio_fn)
+
+    _ = upload_file(path_or_fileobj = audio_output_fp,
+                    path_in_repo = repo_audio_path,
+                    repo_id = REPO_NAME,
+                    repo_type = 'dataset',
+                    token = HF_TOKEN_WRITE
+                    )
+
+    # Upload the metadata
+    repo_json_path = os.path.join(REPOSITORY_DIR, project_name, today_ymd, "meta", meta_fn)
+    _ = upload_file(path_or_fileobj = json_file_path,
+                    path_in_repo = repo_json_path,
+                    repo_id = REPO_NAME,
+                    repo_type = 'dataset',
+                    token = HF_TOKEN_WRITE
+                    )
+
+    print(f"Recording {audio_fn} and meta file {meta_fn} successfully saved to repo!")
+    return
+
 def whisper_model_change(radio_whisper_model):
     whisper_model = whisper.load_model(radio_whisper_model)
     return(whisper_model)
 
-def prompt_gpt(input_text):
+def prompt_gpt(input_text, api_key, temperature):
+    #, role, template_prompt, template_answer):
+    # TODO add option to specify instruction
+    openai.api_key = api_key
+
+    # TODO add specific message for specific role
+    system_role_message = "You are a helpful assistant"
+
     messages = [
-        {"role": "system", "content": "You are a helpful assistant."}]
+        {"role": "system", "content": system_role_message}]
 
     if input_text:
         messages.append(
             {"role": "user", "content": input_text},
         )
+
     chat_completion = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo", messages=messages
+        model="gpt-3.5-turbo",
+        messages=messages,
+        temperature=temperature
     )
 
     reply = chat_completion.choices[0].message.content
+    # TODO save chat completion for future reuse
     return reply
 
 def process_pipeline(audio):
@@ -58,9 +168,9 @@ def init_whisper_model(whisper_model_type):
     whisper_model = whisper.load_model(whisper_model_type)
     return whisper_model
 
-def synthesize_speech(text):
-    audioobj = gTTS(text = out_result,
-                    lang = lang,
+def synthesize_speech(text, language):
+    audioobj = gTTS(text = text,
+                    lang = language,
                     slow = False)
 
     audioobj.save("Temp.mp3")
@@ -71,8 +181,11 @@ with block:
 
     #state variables
     language = gr.State("en")
+    temperature = gr.State(0)
     whisper_model_type = gr.State("base")
     whisper_model = gr.State()
+    api_key = gr.State()
+    project_name = gr.State("voicebot")  # TODO add list of projects to organize saved data
 
     # state handling functions
    def change_language(choice):
@@ -96,36 +209,48 @@ with block:
         return [whisper_model_type, whisper_model]
 
     gr.Markdown(markdown)
+
     with gr.Tabs():
-        with gr.TabItem('Voicebot playground'):
+        with gr.Row():
+            with gr.TabItem('Voicebot playground'):
+                with gr.Accordion(label="Settings"):
+                    gr.HTML("<p class=\"apikey\">OpenAI API Key:</p>")
+                    # API key textbox (password-style)
+                    api_key = gr.Textbox(label="", elem_id="pw")
+                    slider_temp = gr.Slider(minimum=0, maximum=2, step=0.2, label="ChatGPT temperature")
+                    radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
+                    #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but cost money")
+                    #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for the specific service")
+                    radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are more accurate, but slower. Default - base")
                 with gr.Box():
-                    gr.HTML("<p class=\"apikey\">API Key:</p>")
-                    # API key textbox (password-style)
-                    api_key = gr.Textbox(label="", elem_id="pw")
-
-                    radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
-                    #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
-                    #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
-                    radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")
-
-                    mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
-
-                    out_asr = gr.Textbox(placeholder="ASR output",
-                                         lines=5,
-                                         max_lines=10,
-                                         show_label=False)
-                    out_gpt = gr.Textbox(placeholder="ChatGPT output",
-                                         lines=10,
-                                         max_lines=25,
-                                         show_label=False)
-
-                    button_transcribe = gr.Button("Transcribe")
-                    button_prompt_gpt = gr.Button("Prompt ChatGPT")
-
-                    button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)
-                    button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
+                    with gr.Row():
+                        mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
+
+                    button_transcribe = gr.Button("Transcribe speech")
+
+                    button_save_audio_and_trans = gr.Button("Save recording and meta")
+
+                    out_asr = gr.Textbox(placeholder="ASR output",
+                                         lines=2,
+                                         max_lines=5,
+                                         show_label=False)
+
+                    button_prompt_gpt = gr.Button("Prompt ChatGPT")
+
+                    out_gpt = gr.Textbox(placeholder="ChatGPT output",
+                                         lines=4,
+                                         max_lines=10,
+                                         show_label=False)
+                    button_synth_speech = gr.Button("Synthesize speech")
+                    synth_recording = gr.Audio()
+
+                    # Events actions
+                    button_save_audio_and_trans.click(save_recording_and_meta, inputs=[project_name, mic_recording, out_asr, language], outputs=[])
+                    button_transcribe.click(transcribe, inputs=[mic_recording, language, whisper_model, whisper_model_type], outputs=out_asr)
+                    button_prompt_gpt.click(prompt_gpt, inputs=[out_asr, api_key, slider_temp], outputs=out_gpt)
+                    button_synth_speech.click(synthesize_speech, inputs=[out_gpt, language], outputs=synth_recording)
 
-        radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
-        radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
+                    radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
+                    radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 
 block.launch()
 
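For reference, a minimal sketch of the metadata record that the new save_recording_and_meta writes next to each recording. The field values below are illustrative placeholders, not output from a real session; the real values come from the UI state:

import json
import uuid
from datetime import date, datetime

# Rebuild the record shape used by save_recording_and_meta above
# (all values here are made-up examples).
uuid_name = str(uuid.uuid4())
metadata = {'id': uuid_name, 'audio_file': uuid_name + ".wav",
            'language_name': "English", 'language_id': "english",  # lowercased name; ISO 639-1 mapping is still a TODO in app.py
            'transcript': "hello world", 'age': "test",
            'gender': "test", 'accent': "test",
            'date': date.today().strftime("%Y%m%d"),
            'timestamp': datetime.now().strftime("%d/%m/%Y %H:%M:%S")}
print(json.dumps(metadata, indent=2))
# Saved locally under data_local/<project>/<YYYYMMDD>/meta/ and uploaded to the
# goodmike31/working-db dataset repo under data/, mirroring the audio/ tree.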