gorkemgoknar committed
Commit
6af5041
1 Parent(s): 168536f

add some comments and remove unnecessary comments

Files changed (1)
1. app.py +99 -94
app.py CHANGED
@@ -120,6 +120,7 @@ text_client = InferenceClient(


###### COQUI TTS FUNCTIONS ######
def get_latents(speaker_wav):
    # create as function as we can populate here with voice cleanup/filtering
    (
@@ -129,7 +130,88 @@ def get_latents(speaker_wav):
    ) = model.get_conditioning_latents(audio_path=speaker_wav)
    return gpt_cond_latent, diffusion_conditioning, speaker_embedding

def format_prompt(message, history):
    prompt = (
        "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
@@ -140,7 +222,6 @@ def format_prompt(message, history):
    prompt += f"[INST] {message} [/INST]"
    return prompt
 
-
def generate(
    prompt,
    history,
@@ -197,6 +278,8 @@ def generate(
    return output


def transcribe(wav_path):
    try:
        # get result from whisper and strip it to delete begin and end space
@@ -212,13 +295,13 @@ def transcribe(wav_path):

# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.

-
def add_text(history, text):
    history = [] if history is None else history
    history = history + [(text, None)]
    return history, gr.update(value="", interactive=False)

-
def add_file(history, file):
    history = [] if history is None else history

@@ -247,90 +330,8 @@ def bot(history, system_prompt=""):
        history[-1][1] = character
        yield history

-
- def get_latents(speaker_wav):
-     # Generate speaker embedding and latents for TTS
-     (
-         gpt_cond_latent,
-         diffusion_conditioning,
-         speaker_embedding,
-     ) = model.get_conditioning_latents(audio_path=speaker_wav)
-     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-
-
- latent_map = {}
- latent_map["Female_Voice"] = get_latents("examples/female.wav")
-
-
-
- def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-     # This will create a wave header then append the frame input
-     # It should be first on a streaming wav file
-     # Other frames should not have it (else you will hear artifacts at each chunk start)
-     wav_buf = io.BytesIO()
-     with wave.open(wav_buf, "wb") as vfout:
-         vfout.setnchannels(channels)
-         vfout.setsampwidth(sample_width)
-         vfout.setframerate(sample_rate)
-         vfout.writeframes(frame_input)
-
-     wav_buf.seek(0)
-     return wav_buf.read()
-
-
- def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-     gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-     try:
-         t0 = time.time()
-         chunks = model.inference_stream(
-             prompt,
-             language,
-             gpt_cond_latent,
-             speaker_embedding,
-         )
-
-         first_chunk = True
-         for i, chunk in enumerate(chunks):
-             if first_chunk:
-                 first_chunk_time = time.time() - t0
-                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                 first_chunk = False
-             print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-             # In case output is required to be multiple voice files
-             # out_file = f'{char}_{i}.wav'
-             # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-             # audio = AudioSegment.from_file(out_file)
-             # audio.export(out_file, format='wav')
-             # return out_file
-             # directly return chunk as bytes for streaming
-             chunk = chunk.detach().cpu().numpy().squeeze()
-             chunk = (chunk * 32767).astype(np.int16)
-
-             yield chunk.tobytes()
-
-     except RuntimeError as e:
-         if "device-side assert" in str(e):
-             # cannot do anything on a CUDA device-side error, need to restart
-             print(
-                 f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                 flush=True,
-             )
-             gr.Warning("Unhandled Exception encountered, please retry in a minute")
-             print("Cuda device-assert Runtime encountered, need restart")
-
-             # HF Space specific: this error is unrecoverable, the space needs a restart
-             api.restart_space(repo_id=repo_id)
-         else:
-             print("RuntimeError: non device-side assert error:", str(e))
-             # Does not require a warning; happens on empty chunk and at end
-             ###gr.Warning("Unhandled Exception encountered, please retry in a minute")
-             return None
-         return None
-     except:
-         return None
-
-
def get_sentence(history, system_prompt=""):
    history = [["", None]] if history is None else history

@@ -368,7 +369,6 @@ def get_sentence(history, system_prompt=""):
        yield (sentence, history)

    # return that final sentence token
-     # TODO need a counter that one may be replica as before
    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
    sentence_hash = hash(last_sentence)
    if sentence_hash not in sentence_hash_list:
@@ -378,7 +378,8 @@ def get_sentence(history, system_prompt=""):

    yield (last_sentence, history)

-

def generate_speech(history):
    language = "en"

@@ -402,9 +403,8 @@ def generate_speech(history):
        print("Sentence for speech:", sentence)

        try:
-             # generate speech using precomputed latents
-             # This is not streaming but it will be fast
-             if len(sentence) > 250:
                gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
                # should not generate voice it will hit token limit
                # It should not generate audio for it
@@ -413,6 +413,8 @@ def generate_speech(history):
            audio_stream = get_voice_streaming(
                sentence, language, latent_map["Female_Voice"]
            )
            if audio_stream is not None:
                wav_chunks = wave_header_chunk()
                frame_length = 0
@@ -485,7 +487,8 @@ def generate_speech(history):
    yield (gr.Audio.update(value=None, autoplay=False), history)
    yield (gr.Audio.update(value=outfile, autoplay=False), history)

-

with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)

@@ -547,7 +550,9 @@ It relies on 3 models:
3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.

Note:
- - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml"""
)
demo.queue()
demo.launch(debug=True)
 


###### COQUI TTS FUNCTIONS ######
+
def get_latents(speaker_wav):
    # create as function as we can populate here with voice cleanup/filtering
    (

    ) = model.get_conditioning_latents(audio_path=speaker_wav)
    return gpt_cond_latent, diffusion_conditioning, speaker_embedding

+ def get_latents(speaker_wav):
+     # Generate speaker embedding and latents for TTS
+     (
+         gpt_cond_latent,
+         diffusion_conditioning,
+         speaker_embedding,
+     ) = model.get_conditioning_latents(audio_path=speaker_wav)
+     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+
+
+ latent_map = {}
+ latent_map["Female_Voice"] = get_latents("examples/female.wav")
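+ # latent_map caches the speaker latents once at startup so the reference wav
+ # is not re-encoded for every generated sentence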
+
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+     # This will create a wave header then append the frame input
+     # It should be first on a streaming wav file
+     # Other frames should not have it (else you will hear artifacts at each chunk start)
+     wav_buf = io.BytesIO()
+     with wave.open(wav_buf, "wb") as vfout:
+         vfout.setnchannels(channels)
+         vfout.setsampwidth(sample_width)
+         vfout.setframerate(sample_rate)
+         vfout.writeframes(frame_input)
+
+     wav_buf.seek(0)
+     return wav_buf.read()
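+ # Rough usage sketch: generate_speech below starts its stream with wave_header_chunk()
+ # and then appends the raw int16 frames yielded by get_voice_streaming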
+
+
+ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+     gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+     try:
+         t0 = time.time()
+         chunks = model.inference_stream(
+             prompt,
+             language,
+             gpt_cond_latent,
+             speaker_embedding,
+         )
+
+         first_chunk = True
+         for i, chunk in enumerate(chunks):
+             if first_chunk:
+                 first_chunk_time = time.time() - t0
+                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                 first_chunk = False
+             print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+
+             # In case output is required to be multiple voice files
+             # out_file = f'{char}_{i}.wav'
+             # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+             # audio = AudioSegment.from_file(out_file)
+             # audio.export(out_file, format='wav')
+             # return out_file
+             # directly return chunk as bytes for streaming
+             chunk = chunk.detach().cpu().numpy().squeeze()
+             chunk = (chunk * 32767).astype(np.int16)
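+             # note: chunks come out of the model as float audio (roughly in [-1, 1]);
+             # scaling by 32767 converts them to the 16-bit PCM expected by the
+             # wav header (sample_width=2) above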
+
+             yield chunk.tobytes()
+
+     except RuntimeError as e:
+         if "device-side assert" in str(e):
+             # cannot do anything on a CUDA device-side error, need to restart
+             print(
+                 f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
+                 flush=True,
+             )
+             gr.Warning("Unhandled Exception encountered, please retry in a minute")
+             print("Cuda device-assert Runtime encountered, need restart")
+
+             # HF Space specific: this error is unrecoverable, the space needs a restart
+             api.restart_space(repo_id=repo_id)
+         else:
+             print("RuntimeError: non device-side assert error:", str(e))
+             # Does not require a warning; happens on empty chunk and at end
+             ###gr.Warning("Unhandled Exception encountered, please retry in a minute")
+             return None
+         return None
+     except:
+         return None
+
+ ###### MISTRAL FUNCTIONS ######
+
def format_prompt(message, history):
    prompt = (
        "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"

    prompt += f"[INST] {message} [/INST]"
    return prompt

def generate(
    prompt,
    history,

    return output


+ ###### WHISPER FUNCTIONS ######
+
def transcribe(wav_path):
    try:
        # get result from whisper and strip it to delete begin and end space
 

# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.

+ # Will be triggered on text submit (will send to generate_speech)
def add_text(history, text):
    history = [] if history is None else history
    history = history + [(text, None)]
    return history, gr.update(value="", interactive=False)

+ # Will be triggered on voice submit (will transcribe and send to generate_speech)
def add_file(history, file):
    history = [] if history is None else history

        history[-1][1] = character
        yield history

+ #### MISTRAL STREAMING Sentence splitter ####
+
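+ # get_sentence streams tokens from the Mistral endpoint and yields complete
+ # sentences (split with nltk.sent_tokenize) so speech synthesis can start
+ # before the full answer has been generated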

def get_sentence(history, system_prompt=""):
    history = [["", None]] if history is None else history

 
        yield (sentence, history)

    # return that final sentence token
    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
    sentence_hash = hash(last_sentence)
    if sentence_hash not in sentence_hash_list:
 

    yield (last_sentence, history)

+ #### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
+
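+ # generate_speech consumes get_sentence and voices each sentence with XTTS via
+ # get_voice_streaming, appending the PCM chunks behind a single wave header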
def generate_speech(history):
    language = "en"

 
        print("Sentence for speech:", sentence)

        try:
+             # TODO: this will be better handled in the future using textwrap
+             if len(sentence) > 300:
                gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
                # should not generate voice, it will hit token limit
                # It should not generate audio for it
 
            audio_stream = get_voice_streaming(
                sentence, language, latent_map["Female_Voice"]
            )
+             # XTTS actually streams its response, but we play audio sentence by sentence
+             # If you want direct XTTS voice streaming (send each chunk to the player) you may set the DIRECT_STREAM=1 environment variable
            if audio_stream is not None:
                wav_chunks = wave_header_chunk()
                frame_length = 0
 
    yield (gr.Audio.update(value=None, autoplay=False), history)
    yield (gr.Audio.update(value=outfile, autoplay=False), history)

+ #### GRADIO INTERFACE ####
+
with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)

 
3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.

Note:
+ - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
+ - Responses generated by the chat model should not be assumed correct, as this is a demonstration example only
+ - iOS (iPhone/iPad) devices may not play the voice output because autoplay is disabled by the vendor on these devices"""
)
demo.queue()
demo.launch(debug=True)