ggoknar committed on
Commit da4b074
1 Parent(s): d3d83c1

fix repo name

Files changed (1)
  1. app.py +1 -431
app.py CHANGED
@@ -68,7 +68,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 
-repo_id = "coqui/voice-chat-with-lama"
+repo_id = "ylacombe/voice-chat-with-mistral"
 
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
@@ -106,433 +106,3 @@ text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
 )
-
-
-###### COQUI TTS FUNCTIONS ######
-def get_latents(speaker_wav):
-    # create as a function as we can populate here with voice cleanup/filtering
-    (
-        gpt_cond_latent,
-        diffusion_conditioning,
-        speaker_embedding,
-    ) = model.get_conditioning_latents(audio_path=speaker_wav)
-    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-
-
-def format_prompt(message, history):
-    prompt = (
-        "<s>[INST]"
-        + system_message
-        + "[/INST] I understand, I am a Mistral chatbot with speech by Coqui team.</s>"
-    )
-    for user_prompt, bot_response in history:
-        prompt += f"[INST] {user_prompt} [/INST]"
-        prompt += f" {bot_response}</s> "
-    prompt += f"[INST] {message} [/INST]"
-    return prompt
-
-
-def generate(
-    prompt,
-    history,
-    temperature=0.9,
-    max_new_tokens=256,
-    top_p=0.95,
-    repetition_penalty=1.0,
-):
-    temperature = float(temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(top_p)
-
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=42,
-    )
-
-    formatted_prompt = format_prompt(prompt, history)
-
-    try:
-        stream = text_client.text_generation(
-            formatted_prompt,
-            **generate_kwargs,
-            stream=True,
-            details=True,
-            return_full_text=False,
-        )
-        output = ""
-        for response in stream:
-            output += response.token.text
-            yield output
-
-    except Exception as e:
-        if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on mistral client")
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "Unfortunately I am not able to process your request now!"
-        else:
-            print("Unhandled Exception: ", str(e))
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "I do not know what happened but I could not understand you."
-
-    return output
-
-
-def transcribe(wav_path):
-    try:
-        # get first element from whisper_jax and strip it to remove leading and trailing spaces
-        return whisper_client.predict(
-            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
-            "transcribe",  # str in 'Task' Radio component
-            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
-            api_name="/predict",
-        )[0].strip()
-    except:
-        gr.Warning("There was a problem with the Whisper endpoint, telling a joke for you.")
-        return "There was a problem with my voice, tell me a joke"
-
-
-# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
-
-
-def add_text(history, text):
-    history = [] if history is None else history
-    history = history + [(text, None)]
-    return history, gr.update(value="", interactive=False)
-
-
-def add_file(history, file):
-    history = [] if history is None else history
-
-    try:
-        text = transcribe(file)
-        print("Transcribed text:", text)
-    except Exception as e:
-        print(str(e))
-        gr.Warning("There was an issue with transcription, please try writing for now")
-        # Apply a fallback text on error
-        text = "Transcription seems to have failed, please tell me a joke about chickens"
-
-    history = history + [(text, None)]
-    return history, gr.update(value="", interactive=False)
-
-
-## NOTE: not using this as it yields a character each time while we need to feed history to TTS
-def bot(history, system_prompt=""):
-    history = [] if history is None else history
-
-    if system_prompt == "":
-        system_prompt = system_message
-
-    history[-1][1] = ""
-    for character in generate(history[-1][0], history[:-1]):
-        history[-1][1] = character
-        yield history
-
-
-def get_latents(speaker_wav):
-    # Generate speaker embedding and latents for TTS
-    (
-        gpt_cond_latent,
-        diffusion_conditioning,
-        speaker_embedding,
-    ) = model.get_conditioning_latents(audio_path=speaker_wav)
-    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-
-
-latent_map = {}
-latent_map["Female_Voice"] = get_latents("examples/female.wav")
-
-
-def get_voice(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    # Direct version
-    t0 = time.time()
-    out = model.inference(
-        prompt, language, gpt_cond_latent, speaker_embedding, diffusion_conditioning
-    )
-    inference_time = time.time() - t0
-    print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
-    real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
-    print(f"Real-time factor (RTF): {real_time_factor}")
-    wav_filename = f"output_{suffix}.wav"
-    torchaudio.save(wav_filename, torch.tensor(out["wav"]).unsqueeze(0), 24000)
-    return wav_filename
-
-
-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-    # This will create a wave header then append the frame input
-    # It should be first on a streaming wav file
-    # Other frames should not have it (else you will hear some artifacts at each chunk start)
-    wav_buf = io.BytesIO()
-    with wave.open(wav_buf, "wb") as vfout:
-        vfout.setnchannels(channels)
-        vfout.setsampwidth(sample_width)
-        vfout.setframerate(sample_rate)
-        vfout.writeframes(frame_input)
-
-    wav_buf.seek(0)
-    return wav_buf.read()
-
-
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    try:
-        t0 = time.time()
-        chunks = model.inference_stream(
-            prompt,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
-        )
-
-        first_chunk = True
-        for i, chunk in enumerate(chunks):
-            if first_chunk:
-                first_chunk_time = time.time() - t0
-                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-
-            # In case output is required to be multiple voice files
-            # out_file = f'{char}_{i}.wav'
-            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
-            # audio = AudioSegment.from_file(out_file)
-            # audio.export(out_file, format='wav')
-            # return out_file
-            # directly return chunk as bytes for streaming
-            chunk = chunk.detach().cpu().numpy().squeeze()
-            chunk = (chunk * 32767).astype(np.int16)
-
-            yield chunk.tobytes()
-
-    except RuntimeError as e:
-        if "device-side assert" in str(e):
-            # cannot do anything on a CUDA device-side error, need to restart
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
-            gr.Warning("Unhandled Exception encountered, please retry in a minute")
-            print("Cuda device-assert Runtime encountered, need restart")
-
-            # HF Space specific: this error is unrecoverable, need to restart the Space
-            api.restart_space(repo_id=repo_id)
-        else:
-            print("RuntimeError: non device-side assert error:", str(e))
-            gr.Warning("Unhandled Exception encountered, please retry in a minute")
-            return None
-        return None
-    except:
-        return None
-
-
-def get_sentence(history, system_prompt=""):
-    history = [] if history is None else history
-
-    if system_prompt == "":
-        system_prompt = system_message
-
-    history[-1][1] = ""
-
-    mistral_start = time.time()
-    print("Mistral start")
-    sentence_list = []
-    sentence_hash_list = []
-
-    text_to_generate = ""
-    for character in generate(history[-1][0], history[:-1]):
-        history[-1][1] = character
-        # It is coming word by word
-
-        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-
-        if len(text_to_generate) > 1:
-            dif = len(text_to_generate) - len(sentence_list)
-
-            if dif == 1 and len(sentence_list) != 0:
-                continue
-
-            sentence = text_to_generate[len(sentence_list)]
-            # This is expensive, replace with hashing!
-            sentence_hash = hash(sentence)
-
-            if sentence_hash not in sentence_hash_list:
-                sentence_hash_list.append(sentence_hash)
-                sentence_list.append(sentence)
-                print("New Sentence: ", sentence)
-                yield (sentence, history)
-
-    # return that final sentence token
-    # TODO need a counter that one may be replica as before
-    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
-    sentence_hash = hash(last_sentence)
-    if sentence_hash not in sentence_hash_list:
-        sentence_hash_list.append(sentence_hash)
-        sentence_list.append(last_sentence)
-        print("New Sentence: ", last_sentence)
-
-        yield (last_sentence, history)
-
-
-def generate_speech(history):
-    language = "en"
-
-    wav_list = []
-    for sentence, history in get_sentence(history):
-        print(sentence)
-        # Sometimes the prompt </s> token shows up in the output, remove it
-        sentence = sentence.replace("</s>", "")
-        # A fast fix for the last character, may produce weird sounds if it is with text
-        if sentence[-1] in ["!", "?", ".", ","]:
-            # just add a space
-            sentence = sentence[:-1] + " " + sentence[-1]
-        print("Sentence for speech:", sentence)
-
-        try:
-            # generate speech using precomputed latents
-            # This is not streaming but it will be fast
-            # wav = get_voice(sentence,language, latent_map["Female_Voice"], suffix=len(wav_list))
-            audio_stream = get_voice_streaming(
-                sentence, language, latent_map["Female_Voice"], suffix=len(wav_list)
-            )
-            wav_chunks = wave_header_chunk()
-            frame_length = 0
-            for chunk in audio_stream:
-                try:
-                    wav_chunks += chunk
-                    frame_length += len(chunk)
-                except:
-                    # hack to continue playing; sometimes the last chunk is empty, will be fixed on next TTS
-                    continue
-
-            wav_list.append(wav_chunks)
-            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
-
-            # Streaming wait time calculation
-            # audio_length = frame_length / sample_width / frame_rate
-            wait_time = frame_length / 2 / 24000 + 0.5  # plus 500ms
-
-            # for non streaming
-            # wait_time = librosa.get_duration(path=wav)
-
-            wait_time = AUDIO_WAIT_MODIFIER * wait_time
-            print("Sleeping till audio end")
-            time.sleep(wait_time)
-        except RuntimeError as e:
-            if "device-side assert" in str(e):
-                # cannot do anything on a CUDA device-side error, need to restart
-                print(
-                    f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
-                    flush=True,
-                )
-                gr.Warning("Unhandled Exception encountered, please retry in a minute")
-                print("Cuda device-assert Runtime encountered, need restart")
-
-                # HF Space specific: this error is unrecoverable, need to restart the Space
-                api.restart_space(repo_id=repo_id)
-            else:
-                print("RuntimeError: non device-side assert error:", str(e))
-                raise e
-
-    # Every sentence is spoken on autoplay; now produce a concatenated file at the end
-    # requires pip install ffmpeg-python
-
-    # files_to_concat= [ffmpeg.input(w) for w in wav_list]
-    # combined_file_name="combined.wav"
-    # ffmpeg.concat(*files_to_concat,v=0, a=1).output(combined_file_name).run(overwrite_output=True)
-    # final_audio.update(value=combined_file_name, visible=True)
-    # yield (combined_file_name, history)
-
-
-css = """
-.bot .chatbot p {
-    overflow: hidden; /* Ensures the content is not revealed until the animation */
-    //border-right: .15em solid orange; /* The typewriter cursor */
-    white-space: nowrap; /* Keeps the content on a single line */
-    margin: 0 auto; /* Gives that scrolling effect as the typing happens */
-    letter-spacing: .15em; /* Adjust as needed */
-    animation:
-        typing 3.5s steps(40, end),
-        blink-caret .75s step-end infinite;
-}
-
-/* The typing effect */
-@keyframes typing {
-    from { width: 0 }
-    to { width: 100% }
-}
-
-/* The typewriter cursor effect */
-@keyframes blink-caret {
-    from, to { border-color: transparent }
-    50% { border-color: orange; }
-}
-"""
-
-with gr.Blocks(title=title) as demo:
-    gr.Markdown(DESCRIPTION)
-
-    chatbot = gr.Chatbot(
-        [],
-        elem_id="chatbot",
-        avatar_images=("examples/lama.jpeg", "examples/lama2.jpeg"),
-        bubble_full_width=False,
-    )
-
-    with gr.Row():
-        txt = gr.Textbox(
-            scale=3,
-            show_label=False,
-            placeholder="Enter text and press enter, or speak to your microphone",
-            container=False,
-        )
-        txt_btn = gr.Button(value="Submit text", scale=1)
-        btn = gr.Audio(source="microphone", type="filepath", scale=4)
-
-    with gr.Row():
-        audio = gr.Audio(
-            label="Generated audio response",
-            streaming=False,
-            autoplay=False,
-            interactive=True,
-            show_label=True,
-        )
-        # TODO add a second audio that plays whole sentences (for mobile especially)
-        # final_audio = gr.Audio(label="Final audio response", streaming=False, autoplay=False, interactive=False, show_label=True, visible=False)
-
-    clear_btn = gr.ClearButton([chatbot, audio])
-
-    txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech, chatbot, [audio, chatbot]
-    )
-
-    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-
-    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech, chatbot, [audio, chatbot]
-    )
-
-    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-
-    file_msg = btn.stop_recording(
-        add_file, [chatbot, btn], [chatbot, txt], queue=False
-    ).then(generate_speech, chatbot, [audio, chatbot])
-
-    gr.Markdown(
-        """
-This Space demonstrates how to speak to a chatbot, based solely on open-source models.
-It relies on 3 models:
-1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
-3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
-
-Note:
-- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml"""
-    )
-demo.queue()
-demo.launch(debug=True)
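
The one-line change above matters because `repo_id` is exactly what `api.restart_space()` receives when XTTS hits a CUDA device-side assert in `get_voice_streaming` / `generate_speech`; if it names a repository other than this Space, the restart call cannot recover the app. Below is a minimal sketch of that restart-on-unrecoverable-error pattern, assuming an `HF_TOKEN` with write access to the Space; the `synthesize_with_restart` helper is illustrative only and is not part of app.py:

```python
import os

from huggingface_hub import HfApi

HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=HF_TOKEN)
# Must match the Space's own repository, otherwise restart_space() cannot recover it
repo_id = "ylacombe/voice-chat-with-mistral"


def synthesize_with_restart(tts_fn, prompt):
    """Run a TTS callable; restart the Space if CUDA is left in an unusable state."""
    try:
        return tts_fn(prompt)
    except RuntimeError as e:
        if "device-side assert" in str(e):
            # A device-side assert poisons the CUDA context; on a Space the only
            # recovery is a full restart of the hosting repository.
            print(f"Unrecoverable CUDA error for prompt: {prompt}", flush=True)
            api.restart_space(repo_id=repo_id)
        raise
```

With the old `coqui/voice-chat-with-lama` value, the restart would have targeted a repository this Space does not control, so a hung GPU would have stayed hung.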
 