ylacombe (HF staff) committed
Commit ac1e9e7 (1 parent: 3029bff)

Improve UX a bit and switch back to Whisper large v2

Files changed (1):
  1. app.py +19 -36
app.py CHANGED
@@ -19,7 +19,6 @@ import datetime
 
 from scipy.io.wavfile import write
 from pydub import AudioSegment
-import ffmpeg
 
 import re
 import io, wave
@@ -57,7 +56,7 @@ model.load_checkpoint(
     checkpoint_path=os.path.join(model_path, "model.pth"),
     vocab_path=os.path.join(model_path, "vocab.json"),
     eval=True,
-    use_deepspeed=True,
+    use_deepspeed=False,  # TODO: replace by True
 )
 model.cuda()
 print("Done loading TTS")
@@ -113,10 +112,7 @@ from gradio_client import Client
 from huggingface_hub import InferenceClient
 
 WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 30))
-# This client is down
-# whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
-# Replacement whisper client, it may be time limited
-whisper_client = Client("https://sanchit-gandhi-whisper-jax.hf.space")
+whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
@@ -203,13 +199,12 @@ def generate(
 
 def transcribe(wav_path):
     try:
-        # get first element from whisper_jax and strip it to delete begin and end space
+        # get result from whisper and strip it to delete begin and end space
         return whisper_client.predict(
-            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
-            "transcribe",  # str in 'Task' Radio component
-            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
-            api_name="/predict",
-        )[0].strip()
+            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
+            "transcribe",  # str in 'Task' Radio component
+            api_name="/predict"
+        ).strip()
     except:
         gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
         return "There was a problem with my voice, tell me joke"
@@ -242,8 +237,8 @@ def add_file(history, file):
 
 ##NOTE: not using this as it yields a chacter each time while we need to feed history to TTS
 def bot(history, system_prompt=""):
-    history = [] if history is None else history
-
+    history = [["", None]] if history is None else history
+
     if system_prompt == "":
         system_prompt = system_message
 
@@ -267,21 +262,6 @@ latent_map = {}
 latent_map["Female_Voice"] = get_latents("examples/female.wav")
 
 
-def get_voice(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    # Direct version
-    t0 = time.time()
-    out = model.inference(
-        prompt, language, gpt_cond_latent, speaker_embedding, diffusion_conditioning
-    )
-    inference_time = time.time() - t0
-    print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
-    real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
-    print(f"Real-time factor (RTF): {real_time_factor}")
-    wav_filename = f"output_{suffix}.wav"
-    torchaudio.save(wav_filename, torch.tensor(out["wav"]).unsqueeze(0), 24000)
-    return wav_filename
-
 
 def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
     # This will create a wave header then append the frame input
@@ -333,7 +313,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         if "device-side assert" in str(e):
             # cannot do anything on cuda device side error, need tor estart
             print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
+                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
                 flush=True,
             )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
@@ -353,10 +333,12 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
 
 def get_sentence(history, system_prompt=""):
     history = [["", None]] if history is None else history
-    print(history)
+
     if system_prompt == "":
         system_prompt = system_message
 
+    history[-1][1] = ""
+
     mistral_start = time.time()
     print("Mistral start")
     sentence_list = []
@@ -422,8 +404,8 @@ def generate_speech(history):
         try:
             # generate speech using precomputed latents
             # This is not streaming but it will be fast
-            # wav = get_voice(sentence,language, latent_map["Female_Voice"], suffix=len(wav_list))
             if len(sentence) > 250:
+                gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
                 # should not generate voice it will hit token limit
                 # It should not generate audio for it
                 audio_stream = None
@@ -520,6 +502,7 @@ with gr.Blocks(title=title) as demo:
             show_label=False,
             placeholder="Enter text and press enter, or speak to your microphone",
             container=False,
+            interactive=True,
         )
         txt_btn = gr.Button(value="Submit text", scale=1)
         btn = gr.Audio(source="microphone", type="filepath", scale=4)
@@ -536,7 +519,7 @@ with gr.Blocks(title=title) as demo:
     # final_audio = gr.Audio(label="Final audio response", streaming=False, autoplay=False, interactive=False,show_label=True, visible=False)
 
     clear_btn = gr.ClearButton([chatbot, audio])
-
+
     txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
         generate_speech, chatbot, [audio, chatbot]
     )
@@ -553,13 +536,13 @@ with gr.Blocks(title=title) as demo:
         add_file, [chatbot, btn], [chatbot, txt], queue=False
     ).then(generate_speech, chatbot, [audio, chatbot])
 
-    file_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+    file_msg.then(lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), None, [txt, btn], queue=False)
 
     gr.Markdown(
         """
 This Space demonstrates how to speak to a chatbot, based solely on open-source models.
 It relies on 3 models:
-1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
+1. [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
 2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model, the actual chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
 3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
@@ -567,4 +550,4 @@ Note:
 - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml"""
     )
     demo.queue()
-    demo.launch(debug=True, share=True)
+    demo.launch(debug=True)
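
For context, the three models listed in the in-app description chain together as transcribe → generate → synthesize. The sketch below is only a minimal illustration assembled from the calls visible in this diff (the whisper-large-v2 Space reached via gradio_client, Mistral-7B-Instruct via huggingface_hub's InferenceClient, and the locally loaded XTTS model); the reply_to_recording helper, the prompt template, and the way the XTTS model and latents are passed in are simplified assumptions, not the app's exact code.

# Minimal sketch of the ASR -> LLM -> TTS chain; names here are illustrative.
from gradio_client import Client
from huggingface_hub import InferenceClient

whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
text_client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1", timeout=30)

def reply_to_recording(wav_path, xtts_model, latents, language="en"):
    # 1. ASR: transcribe the recorded audio with the remote Whisper Space.
    user_text = whisper_client.predict(
        wav_path, "transcribe", api_name="/predict"
    ).strip()

    # 2. LLM: ask Mistral for a reply (prompt template simplified here).
    reply = text_client.text_generation(
        f"<s>[INST] {user_text} [/INST]", max_new_tokens=256
    )

    # 3. TTS: synthesize the reply with the locally hosted XTTS model, using
    #    precomputed speaker latents (as returned by get_latents in app.py).
    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latents
    out = xtts_model.inference(
        reply, language, gpt_cond_latent, speaker_embedding, diffusion_conditioning
    )
    return user_text, reply, out["wav"]  # 24 kHz waveform per the app's settings

In the Space itself these steps are wired through Gradio events (get_sentence, get_voice_streaming, generate_speech) so the answer is spoken sentence by sentence, but the call pattern is the same.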