Eniola Alese committed on
Commit
4312a67
1 Parent(s): f6441b4

add app files

.DS_Store ADDED
Binary file (6.15 kB)
 
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Voice Chat With Llm
-emoji: 🌖
+emoji: 👾
 colorFrom: purple
 colorTo: gray
 sdk: gradio
app.py ADDED
@@ -0,0 +1,180 @@
'''
Pipeline overview (Step 1 -> Step 2 -> Step 3 -> Step 4):

Step 1: Set Up Environment
  - Import Python libraries
  - Initialize models: Whisper, Mistral, XTTS

Step 2: Set Up Gradio Interface
  - Define interface components
  - Configure audio and text interaction
  - Launch the interface

Step 3: Speech-to-Text & Language Model Processing
  - Transcribe audio to text using the Faster Whisper ASR model
  - The transcribed text is added to the chatbot's history
  - The Mistral LLM processes the chatbot history to generate a response

Step 4: Text-to-Speech Output
  - The XTTS model generates the spoken response from the LLM's text response
'''

###### Set Up Environment ######

import os
# Set the CUDA environment variable and install llama-cpp-python.
# llama-cpp-python is a Python binding for the llama.cpp library, which enables LLM inference in pure C/C++.
os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
os.system('python -m unidic download')
os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')


# Third-party library imports
from faster_whisper import WhisperModel
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

# Local imports
from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk

# Load Whisper ASR model
print("Loading Whisper ASR")
whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")

# Load Mistral LLM
print("Loading Mistral LLM")
llm_model_name = "mistral-7b-instruct-v0.1.Q5_K_M.gguf"
hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename=llm_model_name)
mistral_llm = Llama(model_path=f"./{llm_model_name}", n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)

# Load XTTS Model
print("Loading XTTS model")
os.environ["COQUI_TOS_AGREED"] = "1"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(tts_model_name)
tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
config = XttsConfig()
config.load_json(os.path.join(tts_model_path, "config.json"))
xtts_model = Xtts.init_from_config(config)
xtts_model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(tts_model_path, "model.pth"),
    vocab_path=os.path.join(tts_model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
xtts_model.cuda()

###### Set up Gradio Interface ######

with gr.Blocks(title="Voice chat with LLM") as demo:
    DESCRIPTION = """# Voice chat with LLM"""
    gr.Markdown(DESCRIPTION)

    # Define chatbot component
    chatbot = gr.Chatbot(
        value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
        elem_id="chatbot",
        avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
        bubble_full_width=False,
    )

    # Define chatbot voice component
    VOICES = ["female", "male"]
    with gr.Row():
        chatbot_voice = gr.Dropdown(
            label="Voice of the Chatbot",
            info="Which voice the chatbot should speak with",
            choices=VOICES,
            max_choices=1,
            value=VOICES[0],
        )

    # Define text and audio record input components
    with gr.Row():
        txt_box = gr.Textbox(
            scale=3,
            show_label=False,
            placeholder="Enter text and press enter, or speak to your microphone",
            container=False,
            interactive=True,
        )
        audio_record = gr.Audio(source="microphone", type="filepath", scale=4)

    # Define generated audio playback component
    with gr.Row():
        sentence = gr.Textbox(visible=False)
        audio_playback = gr.Audio(
            value=None,
            label="Generated audio response",
            streaming=True,
            autoplay=True,
            interactive=False,
            show_label=True,
        )

    # Triggered on text submit (the result is then sent to generate_speech)
    def add_text(chatbot_history, text):
        chatbot_history = [] if chatbot_history is None else chatbot_history
        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)

    # Triggered on voice submit (transcribes the audio, then sends to generate_speech)
    def add_audio(chatbot_history, audio):
        chatbot_history = [] if chatbot_history is None else chatbot_history
        # Get the result from Whisper and strip leading and trailing whitespace
        response, _ = whisper_model.transcribe(audio)
        text = list(response)[0].text.strip()
        print("Transcribed text:", text)
        chatbot_history = chatbot_history + [(text, None)]
        return chatbot_history, gr.update(value="", interactive=False)

    def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
        # Start by yielding an initial empty audio to set up autoplay
        yield ("", chatbot_history, wave_header_chunk())

        # Helper function to handle the speech generation and yielding process
        def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
            if sentence != "":
                print("Processing sentence")
                generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=config.languages, return_as_byte=True)
                if generated_speech is not None:
                    _, audio_dict = generated_speech
                    yield (sentence, chatbot_history, audio_dict["value"])

        if initial_greeting:
            # Process only the initial greeting if specified
            for _, sentence in chatbot_history:
                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
        else:
            # Continuously get and process sentences from a generator function
            for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
                print("Inserting sentence to queue")
                yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)

    txt_msg = txt_box.submit(
        fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
    ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])

    txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)

    audio_msg = audio_record.stop_recording(
        fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
    ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])

    audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)

    FOOTNOTE = """
This Space demonstrates how to talk to an LLM chatbot built solely on openly accessible models.
It relies on the following models:
- Speech to Text: [Faster-Whisper](https://github.com/SYSTRAN/faster-whisper/), an ASR model, to transcribe recorded audio to text.
- LLM: [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chatbot model.
- Text to Speech: [Coqui's XTTS V2](https://huggingface.co/spaces/coqui/xtts), a multilingual TTS model, to generate the chatbot's voice.

Note:
- Responses generated by the chat model should not be assumed correct or taken seriously; this is a demonstration example only.
- iOS (iPhone/iPad) devices may not play the voice because the vendor disables audio autoplay on these devices."""
    gr.Markdown(FOOTNOTE)
    demo.load(block=None, fn=generate_speech, inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
demo.queue().launch(debug=True, share=True)
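
For quick debugging outside the UI, the same ASR -> LLM -> TTS chain can be exercised directly with the objects created above. The sketch below is illustrative only and is not part of the commit: the file name smoke_test.py is hypothetical, examples/female.wav is simply reused as stand-in input audio, and whisper_model, mistral_llm, xtts_model, and config are assumed to already be in scope as defined in app.py.

# smoke_test.py -- illustrative sketch, assuming the objects created in app.py are in scope
segments, _ = whisper_model.transcribe("examples/female.wav")
user_text = " ".join(segment.text for segment in segments).strip()
print("ASR output:", user_text)

# get_sentence expects a mutable history (list of [user, assistant] pairs)
history = [[user_text, None]]
for sentence, history in get_sentence(history, mistral_llm):
    print("LLM sentence:", sentence)
    result = generate_speech_for_sentence(
        history, "female", sentence, xtts_model,
        xtts_supported_languages=config.languages, return_as_byte=True,
    )
    if result is not None:
        _, audio_update = result
        print("TTS bytes:", len(audio_update["value"]))
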
examples/ai-chat-logo.png ADDED
examples/app_ui.png ADDED
examples/coqui-logo.png ADDED
examples/female.wav ADDED
Binary file (454 kB)
 
examples/hf-logo.png ADDED
examples/male.wav ADDED
Binary file (381 kB)
 
requirements.txt ADDED
@@ -0,0 +1,19 @@
# Preinstall requirements from TTS
TTS @ git+https://github.com/coqui-ai/TTS@v0.20.6
pydantic==1.10.13
python-multipart==0.0.6
typing-extensions>=4.8.0
cutlet
mecab-python3==1.0.6
unidic-lite==1.0.8
unidic==1.1.0
langid
deepspeed
pydub
librosa
ffmpeg-python
gradio_client
emoji
asyncio
noisereduce==3.0.0
faster-whisper==1.0.1
utils.py ADDED
@@ -0,0 +1,410 @@
from __future__ import annotations

import io
import os
import re
import subprocess
import textwrap
import time
import uuid
import wave

import emoji
import gradio as gr
import langid
import nltk
import numpy as np
import noisereduce as nr
from huggingface_hub import HfApi

# Download the 'punkt' tokenizer for the NLTK library
nltk.download("punkt")

# The API is used to restart the Space on an unrecoverable error
HF_TOKEN = os.environ.get("HF_TOKEN")
REPO_ID = os.environ.get("REPO_ID")
api = HfApi(token=HF_TOKEN)

latent_map = {}

def get_latents(chatbot_voice, xtts_model, voice_cleanup=False):
    global latent_map
    if chatbot_voice not in latent_map:
        speaker_wav = f"examples/{chatbot_voice}.wav"
        if voice_cleanup:
            try:
                cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
                resample_filter = "-ac 1 -ar 22050"
                out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # so ffmpeg knows the output format
                # We use a newer ffmpeg, as it has the afftdn denoise filter
                shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
                command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
                speaker_wav = out_filename
                print("Filtered microphone input")
            except subprocess.CalledProcessError:
                # There was an error - the command exited with a non-zero code
                print("Error: failed filtering, using original microphone input")
        else:
            speaker_wav = speaker_wav
        # Get the conditioning latents from the model
        # Returns a tuple (gpt_cond_latent, speaker_embedding)
        latent_map[chatbot_voice] = xtts_model.get_conditioning_latents(audio_path=speaker_wav)
    return latent_map[chatbot_voice]


def detect_language(prompt, xtts_supported_languages=None):
    if xtts_supported_languages is None:
        xtts_supported_languages = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]

    # Fast language autodetection
    if len(prompt) > 15:
        language_predicted = langid.classify(prompt)[0].strip()  # strip is needed as there is a space at the end!
        if language_predicted == "zh":
            # we use zh-cn for xtts
            language_predicted = "zh-cn"

        if language_predicted not in xtts_supported_languages:
            print(f"Detected a language not supported by xtts: {language_predicted}, switching to english for now")
            gr.Warning(f"Language detected '{language_predicted}' cannot be spoken properly yet")
            language = "en"
        else:
            language = language_predicted
        print(f"Language: Predicted sentence language: {language_predicted}, using language for xtts: {language}")
    else:
        # Hard to detect the language quickly in a short sentence, so default to english
        language = "en"
        print("Language: Prompt is short or language autodetection is disabled, using english for xtts")

    return language

def get_voice_streaming(prompt, language, chatbot_voice, xtts_model, suffix="0"):
    gpt_cond_latent, speaker_embedding = get_latents(chatbot_voice, xtts_model)
    try:
        t0 = time.time()
        chunks = xtts_model.inference_stream(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=7.0,
            temperature=0.85,
        )

        first_chunk = True
        for i, chunk in enumerate(chunks):
            if first_chunk:
                first_chunk_time = time.time() - t0
                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                first_chunk = False
            # print(f"Received chunk {i} of audio length {chunk.shape[-1]}")

            # In case the output is required as multiple voice files:
            # out_file = f'{char}_{i}.wav'
            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
            # audio = AudioSegment.from_file(out_file)
            # audio.export(out_file, format='wav')
            # return out_file
            # Directly return each chunk as bytes for streaming
            chunk = chunk.detach().cpu().numpy().squeeze()
            chunk = (chunk * 32767).astype(np.int16)
            yield chunk.tobytes()

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Nothing can be done about a CUDA device-side assert; the Space needs a restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt: {prompt}",
                flush=True,
            )
            gr.Warning("Unhandled exception encountered, please retry in a minute")
            print("CUDA device-assert runtime error encountered, restart needed")

            # HF Space specific: this error is unrecoverable, so restart the Space
            api.restart_space(repo_id=REPO_ID)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            # No warning needed; this happens on empty chunks and at the end
            ### gr.Warning("Unhandled exception encountered, please retry in a minute")
            return None
        return None
    except:
        return None

def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    # This creates a WAV header and then appends the frame input.
    # It should come first in a streaming WAV file.
    # Subsequent frames should not include it (otherwise you will hear artifacts at the start of each chunk).
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)

    wav_buf.seek(0)
    return wav_buf.read()

def format_prompt(message, history):
    system_message = f"""
    You are an empathetic, insightful, and supportive coach who helps people deal with challenges and celebrate achievements.
    You help people feel better by asking questions to reflect on and evoke feelings of positivity, gratitude, joy, and love.
    You show radical candor and tough love.
    Respond in a casual and friendly tone.
    Sprinkle in filler words, contractions, idioms, and other casual speech that we use in conversation.
    Emulate the user’s speaking style and be concise in your response.
    """
    prompt = (
        "<s>[INST]" + system_message + "[/INST]"
    )
    for user_prompt, bot_response in history:
        if user_prompt is not None:
            prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "

    if message == "":
        message = "Hello"
    prompt += f"[INST] {message} [/INST]"
    return prompt
+
169
+ def generate_llm_output(
170
+ prompt,
171
+ history,
172
+ llm,
173
+ temperature=0.8,
174
+ max_tokens=256,
175
+ top_p=0.95,
176
+ stop_words=["<s>","[/INST]", "</s>"]
177
+ ):
178
+ temperature = float(temperature)
179
+ if temperature < 1e-2:
180
+ temperature = 1e-2
181
+ top_p = float(top_p)
182
+
183
+ generate_kwargs = dict(
184
+ temperature=temperature,
185
+ max_tokens=max_tokens,
186
+ top_p=top_p,
187
+ stop=stop_words
188
+ )
189
+ formatted_prompt = format_prompt(prompt, history)
190
+ try:
191
+ print("LLM Input:", formatted_prompt)
192
+ # Local GGUF
193
+ stream = llm(
194
+ formatted_prompt,
195
+ **generate_kwargs,
196
+ stream=True,
197
+ )
198
+ output = ""
199
+ for response in stream:
200
+ character= response["choices"][0]["text"]
201
+
202
+ if character in stop_words:
203
+ # end of context
204
+ return
205
+
206
+ if emoji.is_emoji(character):
207
+ # Bad emoji not a meaning messes chat from next lines
208
+ return
209
+
210
+ output += response["choices"][0]["text"]
211
+ yield output
212
+
213
+ except Exception as e:
214
+ print("Unhandled Exception: ", str(e))
215
+ gr.Warning("Unfortunately Mistral is unable to process")
216
+ output = "I do not know what happened but I could not understand you ."
217
+ return output
218
+
219
+ def get_sentence(history, llm):
220
+ history = [["", None]] if history is None else history
221
+ history[-1][1] = ""
222
+ sentence_list = []
223
+ sentence_hash_list = []
224
+
225
+ text_to_generate = ""
226
+ stored_sentence = None
227
+ stored_sentence_hash = None
228
+
229
+ for character in generate_llm_output(history[-1][0], history[:-1], llm):
230
+ history[-1][1] = character.replace("<|assistant|>","")
231
+ # It is coming word by word
232
+ text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
233
+ if len(text_to_generate) > 1:
234
+
235
+ dif = len(text_to_generate) - len(sentence_list)
236
+
237
+ if dif == 1 and len(sentence_list) != 0:
238
+ continue
239
+
240
+ if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
241
+ continue
242
+
243
+ # All this complexity due to trying append first short sentence to next one for proper language auto-detect
244
+ if stored_sentence is not None and stored_sentence_hash is None and dif>1:
245
+ #means we consumed stored sentence and should look at next sentence to generate
246
+ sentence = text_to_generate[len(sentence_list)+1]
247
+ elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
248
+ print("Appending stored")
249
+ sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
250
+ stored_sentence_hash = None
251
+ else:
252
+ sentence = text_to_generate[len(sentence_list)]
253
+
254
+ # too short sentence just append to next one if there is any
255
+ # this is for proper language detection
256
+ if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
257
+ if sentence[-1] in [".","!","?"]:
258
+ if stored_sentence_hash != hash(sentence):
259
+ stored_sentence = sentence
260
+ stored_sentence_hash = hash(sentence)
261
+ print("Storing:",stored_sentence)
262
+ continue
263
+
264
+
265
+ sentence_hash = hash(sentence)
266
+ if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
267
+ continue
268
+
269
+ if sentence_hash not in sentence_hash_list:
270
+ sentence_hash_list.append(sentence_hash)
271
+ sentence_list.append(sentence)
272
+ print("New Sentence: ", sentence)
273
+ yield (sentence, history)
274
+
275
+ # return that final sentence token
276
+ try:
277
+ last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
278
+ sentence_hash = hash(last_sentence)
279
+ if sentence_hash not in sentence_hash_list:
280
+ if stored_sentence is not None and stored_sentence_hash is not None:
281
+ last_sentence = stored_sentence + last_sentence
282
+ stored_sentence = stored_sentence_hash = None
283
+ print("Last Sentence with stored:",last_sentence)
284
+
285
+ sentence_hash_list.append(sentence_hash)
286
+ sentence_list.append(last_sentence)
287
+ print("Last Sentence: ", last_sentence)
288
+
289
+ yield (last_sentence, history)
290
+ except:
291
+ print("ERROR on last sentence history is :", history)
292
+
# Generates a speech audio response per sentence
def generate_speech_for_sentence(history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=None, filter_output=True, return_as_byte=False):
    language = "autodetect"

    wav_bytestream = b""

    if len(sentence) == 0:
        print("EMPTY SENTENCE")
        return

    # Sometimes the prompt's </s> ends up in the output, so remove it
    # Some post-processing for speech only
    sentence = sentence.replace("</s>", "")
    # Remove code from speech
    sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
    sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)

    sentence = re.sub(r"\(.*\)", "", sentence, flags=re.DOTALL)

    sentence = sentence.replace("```", "")
    sentence = sentence.replace("...", " ")
    sentence = sentence.replace("(", " ")
    sentence = sentence.replace(")", " ")
    sentence = sentence.replace("<|assistant|>", "")

    if len(sentence) == 0:
        print("EMPTY SENTENCE after processing")
        return

    # A quick fix for the last character; may produce weird sounds if it is attached to text
    # if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
    #     # just add a space
    #     sentence = sentence[:-1] + " " + sentence[-1]

    # A regex does the job well
    sentence = re.sub(r"([^\x00-\x7F]|\w)(\.|\。|\?|\!)", r"\1 \2\2", sentence)

    print("Sentence for speech:", sentence)


    try:
        SENTENCE_SPLIT_LENGTH = 350
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            # No problem, continue on
            sentence_list = [sentence]
        else:
            # Until now nltk has likely split sentences properly, but we need an additional
            # check for longer sentences, splitting at the last possible position.
            # Do whatever is necessary: first break at hyphens, then at spaces, and then even split very long words
            sentence_list = textwrap.wrap(sentence, SENTENCE_SPLIT_LENGTH)
            print("SPLIT LONG SENTENCE:", sentence_list)

        for sentence in sentence_list:

            if any(c.isalnum() for c in sentence):
                if language == "autodetect":
                    # Autodetect on the first call; subsequent sentence calls will use the same language
                    language = detect_language(sentence, xtts_supported_languages)

                # There is at least one alphanumeric (utf-8) character
                audio_stream = get_voice_streaming(
                    sentence, language, chatbot_voice, xtts_model
                )
            else:
                # Likely got a ' or " or some other text without an alphanumeric character in it
                audio_stream = None

            # XTTS is actually using a streaming response, but we are playing audio by sentence.
            # If you want direct XTTS voice streaming (send each chunk to voice), you may set the DIRECT_STREAM=1 environment variable
            if audio_stream is not None:
                frame_length = 0
                for chunk in audio_stream:
                    try:
                        wav_bytestream += chunk
                        frame_length += len(chunk)
                    except:
                        # Hack to continue playing; sometimes the last chunk is empty, will be fixed on the next TTS
                        continue

        # Filter the output for a better voice
        if filter_output:
            data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
            float_data = data_s16 * 0.5**15
            reduced_noise = nr.reduce_noise(y=float_data, sr=24000, prop_decrease=0.8, n_fft=1024)
            wav_bytestream = (reduced_noise * 32767).astype(np.int16)
            wav_bytestream = wav_bytestream.tobytes()

        if audio_stream is not None:
            if not return_as_byte:
                audio_unique_filename = "/tmp/" + str(uuid.uuid4()) + ".wav"
                with wave.open(audio_unique_filename, "w") as f:
                    f.setnchannels(1)
                    # 2 bytes per sample
                    f.setsampwidth(2)
                    f.setframerate(24000)
                    f.writeframes(wav_bytestream)

                return (history, gr.Audio.update(value=audio_unique_filename, autoplay=True))
            else:
                return (history, gr.Audio.update(value=wav_bytestream, autoplay=True))
    except RuntimeError as e:
        if "device-side assert" in str(e):
            # Nothing can be done about a CUDA device-side assert; the Space needs a restart
            print(
                f"Exit due to: Unrecoverable exception caused by prompt: {sentence}",
                flush=True,
            )
            gr.Warning("Unhandled exception encountered, please retry in a minute")
            print("CUDA device-assert runtime error encountered, restart needed")

            # HF Space specific: this error is unrecoverable, so restart the Space
            api.restart_space(repo_id=REPO_ID)
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            raise e

    print("All speech ended")
    return
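
The streaming helpers above emit a single WAV header (from wave_header_chunk, defaulting to 24 kHz, mono, 16-bit PCM) followed by raw int16 PCM chunks from get_voice_streaming, and they can be sanity-checked outside Gradio. The following is a minimal sketch, not part of the commit: it assumes xtts_model is loaded as in app.py, that examples/female.wav is available for the speaker latents, and the prompt text and output path are arbitrary.

# assemble_wav_demo.py -- illustrative sketch, assuming xtts_model is loaded as in app.py
wav_bytes = wave_header_chunk()  # RIFF/WAV header for 24 kHz, mono, 16-bit PCM
for chunk in get_voice_streaming("Hello there, how are you today?", "en", "female", xtts_model):
    if chunk:
        wav_bytes += chunk  # raw int16 PCM frames appended after the header
with open("/tmp/xtts_demo.wav", "wb") as f:
    f.write(wav_bytes)
print("Wrote", len(wav_bytes), "bytes")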