roudayna77 committed on
Commit
905db08
1 Parent(s): 0b79ad7

Update app.py

Files changed (1)
  1. app.py +161 -0
app.py CHANGED
@@ -0,0 +1,161 @@
+ ###### Set Up Environment ######
+
+ import os
+ # Set the CUDA compiler path and install llama-cpp-python
+ # llama-cpp-python is a Python binding for the llama.cpp library, which enables LLM inference in pure C/C++
+ os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
+ # Download the UniDic dictionary (needed for Japanese text support in XTTS)
+ os.system('python -m unidic download')
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
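+ # Note: -DLLAMA_CUBLAS=on builds llama.cpp against cuBLAS so model layers can be
+ # offloaded to the GPU; it relies on the CUDA toolkit referenced by CUDACXX above.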
+
+
+ # Third-party library imports
+ from faster_whisper import WhisperModel
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+ from TTS.utils.manage import ModelManager
+
+ # Local imports
+ from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk
+
+ # Load Whisper ASR model
+ print("Loading Whisper ASR")
+ whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
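+ # compute_type="float16" runs Whisper large-v3 in half precision on the GPU, roughly
+ # halving memory use compared with float32 with little practical loss in accuracy.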
+
+ # Load Mistral LLM
+ print("Loading Mistral LLM")
+ hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
+ mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
+ mistral_llm = Llama(model_path=mistral_model_path, n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)
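+ # n_gpu_layers=35 offloads the model's transformer layers to the GPU and n_ctx=4096 sets
+ # the context window; lowering n_gpu_layers keeps some layers on the CPU if VRAM is tight.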
+
+
+ # Load XTTS Model
+ print("Loading XTTS model")
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ ModelManager().download_model(tts_model_name)
+ tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
+ config = XttsConfig()
+ config.load_json(os.path.join(tts_model_path, "config.json"))
+ xtts_model = Xtts.init_from_config(config)
+ xtts_model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(tts_model_path, "model.pth"),
+     vocab_path=os.path.join(tts_model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True,
+ )
+ xtts_model.cuda()
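+ # COQUI_TOS_AGREED=1 accepts the Coqui model license non-interactively, and
+ # use_deepspeed=True enables DeepSpeed inference kernels to speed up XTTS synthesis.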
+
+ ###### Set up Gradio Interface ######
+
+ with gr.Blocks(title="Voice chat with LLM") as demo:
+     DESCRIPTION = """# Voice chat with LLM"""
+     gr.Markdown(DESCRIPTION)
+
+     # Define chatbot component
+     chatbot = gr.Chatbot(
+         value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
+         elem_id="chatbot",
+         avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
+         bubble_full_width=False,
+     )
+
+     # Define chatbot voice component
+     VOICES = ["female", "male"]
+     with gr.Row():
+         chatbot_voice = gr.Dropdown(
+             label="Voice of the Chatbot",
+             info="How the chatbot should sound",
+             choices=VOICES,
+             max_choices=1,
+             value=VOICES[0],
+         )
+
+     # Define text and audio record input components
+     with gr.Row():
+         txt_box = gr.Textbox(
+             scale=3,
+             show_label=False,
+             placeholder="Enter text and press enter, or speak into your microphone",
+             container=False,
+             interactive=True,
+         )
+         audio_record = gr.Audio(source="microphone", type="filepath", scale=4)
+
+     # Define generated audio playback component
+     with gr.Row():
+         sentence = gr.Textbox(visible=False)
+         audio_playback = gr.Audio(
+             value=None,
+             label="Generated audio response",
+             streaming=True,
+             autoplay=True,
+             interactive=False,
+             show_label=True,
+         )
+
+     # Triggered on text submit (sends the chat history on to generate_speech)
+     def add_text(chatbot_history, text):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     # Triggered on voice submit (transcribes the recording and sends it on to generate_speech)
+     def add_audio(chatbot_history, audio):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         # transcribe() returns a generator of segments plus metadata; take the first segment's
+         # text and strip leading/trailing whitespace
+         response, _ = whisper_model.transcribe(audio)
+         text = list(response)[0].text.strip()
+         print("Transcribed text:", text)
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
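+     # Streams the reply: sentences are pulled from the LLM one at a time, synthesized with
+     # XTTS, and yielded as audio chunks so playback can start before the full response is ready.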
+     def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
+         # Start by yielding an initial empty audio chunk to set up autoplay
+         yield ("", chatbot_history, wave_header_chunk())
+
+         # Helper function to handle the speech generation and yielding process
+         def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
+             if sentence != "":
+                 print("Processing sentence")
+                 generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=config.languages, return_as_byte=True)
+                 if generated_speech is not None:
+                     _, audio_dict = generated_speech
+                     yield (sentence, chatbot_history, audio_dict["value"])
+
+         if initial_greeting:
+             # Process only the initial greeting if specified
+             for _, sentence in chatbot_history:
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+         else:
+             # Continuously get and process sentences from a generator function
+             for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
+                 print("Inserting sentence to queue")
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+
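+     # Wire up the UI events: submitting text or finishing a recording appends the user turn to
+     # the chat history, streams the spoken response, then re-enables the input widgets.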
+     txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
+                              ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
+
+     audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
+                                             ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
+
+     FOOTNOTE = """
+ This Space demonstrates how to talk to an LLM chatbot using only openly accessible models.
+ It relies on the following models:
+ - Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3), an ASR model that transcribes the recorded audio to text.
+ - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), an LLM that generates the chatbot's responses.
+ - Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts), a TTS model that generates the chatbot's voice.
+ Note:
+ - Responses generated by the chat model should not be assumed to be correct or taken seriously; this is a demonstration only.
+ - iOS (iPhone/iPad) devices may not play the voice, since the vendor disables audio autoplay on them."""
+     gr.Markdown(FOOTNOTE)
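+     # On page load, speak the initial greeting: gr.State(value=True) is passed to
+     # generate_speech as initial_greeting=True.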
+     demo.load(fn=generate_speech, inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
+ demo.queue().launch(debug=True, share=True)