Ntabukiraniro committed on
Commit 1acb220
1 Parent(s): 611452d

Create app.py

Files changed (1):
  app.py +159 -0
app.py ADDED
@@ -0,0 +1,159 @@
+ import os
+ # Set CUDA environment variable and install llama-cpp-python
+ # llama-cpp-python is a Python binding for the llama.cpp library, which enables LLM inference in pure C/C++
+ os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
+ os.system('python -m unidic download')
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
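+ # Note: "-DLLAMA_CUBLAS=on" builds llama-cpp-python with cuBLAS GPU acceleration;
+ # the unidic download above fetches the Japanese dictionary the multilingual TTS tokenizer needs.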
+
+
+ # Third-party library imports
+ from faster_whisper import WhisperModel
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+ from TTS.utils.manage import ModelManager
+
+ # Local imports
+ from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk
+
+ # Load Whisper ASR model
+ print("Loading Whisper ASR")
+ whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
+
+ # Load Mistral LLM
+ print("Loading Mistral LLM")
+ hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
+ mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
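+ # n_gpu_layers=35 offloads the model's transformer layers to the GPU; n_ctx=4096 sets the llama.cpp context window.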
+ mistral_llm = Llama(model_path=mistral_model_path, n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)
+
+
+ # Load XTTS Model
+ print("Loading XTTS model")
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ ModelManager().download_model(tts_model_name)
+ tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
+ config = XttsConfig()
+ config.load_json(os.path.join(tts_model_path, "config.json"))
+ xtts_model = Xtts.init_from_config(config)
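+ # use_deepspeed=True enables DeepSpeed inference optimizations for faster XTTS synthesis (requires the deepspeed package).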
+ xtts_model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(tts_model_path, "model.pth"),
+     vocab_path=os.path.join(tts_model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True,
+ )
+ xtts_model.cuda()
+
+ ###### Set up Gradio Interface ######
+
+ with gr.Blocks(title="Voice chat with LLM") as demo:
+     DESCRIPTION = """# Voice chat with LLM"""
+     gr.Markdown(DESCRIPTION)
+
+     # Define chatbot component
+     chatbot = gr.Chatbot(
+         value=[(None, "Hi friend, I'm Liya the Interviewer, and I'm here to help you today.")],  # Initial greeting from the chatbot
+         elem_id="chatbot",
+         avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
+         bubble_full_width=False,
+     )
+
+     # Define chatbot voice component
+     VOICES = ["female", "male"]
+     with gr.Row():
+         chatbot_voice = gr.Dropdown(
+             label="Voice of the Chatbot",
+             info="How the chatbot should sound",
+             choices=VOICES,
+             max_choices=1,
+             value=VOICES[0],
+         )
+
+     # Define text and audio record input components
+     with gr.Row():
+         txt_box = gr.Textbox(
+             scale=3,
+             show_label=False,
+             placeholder="Enter text and press enter, or speak to your microphone",
+             container=False,
+             interactive=True,
+         )
+         audio_record = gr.Audio(source="microphone", type="filepath", scale=4)
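+         # Note: gr.Audio(source="microphone") is the Gradio 3.x API; Gradio 4.x renamed this parameter to sources=["microphone"].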
+
+     # Define generated audio playback component
+     with gr.Row():
+         sentence = gr.Textbox(visible=False)
+         audio_playback = gr.Audio(
+             value=None,
+             label="Generated audio response",
+             streaming=True,
+             autoplay=True,
+             interactive=False,
+             show_label=True,
+         )
+
+     # Will be triggered on text submit (will send to generate_speech)
+     def add_text(chatbot_history, text):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     # Will be triggered on voice submit (will transcribe and send to generate_speech)
+     def add_audio(chatbot_history, audio):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         # Get the transcription from Whisper and strip leading/trailing whitespace
+         response, _ = whisper_model.transcribe(audio)
+         text = list(response)[0].text.strip()
+         print("Transcribed text:", text)
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
+         # Start by yielding an initial empty audio to set up autoplay
+         yield ("", chatbot_history, wave_header_chunk())
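+         # wave_header_chunk() comes from utils (not included in this commit); presumably it returns WAV header
+         # bytes so the streaming gr.Audio component can begin playback before synthesized audio arrives.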
+
+         # Helper function to handle the speech generation and yielding process
+         def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
+             if sentence != "":
+                 print("Processing sentence")
+                 generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=config.languages, return_as_byte=True)
+                 if generated_speech is not None:
+                     _, audio_dict = generated_speech
+                     yield (sentence, chatbot_history, audio_dict["value"])
+
+         if initial_greeting:
+             # Process only the initial greeting if specified
+             for _, sentence in chatbot_history:
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+         else:
+             # Continuously get and process sentences from a generator function
+             for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
+                 print("Inserting sentence to queue")
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+
+     txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
+         ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
+
+     audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
+         ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
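+     # The .then() chains run generate_speech after each input handler, then re-enable the input widgets
+     # (disabled by add_text/add_audio) once audio streaming has finished.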
+
+     FOOTNOTE = """
+ This Space demonstrates how to speak to an LLM chatbot, based solely on openly accessible models.
+ It relies on the following models:
+ - Speech to Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3), an ASR model, to transcribe recorded audio to text.
+ - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), an LLM to generate the chatbot responses.
+ - Text to Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts), a TTS model, to generate the voice of the chatbot.
+ Note:
+ - Responses generated by the chat model should not be assumed correct or taken seriously; this is a demonstration only.
+ - iOS (iPhone/iPad) devices may not play the voice, because these devices disable audio autoplay."""
+     gr.Markdown(FOOTNOTE)
+     demo.load(fn=generate_speech, inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
+ demo.queue().launch(debug=True, share=True)
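
Note: utils.py is not part of this commit (only app.py is added). From the call sites above, the helpers it imports would look roughly like the sketch below; the signatures and docstrings are inferred assumptions, not the actual implementation.

    # utils.py -- assumed interface, reconstructed from how app.py calls it
    def wave_header_chunk() -> bytes:
        """Return WAV header bytes used to prime the streaming gr.Audio output."""
        ...

    def get_sentence(chatbot_history, llm):
        """Yield (sentence, updated_chatbot_history) pairs while the LLM streams its response."""
        ...

    def generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model,
                                     xtts_supported_languages=None, return_as_byte=True):
        """Return a 2-tuple whose second element is {"value": audio_bytes} for gr.Audio, or None."""
        ...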