gorkemgoknar
committed on
Commit
•
6af5041
1
Parent(s):
168536f
add some comments and remove unnecessary comments
Browse files
app.py
CHANGED
@@ -120,6 +120,7 @@ text_client = InferenceClient(
|
|
120 |
|
121 |
|
122 |
###### COQUI TTS FUNCTIONS ######
|
|
|
123 |
def get_latents(speaker_wav):
|
124 |
# create as function as we can populate here with voice cleanup/filtering
|
125 |
(
|
@@ -129,7 +130,88 @@ def get_latents(speaker_wav):
|
|
129 |
) = model.get_conditioning_latents(audio_path=speaker_wav)
|
130 |
return gpt_cond_latent, diffusion_conditioning, speaker_embedding
|
131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
def format_prompt(message, history):
|
134 |
prompt = (
|
135 |
"<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
|
@@ -140,7 +222,6 @@ def format_prompt(message, history):
|
|
140 |
prompt += f"[INST] {message} [/INST]"
|
141 |
return prompt
|
142 |
|
143 |
-
|
144 |
def generate(
|
145 |
prompt,
|
146 |
history,
|
@@ -197,6 +278,8 @@ def generate(
|
|
197 |
return output
|
198 |
|
199 |
|
|
|
|
|
200 |
def transcribe(wav_path):
|
201 |
try:
|
202 |
# get result from whisper and strip it to delete begin and end space
|
@@ -212,13 +295,13 @@ def transcribe(wav_path):
|
|
212 |
|
213 |
# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
|
214 |
|
215 |
-
|
216 |
def add_text(history, text):
|
217 |
history = [] if history is None else history
|
218 |
history = history + [(text, None)]
|
219 |
return history, gr.update(value="", interactive=False)
|
220 |
|
221 |
-
|
222 |
def add_file(history, file):
|
223 |
history = [] if history is None else history
|
224 |
|
@@ -247,90 +330,8 @@ def bot(history, system_prompt=""):
|
|
247 |
history[-1][1] = character
|
248 |
yield history
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
# Generate speaker embedding and latents for TTS
|
253 |
-
(
|
254 |
-
gpt_cond_latent,
|
255 |
-
diffusion_conditioning,
|
256 |
-
speaker_embedding,
|
257 |
-
) = model.get_conditioning_latents(audio_path=speaker_wav)
|
258 |
-
return gpt_cond_latent, diffusion_conditioning, speaker_embedding
|
259 |
-
|
260 |
-
|
261 |
-
latent_map = {}
|
262 |
-
latent_map["Female_Voice"] = get_latents("examples/female.wav")
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
|
267 |
-
# This will create a wave header then append the frame input
|
268 |
-
# It should be first on a streaming wav file
|
269 |
-
# Other frames should not have it (otherwise you will hear artifacts at the start of each chunk)
|
270 |
-
wav_buf = io.BytesIO()
|
271 |
-
with wave.open(wav_buf, "wb") as vfout:
|
272 |
-
vfout.setnchannels(channels)
|
273 |
-
vfout.setsampwidth(sample_width)
|
274 |
-
vfout.setframerate(sample_rate)
|
275 |
-
vfout.writeframes(frame_input)
|
276 |
-
|
277 |
-
wav_buf.seek(0)
|
278 |
-
return wav_buf.read()
|
279 |
-
|
280 |
-
|
281 |
-
def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
|
282 |
-
gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
|
283 |
-
try:
|
284 |
-
t0 = time.time()
|
285 |
-
chunks = model.inference_stream(
|
286 |
-
prompt,
|
287 |
-
language,
|
288 |
-
gpt_cond_latent,
|
289 |
-
speaker_embedding,
|
290 |
-
)
|
291 |
-
|
292 |
-
first_chunk = True
|
293 |
-
for i, chunk in enumerate(chunks):
|
294 |
-
if first_chunk:
|
295 |
-
first_chunk_time = time.time() - t0
|
296 |
-
metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
|
297 |
-
first_chunk = False
|
298 |
-
print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
|
299 |
-
|
300 |
-
# In case output is required to be multiple voice files
|
301 |
-
# out_file = f'{char}_{i}.wav'
|
302 |
-
# write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
|
303 |
-
# audio = AudioSegment.from_file(out_file)
|
304 |
-
# audio.export(out_file, format='wav')
|
305 |
-
# return out_file
|
306 |
-
# directly return chunk as bytes for streaming
|
307 |
-
chunk = chunk.detach().cpu().numpy().squeeze()
|
308 |
-
chunk = (chunk * 32767).astype(np.int16)
|
309 |
-
|
310 |
-
yield chunk.tobytes()
|
311 |
-
|
312 |
-
except RuntimeError as e:
|
313 |
-
if "device-side assert" in str(e):
|
314 |
-
# cannot do anything on cuda device side error, need to restart
|
315 |
-
print(
|
316 |
-
f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
|
317 |
-
flush=True,
|
318 |
-
)
|
319 |
-
gr.Warning("Unhandled Exception encounter, please retry in a minute")
|
320 |
-
print("Cuda device-assert Runtime encountered need restart")
|
321 |
-
|
322 |
-
# HF Space specific.. This error is unrecoverable need to restart space
|
323 |
-
api.restart_space(repo_id=repo_id)
|
324 |
-
else:
|
325 |
-
print("RuntimeError: non device-side assert error:", str(e))
|
326 |
-
# Does not require a warning; this happens on an empty chunk and at the end
|
327 |
-
###gr.Warning("Unhandled Exception encounter, please retry in a minute")
|
328 |
-
return None
|
329 |
-
return None
|
330 |
-
except:
|
331 |
-
return None
|
332 |
-
|
333 |
-
|
334 |
def get_sentence(history, system_prompt=""):
|
335 |
history = [["", None]] if history is None else history
|
336 |
|
@@ -368,7 +369,6 @@ def get_sentence(history, system_prompt=""):
|
|
368 |
yield (sentence, history)
|
369 |
|
370 |
# return that final sentence token
|
371 |
-
# TODO need a counter that one may be replica as before
|
372 |
last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
|
373 |
sentence_hash = hash(last_sentence)
|
374 |
if sentence_hash not in sentence_hash_list:
|
@@ -378,7 +378,8 @@ def get_sentence(history, system_prompt=""):
|
|
378 |
|
379 |
yield (last_sentence, history)
|
380 |
|
381 |
-
|
|
|
382 |
def generate_speech(history):
|
383 |
language = "en"
|
384 |
|
@@ -402,9 +403,8 @@ def generate_speech(history):
|
|
402 |
print("Sentence for speech:", sentence)
|
403 |
|
404 |
try:
|
405 |
-
#
|
406 |
-
|
407 |
-
if len(sentence) > 250:
|
408 |
gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
|
409 |
# should not generate voice it will hit token limit
|
410 |
# It should not generate audio for it
|
@@ -413,6 +413,8 @@ def generate_speech(history):
|
|
413 |
audio_stream = get_voice_streaming(
|
414 |
sentence, language, latent_map["Female_Voice"]
|
415 |
)
|
|
|
|
|
416 |
if audio_stream is not None:
|
417 |
wav_chunks = wave_header_chunk()
|
418 |
frame_length = 0
|
@@ -485,7 +487,8 @@ def generate_speech(history):
|
|
485 |
yield (gr.Audio.update(value=None, autoplay=False), history)
|
486 |
yield (gr.Audio.update(value=outfile, autoplay=False), history)
|
487 |
|
488 |
-
|
|
|
489 |
with gr.Blocks(title=title) as demo:
|
490 |
gr.Markdown(DESCRIPTION)
|
491 |
|
@@ -547,7 +550,9 @@ It relies on 3 models:
|
|
547 |
3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
|
548 |
|
549 |
Note:
|
550 |
-
- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
|
|
|
|
|
551 |
)
|
552 |
demo.queue()
|
553 |
demo.launch(debug=True)
|
|
|
120 |
|
121 |
|
122 |
###### COQUI TTS FUNCTIONS ######
|
123 |
+
|
124 |
def get_latents(speaker_wav):
|
125 |
# create as function as we can populate here with voice cleanup/filtering
|
126 |
(
|
|
|
130 |
) = model.get_conditioning_latents(audio_path=speaker_wav)
|
131 |
return gpt_cond_latent, diffusion_conditioning, speaker_embedding
|
132 |
|
133 |
+
def get_latents(speaker_wav):
|
134 |
+
# Generate speaker embedding and latents for TTS
|
135 |
+
(
|
136 |
+
gpt_cond_latent,
|
137 |
+
diffusion_conditioning,
|
138 |
+
speaker_embedding,
|
139 |
+
) = model.get_conditioning_latents(audio_path=speaker_wav)
|
140 |
+
return gpt_cond_latent, diffusion_conditioning, speaker_embedding
|
141 |
+
|
142 |
+
|
143 |
+
latent_map = {}
|
144 |
+
latent_map["Female_Voice"] = get_latents("examples/female.wav")
|
145 |
+
|
146 |
+
def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
|
147 |
+
# This will create a wave header then append the frame input
|
148 |
+
# It should be first on a streaming wav file
|
149 |
+
# Other frames should not have it (otherwise you will hear artifacts at the start of each chunk)
|
150 |
+
wav_buf = io.BytesIO()
|
151 |
+
with wave.open(wav_buf, "wb") as vfout:
|
152 |
+
vfout.setnchannels(channels)
|
153 |
+
vfout.setsampwidth(sample_width)
|
154 |
+
vfout.setframerate(sample_rate)
|
155 |
+
vfout.writeframes(frame_input)
|
156 |
+
|
157 |
+
wav_buf.seek(0)
|
158 |
+
return wav_buf.read()
|
159 |
+
|
160 |
+
|
161 |
+
def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
|
162 |
+
gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
|
163 |
+
try:
|
164 |
+
t0 = time.time()
|
165 |
+
chunks = model.inference_stream(
|
166 |
+
prompt,
|
167 |
+
language,
|
168 |
+
gpt_cond_latent,
|
169 |
+
speaker_embedding,
|
170 |
+
)
|
171 |
+
|
172 |
+
first_chunk = True
|
173 |
+
for i, chunk in enumerate(chunks):
|
174 |
+
if first_chunk:
|
175 |
+
first_chunk_time = time.time() - t0
|
176 |
+
metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
|
177 |
+
first_chunk = False
|
178 |
+
print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
|
179 |
+
|
180 |
+
# In case output is required to be multiple voice files
|
181 |
+
# out_file = f'{char}_{i}.wav'
|
182 |
+
# write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
|
183 |
+
# audio = AudioSegment.from_file(out_file)
|
184 |
+
# audio.export(out_file, format='wav')
|
185 |
+
# return out_file
|
186 |
+
# directly return chunk as bytes for streaming
|
187 |
+
chunk = chunk.detach().cpu().numpy().squeeze()
|
188 |
+
chunk = (chunk * 32767).astype(np.int16)
|
189 |
+
|
190 |
+
yield chunk.tobytes()
|
191 |
|
192 |
+
except RuntimeError as e:
|
193 |
+
if "device-side assert" in str(e):
|
194 |
+
# cannot do anything on cuda device side error, need to restart
|
195 |
+
print(
|
196 |
+
f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
|
197 |
+
flush=True,
|
198 |
+
)
|
199 |
+
gr.Warning("Unhandled Exception encounter, please retry in a minute")
|
200 |
+
print("Cuda device-assert Runtime encountered need restart")
|
201 |
+
|
202 |
+
# HF Space specific.. This error is unrecoverable need to restart space
|
203 |
+
api.restart_space(repo_id=repo_id)
|
204 |
+
else:
|
205 |
+
print("RuntimeError: non device-side assert error:", str(e))
|
206 |
+
# Does not require a warning; this happens on an empty chunk and at the end
|
207 |
+
###gr.Warning("Unhandled Exception encounter, please retry in a minute")
|
208 |
+
return None
|
209 |
+
return None
|
210 |
+
except:
|
211 |
+
return None
|
212 |
+
|
213 |
+
###### MISTRAL FUNCTIONS ######
|
214 |
+
|
215 |
def format_prompt(message, history):
|
216 |
prompt = (
|
217 |
"<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
|
|
|
222 |
prompt += f"[INST] {message} [/INST]"
|
223 |
return prompt
|
224 |
|
|
|
225 |
def generate(
|
226 |
prompt,
|
227 |
history,
|
|
|
278 |
return output
|
279 |
|
280 |
|
281 |
+
###### WHISPER FUNCTIONS ######
|
282 |
+
|
283 |
def transcribe(wav_path):
|
284 |
try:
|
285 |
# get result from whisper and strip it to delete begin and end space
|
|
|
295 |
|
296 |
# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
|
297 |
|
298 |
+
# Will be triggered on text submit (will send to generate_speech)
|
299 |
def add_text(history, text):
|
300 |
history = [] if history is None else history
|
301 |
history = history + [(text, None)]
|
302 |
return history, gr.update(value="", interactive=False)
|
303 |
|
304 |
+
# Will be triggered on voice submit (will transcribe and send to generate_speech)
|
305 |
def add_file(history, file):
|
306 |
history = [] if history is None else history
|
307 |
|
|
|
330 |
history[-1][1] = character
|
331 |
yield history
|
332 |
|
333 |
+
##### MISTRAL STREAMING Sentence splitter ####
|
334 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
def get_sentence(history, system_prompt=""):
|
336 |
history = [["", None]] if history is None else history
|
337 |
|
|
|
369 |
yield (sentence, history)
|
370 |
|
371 |
# return that final sentence token
|
|
|
372 |
last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
|
373 |
sentence_hash = hash(last_sentence)
|
374 |
if sentence_hash not in sentence_hash_list:
|
|
|
378 |
|
379 |
yield (last_sentence, history)
|
380 |
|
381 |
+
#### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
|
382 |
+
|
383 |
def generate_speech(history):
|
384 |
language = "en"
|
385 |
|
|
|
403 |
print("Sentence for speech:", sentence)
|
404 |
|
405 |
try:
|
406 |
+
#TODO this will be better handled in future using textwrap
|
407 |
+
if len(sentence) > 300:
|
|
|
408 |
gr.Warning("There was a problem with the last sentence, which was too long, so it won't be spoken.")
|
409 |
# should not generate voice it will hit token limit
|
410 |
# It should not generate audio for it
|
|
|
413 |
audio_stream = get_voice_streaming(
|
414 |
sentence, language, latent_map["Female_Voice"]
|
415 |
)
|
416 |
+
# XTTS is actually using streaming response but we are playing audio by sentence
|
417 |
+
# If you want direct XTTS voice streaming (send each chunk to voice), you may set the DIRECT_STREAM=1 environment variable
|
418 |
if audio_stream is not None:
|
419 |
wav_chunks = wave_header_chunk()
|
420 |
frame_length = 0
|
|
|
487 |
yield (gr.Audio.update(value=None, autoplay=False), history)
|
488 |
yield (gr.Audio.update(value=outfile, autoplay=False), history)
|
489 |
|
490 |
+
#### GRADIO INTERFACE ####
|
491 |
+
|
492 |
with gr.Blocks(title=title) as demo:
|
493 |
gr.Markdown(DESCRIPTION)
|
494 |
|
|
|
550 |
3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
|
551 |
|
552 |
Note:
|
553 |
+
- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
|
554 |
+
- Responses generated by chat model should not be assumed correct as this is a demonstration example only
|
555 |
+
- iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
|
556 |
)
|
557 |
demo.queue()
|
558 |
demo.launch(debug=True)
|