gorkemgoknar
committed on
Commit
•
0a0b1ab
1
Parent(s):
f31f07e
improvements
Browse files
app.py
CHANGED
@@ -11,8 +11,9 @@ import gradio as gr
|
|
11 |
import numpy as np
|
12 |
import torch
|
13 |
import nltk # we'll use this to split into sentences
|
14 |
-
|
15 |
nltk.download("punkt")
|
|
|
|
|
16 |
import uuid
|
17 |
|
18 |
import datetime
|
@@ -33,9 +34,10 @@ from TTS.utils.generic_utils import get_user_data_dir
|
|
33 |
# For older cards (like 2070 or T4), reduce this value to avoid unnecessary waiting
|
34 |
# Could not make playing the next audio work seamlessly on current Gradio with autoplay, so this is a workaround
|
35 |
AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
|
36 |
-
|
37 |
# If set, will try to stream audio while receiving audio chunks; beware that recreating audio each time produces artifacts
|
38 |
DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
|
|
|
39 |
|
40 |
# This will trigger downloading model
|
41 |
print("Downloading if not downloaded Coqui XTTS V1")
|
@@ -73,7 +75,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
73 |
# will use api to restart space on an unrecoverable error
|
74 |
api = HfApi(token=HF_TOKEN)
|
75 |
|
76 |
-
repo_id = "
|
77 |
|
78 |
default_system_message = """
|
79 |
You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
|
@@ -94,6 +96,7 @@ system_understand_message = os.environ.get(
|
|
94 |
"SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
|
95 |
)
|
96 |
|
|
|
97 |
|
98 |
temperature = 0.9
|
99 |
top_p = 0.6
|
@@ -157,9 +160,28 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
|
|
157 |
wav_buf.seek(0)
|
158 |
return wav_buf.read()
|
159 |
|
160 |
-
|
161 |
def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
|
162 |
gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
try:
|
164 |
t0 = time.time()
|
165 |
chunks = model.inference_stream(
|
@@ -381,7 +403,7 @@ def get_sentence(history, system_prompt=""):
|
|
381 |
#### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
|
382 |
|
383 |
def generate_speech(history):
|
384 |
-
language = "
|
385 |
|
386 |
wav_bytestream = b""
|
387 |
for sentence, history in get_sentence(history):
|
@@ -403,65 +425,75 @@ def generate_speech(history):
|
|
403 |
print("Sentence for speech:", sentence)
|
404 |
|
405 |
try:
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
# should not generate voice it will hit token limit
|
410 |
-
# It should not generate audio for it
|
411 |
-
audio_stream = None
|
412 |
else:
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
yield (
|
426 |
-
gr.Audio.update(
|
427 |
-
value=wave_header_chunk() + chunk, autoplay=True
|
428 |
-
),
|
429 |
-
history,
|
430 |
-
)
|
431 |
-
wait_time = len(chunk) / 2 / 24000
|
432 |
-
wait_time = AUDIO_WAIT_MODIFIER * wait_time
|
433 |
-
print("Sleeping till chunk end")
|
434 |
-
time.sleep(wait_time)
|
435 |
-
|
436 |
-
else:
|
437 |
-
wav_chunks += chunk
|
438 |
-
frame_length += len(chunk)
|
439 |
-
except:
|
440 |
-
# hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
|
441 |
-
continue
|
442 |
-
|
443 |
-
if not DIRECT_STREAM:
|
444 |
-
yield (
|
445 |
-
gr.Audio.update(value=None, autoplay=True),
|
446 |
-
history,
|
447 |
-
) # hack to switch autoplay
|
448 |
-
if audio_stream is not None:
|
449 |
-
yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
|
450 |
-
# Streaming wait time calculation
|
451 |
-
# audio_length = frame_length / sample_width/ frame_rate
|
452 |
-
wait_time = frame_length / 2 / 24000
|
453 |
-
|
454 |
-
# for non streaming
|
455 |
-
# wait_time= librosa.get_duration(path=wav)
|
456 |
-
|
457 |
-
wait_time = AUDIO_WAIT_MODIFIER * wait_time
|
458 |
-
print("Sleeping till audio end")
|
459 |
-
time.sleep(wait_time)
|
460 |
else:
|
461 |
-
#
|
462 |
-
|
463 |
-
|
464 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
465 |
|
466 |
except RuntimeError as e:
|
467 |
if "device-side assert" in str(e):
|
@@ -479,7 +511,7 @@ def generate_speech(history):
|
|
479 |
print("RuntimeError: non device-side assert error:", str(e))
|
480 |
raise e
|
481 |
|
482 |
-
time.sleep(1.
|
483 |
wav_bytestream = wave_header_chunk() + wav_bytestream
|
484 |
outfile = "combined.wav"
|
485 |
with open(outfile, "wb") as f:
|
@@ -495,7 +527,7 @@ with gr.Blocks(title=title) as demo:
|
|
495 |
chatbot = gr.Chatbot(
|
496 |
[],
|
497 |
elem_id="chatbot",
|
498 |
-
avatar_images=("examples/
|
499 |
bubble_full_width=False,
|
500 |
)
|
501 |
|
|
|
11 |
import numpy as np
|
12 |
import torch
|
13 |
import nltk # we'll use this to split into sentences
|
|
|
14 |
nltk.download("punkt")
|
15 |
+
|
16 |
+
import langid
|
17 |
import uuid
|
18 |
|
19 |
import datetime
|
|
|
34 |
# For older cards (like 2070 or T4), reduce this value to avoid unnecessary waiting
|
35 |
# Could not make playing the next audio work seamlessly on current Gradio with autoplay, so this is a workaround
|
36 |
AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 0.9))
|
37 |
+
print("AUDIO_WAIT_MODIFIER set to",AUDIO_WAIT_MODIFIER)
|
38 |
# If set, will try to stream audio while receiving audio chunks; beware that recreating audio each time produces artifacts
|
39 |
DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
|
40 |
+
print("DIRECT_STREAM set to",DIRECT_STREAM)
|
41 |
|
42 |
# This will trigger downloading model
|
43 |
print("Downloading if not downloaded Coqui XTTS V1")
|
|
|
75 |
# will use api to restart space on an unrecoverable error
|
76 |
api = HfApi(token=HF_TOKEN)
|
77 |
|
78 |
+
repo_id = "coqui/voice-chat-with-mistral"
|
79 |
|
80 |
default_system_message = """
|
81 |
You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
|
|
|
96 |
"SYSTEM_UNDERSTAND_MESSAGE", default_system_understand_message
|
97 |
)
|
98 |
|
99 |
+
print("Mistral system message set as:", default_system_message)
|
100 |
|
101 |
temperature = 0.9
|
102 |
top_p = 0.6
|
|
|
160 |
wav_buf.seek(0)
|
161 |
return wav_buf.read()
|
162 |
|
163 |
+
xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
|
164 |
def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
|
165 |
gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
|
166 |
+
|
167 |
+
# Fast language autodetection
|
168 |
+
if len(prompt)>15 and language=="autodetect":
|
169 |
+
language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
|
170 |
+
if language_predicted == "zh":
|
171 |
+
#we use zh-cn on xtts
|
172 |
+
language_predicted = "zh-cn"
|
173 |
+
if language_predicted not in xtts_supported_languages:
|
174 |
+
print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
|
175 |
+
gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
|
176 |
+
language= "en"
|
177 |
+
else:
|
178 |
+
language = language_predicted
|
179 |
+
print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
|
180 |
+
else:
|
181 |
+
# Hard to detect language fast in short sentence, use english default
|
182 |
+
language = "en"
|
183 |
+
print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
|
184 |
+
|
185 |
try:
|
186 |
t0 = time.time()
|
187 |
chunks = model.inference_stream(
|
|
|
403 |
#### SPEECH GENERATION BY SENTENCE FROM HISTORY ####
|
404 |
|
405 |
def generate_speech(history):
|
406 |
+
language = "autodetect"
|
407 |
|
408 |
wav_bytestream = b""
|
409 |
for sentence, history in get_sentence(history):
|
|
|
425 |
print("Sentence for speech:", sentence)
|
426 |
|
427 |
try:
|
428 |
+
if len(sentence)<300:
|
429 |
+
# no problem continue on
|
430 |
+
sentence_list = [sentence]
|
|
|
|
|
|
|
431 |
else:
|
432 |
+
# Until now nltk likely split sentences properly but we need additional
|
433 |
+
# check for longer sentence and split at last possible position
|
434 |
+
# Do whatever is necessary: first break at hyphens, then spaces, and then even split very long words
|
435 |
+
sentence_list=textwrap(sentence,300)
|
436 |
+
print("SPLITTED LONG SENTENCE:",sentence_list)
|
437 |
+
|
438 |
+
for sentence in sentence_list:
|
439 |
+
if any(c.isalnum() for c in sentence):
|
440 |
+
#exists at least 1 alphanumeric (utf-8)
|
441 |
+
audio_stream = get_voice_streaming(
|
442 |
+
sentence, language, latent_map["Female_Voice"]
|
443 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
444 |
else:
|
445 |
+
# likely got a ' or " or some other text without alphanumeric in it
|
446 |
+
audio_stream = None
|
447 |
+
|
448 |
+
# XTTS is actually using streaming response but we are playing audio by sentence
|
449 |
+
# If you want direct XTTS voice streaming (send each chunk to voice ) you may set DIRECT_STREAM=1 environment variable
|
450 |
+
if audio_stream is not None:
|
451 |
+
wav_chunks = wave_header_chunk()
|
452 |
+
frame_length = 0
|
453 |
+
for chunk in audio_stream:
|
454 |
+
try:
|
455 |
+
wav_bytestream += chunk
|
456 |
+
if DIRECT_STREAM:
|
457 |
+
yield (
|
458 |
+
gr.Audio.update(
|
459 |
+
value=wave_header_chunk() + chunk, autoplay=True
|
460 |
+
),
|
461 |
+
history,
|
462 |
+
)
|
463 |
+
wait_time = len(chunk) / 2 / 24000
|
464 |
+
wait_time = AUDIO_WAIT_MODIFIER * wait_time
|
465 |
+
print("Sleeping till chunk end")
|
466 |
+
time.sleep(wait_time)
|
467 |
+
|
468 |
+
else:
|
469 |
+
wav_chunks += chunk
|
470 |
+
frame_length += len(chunk)
|
471 |
+
except:
|
472 |
+
# hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
|
473 |
+
continue
|
474 |
+
|
475 |
+
if not DIRECT_STREAM:
|
476 |
+
yield (
|
477 |
+
gr.Audio.update(value=None, autoplay=True),
|
478 |
+
history,
|
479 |
+
) # hack to switch autoplay
|
480 |
+
if audio_stream is not None:
|
481 |
+
yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
|
482 |
+
# Streaming wait time calculation
|
483 |
+
# audio_length = frame_length / sample_width/ frame_rate
|
484 |
+
wait_time = frame_length / 2 / 24000
|
485 |
+
|
486 |
+
# for non streaming
|
487 |
+
# wait_time= librosa.get_duration(path=wav)
|
488 |
+
|
489 |
+
wait_time = AUDIO_WAIT_MODIFIER * wait_time
|
490 |
+
print("Sleeping till audio end")
|
491 |
+
time.sleep(wait_time)
|
492 |
+
else:
|
493 |
+
# Either too much text or some programming, give a silence so stream continues
|
494 |
+
second_of_silence = AudioSegment.silent() # use default
|
495 |
+
second_of_silence.export("sil.wav", format="wav")
|
496 |
+
yield (gr.Audio.update(value="sil.wav", autoplay=True), history)
|
497 |
|
498 |
except RuntimeError as e:
|
499 |
if "device-side assert" in str(e):
|
|
|
511 |
print("RuntimeError: non device-side assert error:", str(e))
|
512 |
raise e
|
513 |
|
514 |
+
time.sleep(1.5)
|
515 |
wav_bytestream = wave_header_chunk() + wav_bytestream
|
516 |
outfile = "combined.wav"
|
517 |
with open(outfile, "wb") as f:
|
|
|
527 |
chatbot = gr.Chatbot(
|
528 |
[],
|
529 |
elem_id="chatbot",
|
530 |
+
avatar_images=("examples/mirror.png", "examples/coqui-logo.png"),
|
531 |
bubble_full_width=False,
|
532 |
)
|
533 |
|