gorkemgoknar committed
Commit 2b2b539 • Parent(s): ca0feab
Update app.py

app.py CHANGED
@@ -13,6 +13,7 @@ import torch
 import nltk  # we'll use this to split into sentences
 nltk.download("punkt")
 
+import subprocess
 import langid
 import uuid
 
@@ -114,8 +115,8 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient
 
-WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT",
-whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
+whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/", timeout=WHISPER_TIMEOUT)
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
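
The two edited lines above boil down to one environment-driven timeout shared by both clients. A minimal sketch of the resulting setup (values and URLs as in the commit; overriding is just a matter of exporting WHISPER_TIMEOUT before launching the app):

import os
from gradio_client import Client
from huggingface_hub import InferenceClient

# One knob for both clients; the commit's default is 45 seconds.
WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))

whisper_client = Client(
    "https://sanchit-gandhi-whisper-large-v2.hf.space/",
    timeout=WHISPER_TIMEOUT,
)
text_client = InferenceClient(
    "mistralai/Mistral-7B-Instruct-v0.1",
    timeout=WHISPER_TIMEOUT,
)
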
@@ -133,8 +134,25 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding
 
-def get_latents(speaker_wav):
-
+def get_latents(speaker_wav, voice_cleanup=False):
+    if (voice_cleanup):
+        try:
+            cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+            resample_filter="-ac 1 -ar 22050"
+            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  #ffmpeg to know output format
+            #we will use newer ffmpeg as that has afftn denoise filter
+            shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+
+            command_result = subprocess.run([item for item in shell_command], capture_output=False, text=True, check=True)
+            speaker_wav=out_filename
+            print("Filtered microphone input")
+        except subprocess.CalledProcessError:
+            # There was an error - command exited with non-zero code
+            print("Error: failed filtering, use original microphone input")
+    else:
+        speaker_wav=speaker_wav
+
+    # create as function as we can populate here with voice cleanup/filtering
     (
         gpt_cond_latent,
         diffusion_conditioning,
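
The new voice_cleanup branch shells out to ffmpeg to band-limit the reference audio and trim leading/trailing silence before the conditioning latents are computed. A self-contained sketch of the same idea, assuming ffmpeg is on PATH; the filter chain and the 22050 Hz mono resampling mirror the diff, while the function name and return value are illustrative only:

import subprocess
import uuid

CLEANUP_FILTER = (
    "lowpass=8000,highpass=75,"
    "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
)

def clean_reference_wav(speaker_wav: str) -> str:
    """Return a filtered copy of speaker_wav, or the original path on failure."""
    out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
    cmd = [
        "ffmpeg", "-y",
        "-i", speaker_wav,
        "-af", CLEANUP_FILTER,
        "-ac", "1",      # mono
        "-ar", "22050",  # 22.05 kHz, as in the commit
        out_filename,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
        return out_filename
    except subprocess.CalledProcessError:
        # ffmpeg exited with a non-zero code; fall back to the raw input.
        return speaker_wav

Passing the arguments as a list avoids the whitespace-splitting pitfall of the f-string plus .split(" ") approach in the commit, which breaks if the input path contains spaces.
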
@@ -161,11 +179,9 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()
 
 xtts_supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-
+def detect_language(prompt):
     # Fast language autodetection
-    if len(prompt)>15
+    if len(prompt)>15:
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
         if language_predicted == "zh":
             #we use zh-cn on xtts
@@ -181,7 +197,12 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         # Hard to detect language fast in short sentence, use english default
         language = "en"
         print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
+    return language
 
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(
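
The language detection that previously lived inside get_voice_streaming is now its own helper. A standalone sketch of the behaviour, assuming the diff's rules: langid for prompts longer than 15 characters, zh mapped to zh-cn for XTTS, and an English fallback otherwise (the handling of languages outside xtts_supported_languages is not visible in this hunk, so the sketch simply falls back to "en"):

import langid

xtts_supported_languages = ["en", "es", "fr", "de", "it", "pt", "pl",
                            "tr", "ru", "nl", "cs", "ar", "zh-cn"]

def detect_language(prompt: str) -> str:
    # Very short prompts are unreliable to classify; default to English.
    if len(prompt) <= 15:
        return "en"
    language_predicted = langid.classify(prompt)[0].strip()
    if language_predicted == "zh":
        # XTTS uses the zh-cn code for Chinese.
        language_predicted = "zh-cn"
    if language_predicted not in xtts_supported_languages:
        return "en"
    return language_predicted

print(detect_language("Bonjour, comment allez-vous aujourd'hui ?"))  # likely "fr"
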
@@ -197,7 +218,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
             first_chunk_time = time.time() - t0
             metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
             first_chunk = False
-        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+        #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
 
         # In case output is required to be multiple voice files
         # out_file = f'{char}_{i}.wav'
@@ -368,22 +389,48 @@ def get_sentence(history, system_prompt=""):
     sentence_hash_list = []
 
     text_to_generate = ""
+    stored_sentence = None
+    stored_sentence_hash = None
     for character in generate(history[-1][0], history[:-1]):
         history[-1][1] = character
         # It is coming word by word
 
         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)
 
             if dif == 1 and len(sentence_list) != 0:
                 continue
 
-
-
-            sentence_hash = hash(sentence)
+            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                continue
 
+            # All this complexity due to trying append first short sentence to next one for proper language auto-detect
+            if stored_sentence is not None and stored_sentence_hash is None and dif>1:
+                #means we consumed stored sentence and should look at next sentence to generate
+                sentence = text_to_generate[len(sentence_list)+1]
+            elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
+                print("Appending stored")
+                sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
+                stored_sentence_hash = None
+            else:
+                sentence = text_to_generate[len(sentence_list)]
+
+            # too short sentence just append to next one if there is any
+            # this is for proper language detection
+            if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
+                if sentence[-1] in [".","!","?"]:
+                    if stored_sentence_hash != hash(sentence):
+                        stored_sentence = sentence
+                        stored_sentence_hash = hash(sentence)
+                        print("Storing:",stored_sentence)
+                        continue
+
+
+            sentence_hash = hash(sentence)
+            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                continue
+
             if sentence_hash not in sentence_hash_list:
                 sentence_hash_list.append(sentence_hash)
                 sentence_list.append(sentence)
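
The stored_sentence machinery buffers any sentence of 15 characters or fewer and prepends it to the next one, so the text handed to language detection and TTS is long enough to classify. A simplified, self-contained sketch of that buffering idea; it works on a plain list of sentences rather than the incremental tokenizer output, so the names and control flow here are illustrative, not the commit's exact logic:

def merge_short_sentences(sentences, min_len=15):
    """Yield sentences, folding any sentence of <= min_len chars into the next one."""
    stored = None
    for sentence in sentences:
        if stored is not None:
            sentence = stored + " " + sentence
            stored = None
        if len(sentence) <= min_len:
            # Too short for reliable language detection; hold it back.
            stored = sentence
            continue
        yield sentence
    if stored is not None:
        # Flush whatever is still buffered at the end of the stream.
        yield stored

chunks = ["Hi.", "This sentence is comfortably long enough.", "Ok?"]
print(list(merge_short_sentences(chunks)))
# ['Hi. This sentence is comfortably long enough.', 'Ok?']
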
@@ -394,9 +441,14 @@ def get_sentence(history, system_prompt=""):
     last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
     sentence_hash = hash(last_sentence)
     if sentence_hash not in sentence_hash_list:
+        if stored_sentence is not None and stored_sentence_hash is not None:
+            last_sentence = stored_sentence + last_sentence
+            stored_sentence = stored_sentence_hash = None
+            print("Last Sentence with stored:",last_sentence)
+
         sentence_hash_list.append(sentence_hash)
         sentence_list.append(last_sentence)
-        print("
+        print("Last Sentence: ", last_sentence)
 
         yield (last_sentence, history)
 
@@ -408,6 +460,7 @@ def generate_speech(history):
     wav_bytestream = b""
     for sentence, history in get_sentence(history):
         print(sentence)
+
         # Sometimes prompt </s> coming on output remove it
         # Some post process for speech only
         sentence = sentence.replace("</s>", "")
@@ -417,9 +470,9 @@ def generate_speech(history):
         sentence = sentence.replace("```", "")
         sentence = sentence.replace("(", " ")
         sentence = sentence.replace(")", " ")
-
+
         # A fast fix for last chacter, may produce weird sounds if it is with text
-        if sentence[-1] in ["!", "?", ".", ","]:
+        if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
             # just add a space
             sentence = sentence[:-1] + " " + sentence[-1]
         print("Sentence for speech:", sentence)
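
The widened condition also looks at the second-to-last character, so sentences whose punctuation sits one position before the end (for example before a closing quote) still get the space inserted before TTS. A small sketch of the same fix with an explicit length guard; the guard is an addition for safety (sentence[-2] raises IndexError on single-character strings) and is not part of the commit:

PUNCT = ["!", "?", ".", ","]

def pad_trailing_punct(sentence: str) -> str:
    # Insert a space before the final character when the sentence ends
    # (or nearly ends) with punctuation, which the commit notes renders better.
    if len(sentence) >= 2 and (sentence[-1] in PUNCT or sentence[-2] in PUNCT):
        return sentence[:-1] + " " + sentence[-1]
    return sentence

print(pad_trailing_punct("Hello there."))    # Hello there .
print(pad_trailing_punct('Hello there."'))   # Hello there. "
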
@@ -436,7 +489,12 @@ def generate_speech(history):
         print("SPLITTED LONG SENTENCE:",sentence_list)
 
         for sentence in sentence_list:
+
             if any(c.isalnum() for c in sentence):
+                if language=="autodetect":
+                    #on first call autodetect, nexts sentence calls will use same language
+                    language = detect_language(sentence)
+
                 #exists at least 1 alphanumeric (utf-8)
                 audio_stream = get_voice_streaming(
                     sentence, language, latent_map["Female_Voice"]
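
Here "autodetect" acts as a sentinel value: the first sentence that reaches TTS resolves it to a concrete language code, and every later sentence in the same reply reuses that code. A small sketch of the pattern, reusing the detect_language sketch above; speak() is a hypothetical stand-in for get_voice_streaming:

def speak(sentence, language):
    # Hypothetical stand-in for get_voice_streaming(...).
    print(f"[{language}] {sentence}")

def speak_reply(sentences, language="autodetect"):
    for sentence in sentences:
        if not any(c.isalnum() for c in sentence):
            continue  # skip punctuation-only fragments
        if language == "autodetect":
            # Resolve once, on the first real sentence; reuse afterwards.
            language = detect_language(sentence)
        speak(sentence, language)

speak_reply(["Guten Morgen, wie geht es dir heute?", "Mir geht es gut."])
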
@@ -511,7 +569,7 @@
         print("RuntimeError: non device-side assert error:", str(e))
         raise e
 
-    time.sleep(1
+    time.sleep(1)
     wav_bytestream = wave_header_chunk() + wav_bytestream
     outfile = "combined.wav"
     with open(outfile, "wb") as f:
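
Once streaming ends, the accumulated raw PCM is prefixed with a WAV header (wave_header_chunk) and written to combined.wav. A runnable sketch of that step with the standard wave module; channels=1 and sample_width=2 come from the truncated wave_header_chunk signature above, while the 24000 Hz rate is an assumption, since the real default is cut off in this view:

import io
import wave

def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    # Wrap raw 16-bit mono PCM frames in an in-memory WAV container.
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()

wav_bytestream = b"\x00\x00" * 24000  # one second of silence as stand-in PCM
with open("combined.wav", "wb") as f:
    f.write(wave_header_chunk(frame_input=wav_bytestream))

The app instead concatenates an empty header with the byte stream (wave_header_chunk() + wav_bytestream), which relies on the player ignoring the header's zero length fields; passing the frames in as above produces a fully consistent file.
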
@@ -587,4 +645,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
 )
 demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)