gorkemgoknar committed
Commit 2b2b539
1 Parent(s): ca0feab

Update app.py

Files changed (1):
  1. app.py +76 -18
app.py CHANGED
@@ -13,6 +13,7 @@ import torch
 import nltk  # we'll use this to split into sentences
 nltk.download("punkt")

+import subprocess
 import langid
 import uuid

@@ -114,8 +115,8 @@ import numpy as np
 from gradio_client import Client
 from huggingface_hub import InferenceClient

-WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 30))
-whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
+WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
+whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/", timeout=WHISPER_TIMEOUT)
 text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
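Both the Whisper space client and the Mistral InferenceClient now share the timeout read from the WHISPER_TIMEOUT environment variable, and its default goes from 30 to 45 seconds. A minimal sketch of raising it for a slower deployment, assuming the variable is set before app.py reads it at import time (the 90-second value is only an example):

import os

os.environ["WHISPER_TIMEOUT"] = "90"  # hypothetical override; must happen before app.py runs

WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
print(WHISPER_TIMEOUT)  # -> 90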
@@ -133,8 +134,25 @@ def get_latents(speaker_wav):
     ) = model.get_conditioning_latents(audio_path=speaker_wav)
     return gpt_cond_latent, diffusion_conditioning, speaker_embedding

-def get_latents(speaker_wav):
-    # Generate speaker embedding and latents for TTS
+def get_latents(speaker_wav, voice_cleanup=False):
+    if voice_cleanup:
+        try:
+            cleanup_filter = "lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+            resample_filter = "-ac 1 -ar 22050"
+            out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # ffmpeg needs the extension to know the output format
+            # we will use a newer ffmpeg as that has the afftdn denoise filter
+            shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
+
+            command_result = subprocess.run(shell_command, capture_output=False, text=True, check=True)
+            speaker_wav = out_filename
+            print("Filtered microphone input")
+        except subprocess.CalledProcessError:
+            # There was an error - the command exited with a non-zero code
+            print("Error: failed filtering, using original microphone input")
+    else:
+        speaker_wav = speaker_wav
+
+    # created as a function so we can populate it with voice cleanup/filtering
     (
         gpt_cond_latent,
         diffusion_conditioning,
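For reference, the filtering that get_latents(..., voice_cleanup=True) applies is a single ffmpeg pass over the reference clip. Below is a standalone sketch of the same invocation; the filter chain and the mono/22050 Hz resampling are copied from the hunk above, while the input path and the fallback handling are illustrative only:

import subprocess
import uuid

speaker_wav = "speaker.wav"  # placeholder input path
cleanup_filter = (
    "lowpass=8000,highpass=75,areverse,"
    "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
)
out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  # extension tells ffmpeg the output format

# Band-limit the clip, trim silence from both ends (the areverse pair handles the tail),
# then resample to mono 22050 Hz before the conditioning latents are extracted.
cmd = ["ffmpeg", "-y", "-i", speaker_wav, "-af", cleanup_filter, "-ac", "1", "-ar", "22050", out_filename]
try:
    subprocess.run(cmd, text=True, check=True)
except subprocess.CalledProcessError:
    out_filename = speaker_wav  # filtering failed: fall back to the original clip, as the app does

Passing the arguments as a list sidesteps the space-splitting the committed code does on an f-string, which would break if an uploaded file path ever contained a space.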
@@ -161,11 +179,9 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()

 xtts_supported_languages = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-
+def detect_language(prompt):
     # Fast language autodetection
-    if len(prompt) > 15 and language == "autodetect":
+    if len(prompt) > 15:
         language_predicted = langid.classify(prompt)[0].strip()  # strip needed as there is a space at the end!
         if language_predicted == "zh":
             # we use zh-cn on xtts
@@ -181,7 +197,12 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
         # Hard to detect language fast in short sentence, use english default
         language = "en"
         print(f"Language: Prompt is short or autodetect language disabled using english for xtts")
+
+    return language

+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+
     try:
         t0 = time.time()
         chunks = model.inference_stream(
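detect_language is the autodetection block pulled out of get_voice_streaming into its own helper: prompts of 15 characters or fewer skip langid and default to "en", and a "zh" prediction is mapped to XTTS's "zh-cn". The sketch below restates the helper only for illustration (the middle of its body, including any check against xtts_supported_languages, lies outside the hunks shown), and langid's guesses on real input are not guaranteed:

import langid

xtts_supported_languages = ["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]

def detect_language(prompt):
    # Illustrative re-statement of the helper above, not the exact committed body.
    language = "en"  # short prompts (and unsupported predictions) fall back to English
    if len(prompt) > 15:
        predicted = langid.classify(prompt)[0].strip()
        if predicted == "zh":
            predicted = "zh-cn"  # xtts uses zh-cn
        if predicted in xtts_supported_languages:
            language = predicted
    return language

print(detect_language("Merhaba, bugün hava gerçekten çok güzel."))  # likely "tr"
print(detect_language("Hi."))                                       # "en": too short to classify reliably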
@@ -197,7 +218,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
                 first_chunk_time = time.time() - t0
                 metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
                 first_chunk = False
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")

             # In case output is required to be multiple voice files
             # out_file = f'{char}_{i}.wav'
@@ -368,22 +389,48 @@ def get_sentence(history, system_prompt=""):
     sentence_hash_list = []

     text_to_generate = ""
+    stored_sentence = None
+    stored_sentence_hash = None
     for character in generate(history[-1][0], history[:-1]):
         history[-1][1] = character
         # It is coming word by word

         text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-
         if len(text_to_generate) > 1:
             dif = len(text_to_generate) - len(sentence_list)

             if dif == 1 and len(sentence_list) != 0:
                 continue

-            sentence = text_to_generate[len(sentence_list)]
-            # This is expensive, replace with hashing!
-            sentence_hash = hash(sentence)
+            if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
+                continue

+            # All this complexity comes from trying to append a first short sentence to the next one for proper language auto-detection
+            if stored_sentence is not None and stored_sentence_hash is None and dif > 1:
+                # means we consumed the stored sentence and should look at the next sentence to generate
+                sentence = text_to_generate[len(sentence_list) + 1]
+            elif stored_sentence is not None and len(text_to_generate) > 2 and stored_sentence_hash is not None:
+                print("Appending stored")
+                sentence = stored_sentence + text_to_generate[len(sentence_list) + 1]
+                stored_sentence_hash = None
+            else:
+                sentence = text_to_generate[len(sentence_list)]
+
+            # too short a sentence: just append it to the next one if there is any
+            # this is for proper language detection
+            if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
+                if sentence[-1] in [".", "!", "?"]:
+                    if stored_sentence_hash != hash(sentence):
+                        stored_sentence = sentence
+                        stored_sentence_hash = hash(sentence)
+                        print("Storing:", stored_sentence)
+                        continue
+
+
+            sentence_hash = hash(sentence)
+            if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
+                continue
+
             if sentence_hash not in sentence_hash_list:
                 sentence_hash_list.append(sentence_hash)
                 sentence_list.append(sentence)
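All of the stored_sentence bookkeeping exists so that a very short leading sentence ("Sure.", "Okay!") is never sent to language autodetection on its own; it is held back and prepended to the next tokenized sentence. A stripped-down sketch of that idea outside the streaming loop; merge_short_sentences and the joining space are mine, only the 15-character threshold and the end-of-sentence punctuation check come from the hunk:

def merge_short_sentences(sentences, min_len=15):
    # Buffer a short leading sentence and glue it onto the one that follows,
    # so language detection sees enough text. Not the app's exact control flow.
    merged = []
    stored = None
    for s in sentences:
        if stored is not None:
            s = stored + " " + s
            stored = None
        if len(s) <= min_len and s[-1] in (".", "!", "?"):
            stored = s  # too short on its own: hold it for the next sentence
            continue
        merged.append(s)
    if stored is not None:
        merged.append(stored)  # nothing followed it, emit as-is (like the last-sentence path)
    return merged

print(merge_short_sentences(["Sure.", "Here is a longer reply about the weather today.", "Bye!"]))
# -> ['Sure. Here is a longer reply about the weather today.', 'Bye!']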
@@ -394,9 +441,14 @@ def get_sentence(history, system_prompt=""):
     last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
     sentence_hash = hash(last_sentence)
     if sentence_hash not in sentence_hash_list:
+        if stored_sentence is not None and stored_sentence_hash is not None:
+            last_sentence = stored_sentence + last_sentence
+            stored_sentence = stored_sentence_hash = None
+            print("Last Sentence with stored:", last_sentence)
+
         sentence_hash_list.append(sentence_hash)
         sentence_list.append(last_sentence)
-        print("New Sentence: ", last_sentence)
+        print("Last Sentence: ", last_sentence)

     yield (last_sentence, history)

@@ -408,6 +460,7 @@ def generate_speech(history):
     wav_bytestream = b""
     for sentence, history in get_sentence(history):
         print(sentence)
+
         # Sometimes the prompt </s> token comes through in the output, remove it
         # Some post-processing for speech only
         sentence = sentence.replace("</s>", "")
@@ -417,9 +470,9 @@ def generate_speech(history):
         sentence = sentence.replace("```", "")
         sentence = sentence.replace("(", " ")
         sentence = sentence.replace(")", " ")
-
+
         # A fast fix for the last character, may produce weird sounds if it is attached to text
-        if sentence[-1] in ["!", "?", ".", ","]:
+        if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
             # just add a space
             sentence = sentence[:-1] + " " + sentence[-1]
         print("Sentence for speech:", sentence)
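The widened check now also pads when the punctuation is the second-to-last character (for example a sentence ending in a quoted exclamation). Note that sentence[-2] assumes at least two characters, so an unlikely single-character sentence would raise IndexError here. A tiny sketch of the transform:

def pad_trailing_punct(sentence):
    # Insert a space before the final character when the sentence ends (or nearly ends)
    # in punctuation, which the code comments note can otherwise produce odd sounds.
    if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
        sentence = sentence[:-1] + " " + sentence[-1]
    return sentence

print(pad_trailing_punct("See you tomorrow."))  # 'See you tomorrow .'
print(pad_trailing_punct('He said "stop!"'))    # 'He said "stop! "'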
@@ -436,7 +489,12 @@ def generate_speech(history):
             print("SPLITTED LONG SENTENCE:", sentence_list)

         for sentence in sentence_list:
+
             if any(c.isalnum() for c in sentence):
+                if language == "autodetect":
+                    # autodetect on the first call; subsequent sentences will use the same language
+                    language = detect_language(sentence)
+
                 # exists at least 1 alphanumeric (utf-8)
                 audio_stream = get_voice_streaming(
                     sentence, language, latent_map["Female_Voice"]
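Because language is a local that gets overwritten the first time a sentence reaches synthesis, detect_language runs only once per reply and every later sentence reuses the result. A small sketch of that caching pattern with stubbed helpers (detect_language and synthesize here stand in for the app's detect_language and get_voice_streaming):

def detect_language(prompt):
    return "fr"  # stub: the app would call langid here

def synthesize(sentence, language):
    print(f"[{language}] {sentence}")  # stub: the app streams XTTS audio here

def speak_reply(sentences, language="autodetect"):
    for sentence in sentences:
        if not any(c.isalnum() for c in sentence):
            continue
        if language == "autodetect":
            # only the first sentence with text is classified; later ones reuse the result
            language = detect_language(sentence)
        synthesize(sentence, language)

speak_reply(["Bonjour !", "Comment ça va aujourd'hui ?"])
# [fr] Bonjour !
# [fr] Comment ça va aujourd'hui ?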
@@ -511,7 +569,7 @@ def generate_speech(history):
         print("RuntimeError: non device-side assert error:", str(e))
         raise e

-    time.sleep(1.5)
+    time.sleep(1)
     wav_bytestream = wave_header_chunk() + wav_bytestream
     outfile = "combined.wav"
     with open(outfile, "wb") as f:
@@ -587,4 +645,4 @@ Note:
 - iOS (Iphone/Ipad) devices may not experience voice due to autoplay being disabled on these devices by Vendor"""
     )
     demo.queue()
-demo.launch(debug=True)
+demo.launch(debug=True)