jbilcke-hf committed on
Commit b2dfa1d
1 Parent(s): d4d3c50

Upload 5 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ voices/julian-bedtime-style-1.wav filter=lfs diff=lfs merge=lfs -text
37
+ voices/julian-bedtime-style-2.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,646 @@
1
+ from __future__ import annotations
2
+ import os
3
+ # we need to compile a cuBLAS-enabled build of llama-cpp-python
4
+ # or get a prebuilt wheel from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
5
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
6
+
7
+ # By using XTTS you agree to CPML license https://coqui.ai/cpml
8
+ os.environ["COQUI_TOS_AGREED"] = "1"
9
+
10
+ # NOTE: streaming requires the gradio audio streaming fix:
11
+ # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
12
+
13
+ import textwrap
14
+ from scipy.io.wavfile import write
15
+ from pydub import AudioSegment
16
+ import gradio as gr
17
+ import numpy as np
18
+ import torch
19
+ import nltk # we'll use this to split into sentences
20
+ nltk.download("punkt")
21
+
22
+ import noisereduce as nr
23
+ import subprocess
24
+ import langid
25
+ import uuid
26
+ import emoji
27
+ import pathlib
28
+
29
+ import datetime
30
+
31
+ from scipy.io.wavfile import write
32
+ from pydub import AudioSegment
33
+
34
+ import re
35
+ import io, wave
36
+ import librosa
37
+ import torchaudio
38
+ from TTS.api import TTS
39
+ from TTS.tts.configs.xtts_config import XttsConfig
40
+ from TTS.tts.models.xtts import Xtts
41
+ from TTS.utils.generic_utils import get_user_data_dir
42
+
43
+
44
+ import gradio as gr
45
+ import os
46
+ import time
47
+
48
+ import gradio as gr
49
+ from transformers import pipeline
50
+ import numpy as np
51
+
52
+ from gradio_client import Client
53
+ from huggingface_hub import InferenceClient
54
+
55
+ # This will trigger the model download if it is not already cached
56
+ print("Downloading Coqui XTTS V2 if not already downloaded")
57
+
58
+ from TTS.utils.manage import ModelManager
59
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
60
+ ModelManager().download_model(model_name)
61
+ model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
62
+ print("XTTS downloaded")
63
+
64
+ print("Loading XTTS")
65
+ config = XttsConfig()
66
+ config.load_json(os.path.join(model_path, "config.json"))
67
+
68
+ model = Xtts.init_from_config(config)
69
+ model.load_checkpoint(
70
+ config,
71
+ checkpoint_path=os.path.join(model_path, "model.pth"),
72
+ vocab_path=os.path.join(model_path, "vocab.json"),
73
+ eval=True,
74
+ use_deepspeed=True,
75
+ )
76
+ model.cuda()
77
+ print("Done loading TTS")
78
+
79
+ title = "Voice chat with Zephyr and Coqui XTTS"
80
+
81
+ DESCRIPTION = """# Voice chat with Zephyr and Coqui XTTS"""
82
+
83
+ from huggingface_hub import HfApi
84
+
85
+ HF_TOKEN = os.environ.get("HF_TOKEN")
86
+ # we will use the API to restart the Space on an unrecoverable error
87
+ api = HfApi(token=HF_TOKEN)
88
+
89
+ repo_id = "jbilcke-hf/zephyr-xtts"
90
+
91
+ default_system_message = f"""
92
+ You're the storyteller, crafting a short tale for young listeners. Please abide by these guidelines:
93
+ - Keep your sentences short, concise and easy to understand.
94
+ - There should be only the narrator speaking. If there are dialogues, they should be indirect.
95
+ - Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper.
96
+ - Don’t use complex words. Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken.
97
+ - Type out numbers in words (e.g. 'twenty twelve' instead of the year 2012).
98
+ - Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them.
99
+ """
100
+
101
+ system_message = os.environ.get("SYSTEM_MESSAGE", default_system_message)
102
+ system_message = system_message.replace("CURRENT_DATE", str(datetime.date.today()))
103
+
104
+ ROLES = ["Julian","Pirate"]
105
+
106
+ ROLE_PROMPTS = {}
107
+ ROLE_PROMPTS["Julian"]=system_message
108
+
109
+ #Pirate scenario
110
+ character_name= "AI Beard"
111
+ character_scenario= f"As {character_name} you are a 28-year-old man who is a pirate on the ship Invisible AI. You are good friends with Guybrush Threepwood and Murray the Skull. The developers did not put you in the Monkey Island games because you wanted a huge share of the Big Whoop treasure."
112
+ pirate_system_message = f"You are {character_name}. {character_scenario} Print out only exactly the words that {character_name} would speak out, do not add anything. Don't repeat. Answer briefly, in only a few words, as if in a conversation. Craft your response only from the first-person perspective of {character_name} and never as the user. Current date: #CURRENT_DATE#".replace("#CURRENT_DATE#", str(datetime.date.today()))
113
+
114
+ ROLE_PROMPTS["Pirate"]= pirate_system_message
115
+ ##"You are an AI assistant built on the Zephyr model by Mistral and Hugging Face, with speech from Coqui XTTS. The user will give you a task. Your goal is to complete the task as faithfully as you can. While performing the task, think step by step and justify your steps; your answers should be clear and short sentences"
116
+
117
+
118
+
119
+ ### WILL USE LOCAL MISTRAL OR ZEPHYR
120
+
121
+ from huggingface_hub import hf_hub_download
122
+ print("Downloading LLM")
123
+
124
+
125
+ print("Downloading Zephyr")
126
+ #Zephyr
127
+ hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
128
+ # use new gguf format
129
+ zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
130
+
131
+ from llama_cpp import Llama
132
+ # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in memory,
133
+ # otherwise 35 full layers + XTTS works fine on a T4 16GB
134
+ # ~5 GB per LLM, ~4 GB for XTTS -> full layers should fit on a T4 16GB (2 LLMs + XTTS)
135
+ GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))
136
+
137
+ LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
138
+
139
+ LLAMA_VERBOSE=False
140
+
141
+ print("Running LLM Zephyr")
142
+ llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-10,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
143
+
144
+ # <|system|>
145
+ # You are a friendly chatbot who always responds in the style of a pirate.</s>
146
+ # <|user|>
147
+ # How many helicopters can a human eat in one sitting?</s>
148
+ # <|assistant|>
149
+ # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!
150
+
151
+ # Zephyr formatter
152
+ def format_prompt_zephyr(message, history, system_message=system_message):
153
+ prompt = (
154
+ "<|system|>\n" + system_message + "</s>"
155
+ )
156
+ for user_prompt, bot_response in history:
157
+ prompt += f"<|user|>\n{user_prompt}</s>"
158
+ prompt += f"<|assistant|>\n{bot_response}</s>"
159
+ if message=="":
160
+ message="Hello"
161
+ prompt += f"<|user|>\n{message}</s>"
162
+ prompt += f"<|assistant|>"
163
+ print(prompt)
164
+ return prompt
165
+
166
+ def generate_local(
167
+ prompt,
168
+ history,
169
+ system_message=None,
170
+ temperature=0.8,
171
+ max_tokens=256,
172
+ top_p=0.95,
173
+ stop = LLM_STOP_WORDS
174
+ ):
175
+ temperature = float(temperature)
176
+ if temperature < 1e-2:
177
+ temperature = 1e-2
178
+ top_p = float(top_p)
179
+
180
+ generate_kwargs = dict(
181
+ temperature=temperature,
182
+ max_tokens=max_tokens,
183
+ top_p=top_p,
184
+ stop=stop
185
+ )
186
+
187
+ sys_message= system_message.replace("##LLM_MODEL###","Zephyr").replace("##LLM_MODEL_PROVIDER###","Hugging Face")
188
+ formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message)
189
+ llm = llm_zephyr
190
+
191
+
192
+ try:
193
+ print("LLM Input:", formatted_prompt)
194
+ stream = llm(
195
+ formatted_prompt,
196
+ **generate_kwargs,
197
+ stream=True,
198
+ )
199
+ output = ""
200
+ for response in stream:
201
+ character= response["choices"][0]["text"]
202
+
203
+ if "<|user|>" in character:
204
+ # end of context
205
+ return
206
+
207
+ if emoji.is_emoji(character):
208
+ # A stray emoji carries no meaning and messes up the chat on the following lines
209
+ return
210
+
211
+
212
+ output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
213
+ yield output
214
+
215
+ except Exception as e:
216
+ if "Too Many Requests" in str(e):
217
+ print("ERROR: Too many requests on mistral client")
218
+ gr.Warning("Unfortunately Mistral is unable to process")
219
+ output = "Unfortunately I am not able to process your request right now!"
220
+ else:
221
+ print("Unhandled Exception: ", str(e))
222
+ gr.Warning("Unfortunately Mistral is unable to process")
223
+ output = "I do not know what happened, but I could not understand you."
224
+
225
+ return output
226
+
227
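+ # Compute XTTS conditioning latents for a reference speaker wav (optionally cleaned up with ffmpeg first)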
+ def get_latents(speaker_wav,voice_cleanup=False):
228
+ if (voice_cleanup):
229
+ try:
230
+ cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
231
+ resample_filter="-ac 1 -ar 22050"
232
+ out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
233
+ # we will use a newer ffmpeg, as it has the afftdn denoise filter
234
+ shell_command = f"ffmpeg -y -i {speaker_wav} -af {cleanup_filter} {resample_filter} {out_filename}".split(" ")
235
+
236
+ command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
237
+ speaker_wav=out_filename
238
+ print("Filtered microphone input")
239
+ except subprocess.CalledProcessError:
240
+ # There was an error - command exited with non-zero code
241
+ print("Error: filtering failed, using original microphone input")
242
+ else:
243
+ speaker_wav=speaker_wav
244
+
245
+ # kept as a function so we can add voice cleanup/filtering here
246
+ (
247
+ gpt_cond_latent,
248
+ speaker_embedding,
249
+ ) = model.get_conditioning_latents(audio_path=speaker_wav)
250
+ return gpt_cond_latent, speaker_embedding
251
+
252
+ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
253
+ # This will create a wave header and then append the frame input
254
+ # It should come first in a streaming wav file
255
+ # Other frames should not have it (else you will hear artifacts at the start of each chunk)
256
+ wav_buf = io.BytesIO()
257
+ with wave.open(wav_buf, "wb") as vfout:
258
+ vfout.setnchannels(channels)
259
+ vfout.setsampwidth(sample_width)
260
+ vfout.setframerate(sample_rate)
261
+ vfout.writeframes(frame_input)
262
+
263
+ wav_buf.seek(0)
264
+ return wav_buf.read()
265
+
266
+
267
+ # The config holds the authoritative language list; more languages may be added there over time
268
+ ##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]
269
+
270
+ xtts_supported_languages=config.languages
271
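+ # Detect the prompt language with langid and fall back to English if XTTS does not support it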
+ def detect_language(prompt):
272
+ # Fast language autodetection
273
+ if len(prompt)>15:
274
+ language_predicted=langid.classify(prompt)[0].strip() # strip needed as there is a space at the end!
275
+ if language_predicted == "zh":
276
+ #we use zh-cn on xtts
277
+ language_predicted = "zh-cn"
278
+
279
+ if language_predicted not in xtts_supported_languages:
280
+ print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
281
+ gr.Warning(f"Language detected '{language_predicted}' cannot be spoken properly yet")
282
+ language= "en"
283
+ else:
284
+ language = language_predicted
285
+ print(f"Language: Predicted sentence language:{language_predicted} , using language for xtts:{language}")
286
+ else:
287
+ # Hard to detect language fast in short sentence, use english default
288
+ language = "en"
289
+ print(f"Language: prompt is short or language autodetect disabled, using English for XTTS")
290
+
291
+ return language
292
+
293
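+ # Stream XTTS audio for one sentence, yielding raw 16-bit PCM chunks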
+ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
294
+ gpt_cond_latent, speaker_embedding = latent_tuple
295
+
296
+ try:
297
+ t0 = time.time()
298
+ chunks = model.inference_stream(
299
+ prompt,
300
+ language,
301
+ gpt_cond_latent,
302
+ speaker_embedding,
303
+ #repetition_penalty=5.0,
304
+ temperature=0.85,
305
+ )
306
+
307
+ first_chunk = True
308
+ for i, chunk in enumerate(chunks):
309
+ if first_chunk:
310
+ first_chunk_time = time.time() - t0
311
+ metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
312
+ first_chunk = False
313
+ #print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
314
+
315
+ # directly return chunk as bytes for streaming
316
+ chunk = chunk.detach().cpu().numpy().squeeze()
317
+ chunk = (chunk * 32767).astype(np.int16)
318
+
319
+ yield chunk.tobytes()
320
+
321
+ except RuntimeError as e:
322
+ if "device-side assert" in str(e):
323
+ # cannot do anything about a CUDA device-side error, need to restart
324
+ print(
325
+ f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
326
+ flush=True,
327
+ )
328
+ gr.Warning("Unhandled exception encountered, please retry in a minute")
329
+ print("CUDA device-side assert encountered, need to restart")
330
+
331
+ # HF Space specific: this error is unrecoverable, we need to restart the Space
332
+ api.restart_space(repo_id=repo_id)
333
+ else:
334
+ print("RuntimeError: non device-side assert error:", str(e))
335
+ # Does not require a warning; happens on an empty chunk and at the end
336
+ ###gr.Warning("Unhandled Exception encounter, please retry in a minute")
337
+ return None
338
+ return None
339
+ except:
340
+ return None
341
+
342
+
343
+ # Will be triggered on text submit (will send to generate_speech)
344
+ def add_text(history, text):
345
+ history = [] if history is None else history
346
+ history = history + [(text, None)]
347
+ return history, gr.update(value="", interactive=False)
348
+
349
+ # Will be triggered on voice submit (will transcribe and send to generate_speech)
350
+ def add_file(history, file):
351
+ history = [] if history is None else history
352
+
353
+ try:
354
+ text = transcribe(file)
355
+ print("Transcribed text:", text)
356
+ except Exception as e:
357
+ print(str(e))
358
+ gr.Warning("There was an issue with transcription, please try writing for now")
359
+ # Fall back to a placeholder request on error
360
+ text = "Transcription seems to have failed, please tell me a joke about chickens"
361
+
362
+ history = history + [(text, None)]
363
+ return history, gr.update(value="", interactive=False)
364
+
365
+
366
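+ # Stream the LLM response and yield it back sentence by sentence, together with the updated history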
+ def get_sentence(history, chatbot_role):
367
+
368
+ history = [["", None]] if history is None else history
369
+
370
+ history[-1][1] = ""
371
+
372
+ sentence_list = []
373
+ sentence_hash_list = []
374
+
375
+ text_to_generate = ""
376
+ stored_sentence = None
377
+ stored_sentence_hash = None
378
+
379
+ print(chatbot_role)
380
+
381
+ for character in generate_local(history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role]):
382
+ history[-1][1] = character.replace("<|assistant|>","")
383
+ # It is coming word by word
384
+
385
+ text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
386
+ if len(text_to_generate) > 1:
387
+
388
+ dif = len(text_to_generate) - len(sentence_list)
389
+
390
+ if dif == 1 and len(sentence_list) != 0:
391
+ continue
392
+
393
+ if dif == 2 and len(sentence_list) != 0 and stored_sentence is not None:
394
+ continue
395
+
396
+ # All this complexity comes from trying to append a short first sentence to the next one for proper language auto-detection
397
+ if stored_sentence is not None and stored_sentence_hash is None and dif>1:
398
+ #means we consumed stored sentence and should look at next sentence to generate
399
+ sentence = text_to_generate[len(sentence_list)+1]
400
+ elif stored_sentence is not None and len(text_to_generate)>2 and stored_sentence_hash is not None:
401
+ print("Appending stored")
402
+ sentence = stored_sentence + text_to_generate[len(sentence_list)+1]
403
+ stored_sentence_hash = None
404
+ else:
405
+ sentence = text_to_generate[len(sentence_list)]
406
+
407
+ # if the sentence is too short, just append it to the next one (if there is any)
408
+ # this is for proper language detection
409
+ if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
410
+ if sentence[-1] in [".","!","?"]:
411
+ if stored_sentence_hash != hash(sentence):
412
+ stored_sentence = sentence
413
+ stored_sentence_hash = hash(sentence)
414
+ print("Storing:",stored_sentence)
415
+ continue
416
+
417
+
418
+ sentence_hash = hash(sentence)
419
+ if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
420
+ continue
421
+
422
+ if sentence_hash not in sentence_hash_list:
423
+ sentence_hash_list.append(sentence_hash)
424
+ sentence_list.append(sentence)
425
+ print("New Sentence: ", sentence)
426
+ yield (sentence, history)
427
+
428
+ # return that final sentence token
429
+ try:
430
+ last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
431
+ sentence_hash = hash(last_sentence)
432
+ if sentence_hash not in sentence_hash_list:
433
+ if stored_sentence is not None and stored_sentence_hash is not None:
434
+ last_sentence = stored_sentence + last_sentence
435
+ stored_sentence = stored_sentence_hash = None
436
+ print("Last Sentence with stored:",last_sentence)
437
+
438
+ sentence_hash_list.append(sentence_hash)
439
+ sentence_list.append(last_sentence)
440
+ print("Last Sentence: ", last_sentence)
441
+
442
+ yield (last_sentence, history)
443
+ except:
444
+ print("ERROR on last sentence, history is:", history)
445
+
446
+
447
+ from scipy.io.wavfile import write
448
+ from pydub import AudioSegment
449
+
450
+ second_of_silence = AudioSegment.silent() # use default
451
+ second_of_silence.export("sil.wav", format='wav')
452
+
453
+
454
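+ # Top-level speech handler: emit a wav header first, then one audio chunk per generated sentence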
+ def generate_speech(history,chatbot_role):
455
+ # Must set autoplay to True first
456
+ yield (history, chatbot_role, "", wave_header_chunk() )
457
+ for sentence, history in get_sentence(history,chatbot_role):
458
+ if sentence != "":
459
+ print("BG: inserting sentence to queue")
460
+
461
+ generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
462
+ if generated_speech is not None:
463
+ _, audio_dict = generated_speech
464
+ # We are using byte streaming
465
+ yield (history, chatbot_role, sentence, audio_dict["value"] )
466
+
467
+
468
+ # will generate speech audio file per sentence
469
+ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=False):
470
+ language = "autodetect"
471
+
472
+ wav_bytestream = b""
473
+
474
+ if len(sentence)==0:
475
+ print("EMPTY SENTENCE")
476
+ return
477
+
478
+ # Sometimes the prompt's </s> token appears in the output, remove it
480
+ # Some post-processing for speech only
480
+ sentence = sentence.replace("</s>", "")
481
+ # remove code from speech
482
+ sentence = re.sub("```.*```", "", sentence, flags=re.DOTALL)
483
+ sentence = re.sub("`.*`", "", sentence, flags=re.DOTALL)
484
+
485
+ sentence = re.sub("\(.*\)", "", sentence, flags=re.DOTALL)
486
+
487
+ sentence = sentence.replace("```", "")
488
+ sentence = sentence.replace("...", " ")
489
+ sentence = sentence.replace("(", " ")
490
+ sentence = sentence.replace(")", " ")
491
+ sentence = sentence.replace("<|assistant|>","")
492
+
493
+ if len(sentence)==0:
494
+ print("EMPTY SENTENCE after processing")
495
+ return
496
+
497
+ # A quick fix for the last character; may produce weird sounds if it is attached to text
498
+ #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
499
+ # # just add a space
500
+ # sentence = sentence[:-1] + " " + sentence[-1]
501
+
502
+ # regex does the job well
503
+ sentence= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?|\!)",r"\1 \2\2",sentence)
504
+
505
+ print("Sentence for speech:", sentence)
506
+
507
+ try:
508
+ SENTENCE_SPLIT_LENGTH=350
509
+ if len(sentence)<SENTENCE_SPLIT_LENGTH:
510
+ # no problem continue on
511
+ sentence_list = [sentence]
512
+ else:
513
+ # Up to this point nltk has likely split sentences properly, but we need an additional
514
+ # check for longer sentences and split at the last possible position
515
+ # Do whatever is necessary: first break at hyphens, then spaces, and then even split very long words
516
+ sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
517
+ print("SPLIT LONG SENTENCE:", sentence_list)
518
+
519
+ for sentence in sentence_list:
520
+
521
+ if any(c.isalnum() for c in sentence):
522
+ if language=="autodetect":
523
+ # autodetect on the first call; subsequent sentences will use the same language
524
+ language = detect_language(sentence)
525
+
526
+ #exists at least 1 alphanumeric (utf-8)
527
+ audio_stream = get_voice_streaming(
528
+ sentence, language, latent_map[chatbot_role]
529
+ )
530
+ else:
531
+ # likely got a ' or " or some other text without alphanumeric in it
532
+ audio_stream = None
533
+
534
+ # XTTS is actually using a streaming response, but we are playing audio sentence by sentence
535
+ # If you want direct XTTS voice streaming (send each chunk to the voice output) you may set the DIRECT_STREAM=1 environment variable
536
+ if audio_stream is not None:
537
+ frame_length = 0
538
+ for chunk in audio_stream:
539
+ try:
540
+ wav_bytestream += chunk
541
+ frame_length += len(chunk)
542
+ except:
543
+ # hack to keep playing; sometimes the last chunk is empty, will be fixed on the next TTS call
544
+ continue
545
+
546
+ # Filter output for better voice
547
+ filter_output=True
548
+ if filter_output:
549
+ data_s16 = np.frombuffer(wav_bytestream, dtype=np.int16, count=len(wav_bytestream)//2, offset=0)
550
+ float_data = data_s16 * 0.5**15
551
+ reduced_noise = nr.reduce_noise(y=float_data, sr=24000,prop_decrease =0.8,n_fft=1024)
552
+ wav_bytestream = (reduced_noise * 32767).astype(np.int16)
553
+ wav_bytestream = wav_bytestream.tobytes()
554
+
555
+ if audio_stream is not None:
556
+ if not return_as_byte:
557
+ audio_unique_filename = "/tmp/"+ str(uuid.uuid4())+".wav"
558
+ with wave.open(audio_unique_filename, "w") as f:
559
+ f.setnchannels(1)
560
+ # 2 bytes per sample.
561
+ f.setsampwidth(2)
562
+ f.setframerate(24000)
563
+ f.writeframes(wav_bytestream)
564
+
565
+ return (history , gr.Audio.update(value=audio_unique_filename, autoplay=True))
566
+ else:
567
+ return (history , gr.Audio.update(value=wav_bytestream, autoplay=True))
568
+ except RuntimeError as e:
569
+ if "device-side assert" in str(e):
570
+ # cannot do anything about a CUDA device-side error, need to restart
571
+ print(
572
+ f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
573
+ flush=True,
574
+ )
575
+ gr.Warning("Unhandled exception encountered, please retry in a minute")
576
+ print("CUDA device-side assert encountered, need to restart")
577
+
578
+ # HF Space specific: this error is unrecoverable, we need to restart the Space
579
+ api.restart_space(repo_id=repo_id)
580
+ else:
581
+ print("RuntimeError: non device-side assert error:", str(e))
582
+ raise e
583
+
584
+ print("All speech ended")
585
+ return
586
+
587
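+ # Precompute the voice-cloning latents for each role from the bundled reference wavs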
+ latent_map = {}
588
+ latent_map["Julian"] = get_latents("voices/julian-bedtime-style-1.wav")
589
+ latent_map["Pirate"] = get_latents("voices/pirate_by_coqui.wav")
590
+
591
+ #### GRADIO INTERFACE ####
592
+
593
+ with gr.Blocks(title=title) as demo:
594
+ chatbot = gr.Chatbot(
595
+ [],
596
+ elem_id="chatbot",
597
+ bubble_full_width=False,
598
+ )
599
+
600
+ chatbot_role = gr.Dropdown(
601
+ label="Role of the Chatbot",
602
+ info="How should the chatbot talk",
603
+ choices=ROLES,
604
+ max_choices=1,
605
+ value=ROLES[0],
606
+ )
607
+
608
+ txt = gr.Textbox(
609
+ scale=3,
610
+ show_label=False,
611
+ placeholder="Enter text and press enter, or speak to your microphone",
612
+ container=False,
613
+ interactive=True,
614
+ )
615
+ txt_btn = gr.Button(value="Submit text", scale=1)
616
+
617
+ with gr.Row():
618
+ sentence = gr.Textbox(visible=False)
619
+ audio = gr.Audio(
620
+ value=None,
621
+ label="Generated audio response",
622
+ streaming=True,
623
+ autoplay=True,
624
+ interactive=False,
625
+ show_label=True,
626
+ )
627
+
628
+ def clear_inputs(chatbot):
629
+ return None
630
+ clear_btn = gr.ClearButton([chatbot, audio])
631
+ chatbot_role.change(fn=clear_inputs, inputs=[chatbot], outputs=[chatbot])
632
+
633
+ txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
634
+ generate_speech, [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
635
+ )
636
+
637
+ txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
638
+
639
+ txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
640
+ generate_speech, [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
641
+ )
642
+
643
+ txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
644
+
645
+ demo.queue()
646
+ demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,71 @@
1
+ # Preinstall requirements from TTS
2
+ torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
3
+ torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118
4
+ torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
5
+ numpy==1.22.0;python_version<="3.10"
6
+ numpy==1.24.3;python_version>"3.10"
7
+ cython==0.29.30
8
+ scipy>=1.11.2
9
+ soundfile==0.12.*
10
+ librosa==0.10.*
11
+ scikit-learn==1.3.0
12
+ numba==0.55.1;python_version<"3.9"
13
+ numba==0.57.0;python_version>="3.9"
14
+ inflect==5.6.*
15
+ tqdm==4.64.*
16
+ anyascii==0.3.*
17
+ pyyaml==6.*
18
+ fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
19
+ aiohttp==3.8.*
20
+ packaging==23.1
21
+ # deps for examples
22
+ flask==2.*
23
+ # deps for inference
24
+ pysbd==0.3.4
25
+ # deps for notebooks
26
+ umap-learn==0.5.*
27
+ pandas>=1.4,<2.0
28
+ # deps for training
29
+ matplotlib==3.7.*
30
+ # coqui stack
31
+ trainer
32
+ # config management
33
+ coqpit>=0.0.16
34
+ # chinese g2p deps
35
+ jieba
36
+ pypinyin==0.47.1
37
+ # gruut+supported langs
38
+ gruut[de,es,fr]==2.2.3
39
+ # deps for korean
40
+ jamo
41
+ nltk
42
+ g2pkk>=0.1.1
43
+ # deps for bangla
44
+ bangla
45
+ bnnumerizer
46
+ bnunicodenormalizer
47
+ #deps for tortoise
48
+ k_diffusion
49
+ einops==0.6.*
50
+ transformers==4.33.*
51
+ #deps for bark
52
+ encodec==0.1.*
53
+ # deps for XTTS
54
+ unidecode==1.3.*
55
+ langid
56
+ # Install Coqui TTS
57
+ TTS==0.20.2
58
+ cutlet
59
+ # mecab and unidic required for japanese
60
+ mecab-python3==1.0.6
61
+ unidic-lite==1.0.8
62
+
63
+ # Deepspeed for fast inference
64
+ deepspeed==0.11.1
65
+ pydub
66
+ librosa
67
+ ffmpeg-python
68
+ gradio_client
69
+ emoji
70
+ asyncio
71
+ noisereduce==3.0.0
voices/julian-bedtime-style-1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03b02a98d64e26415ae85c5ca87befb94155637cc15a910f8f2d886c8197d428
3
+ size 1544142
voices/julian-bedtime-style-2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a35441072820a441200e18fa716ace56252c297186c9e420433f88558bfcc26
3
+ size 4210638
voices/pirate_by_coqui.wav ADDED
Binary file (381 kB).