updates

- app.py: +103 -168
- requirements.txt: +3 -1
app.py
CHANGED
@@ -1,49 +1,86 @@
 from __future__ import annotations
-
-from IPython import get_ipython
+# Downloading files of the server
 import os
-
+import requests
+def download_file(url, save_path):
+    response = requests.get(url)
+    with open(save_path, 'wb') as file:
+        file.write(response.content)
+file_names = [
+    'cloee-1.wav',
+    'julian-bedtime-style-1.wav',
+    'julian-bedtime-style-2.wav',
+    'pirate_by_coqui.wav',
+    'thera-1.wav'
+]
+base_url = 'https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/'
+save_folder = 'voices/'
+if not os.path.exists(save_folder):
+    os.makedirs(save_folder)
+for file_name in file_names:
+    url = base_url + file_name
+    save_path = os.path.join(save_folder, file_name)
+    download_file(url, save_path)
+    print(f'Downloaded {file_name}')
+requirements_url = 'https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/requirements.txt'
+save_path = 'requirements.txt'
+download_file(requirements_url, save_path)
 #os.system('pip install gradio==3.48.0')
+os.system('pip install -r requirements.txt')
 os.system('pip install python-dotenv')
-
-
-
-
-
+os.system('pip install ipython')
+from IPython.display import clear_output
+clear_output()
+import os
+import shutil
+from IPython.display import clear_output
+# Use GPU
+def is_nvidia_smi_available():
+    return shutil.which("nvidia-smi") is not None
+if is_nvidia_smi_available():
+    gpu_info = os.popen("nvidia-smi").read()
+    if gpu_info.find('failed') >= 0:
+        print('Not connected to a GPU')
+        is_gpu = False
+    else:
+        print(gpu_info)
+        is_gpu = True
+else:
+    print('nvidia-smi command not found')
     print('Not connected to a GPU')
     is_gpu = False
-else:
-    print(gpu_info)
-    is_gpu = True
-# In[2]:
-# In[3]:
 import os
 import dotenv
 # Load the environment variables from the .env file
+# You can change the default secret
+with open(".env", "w") as env_file:
+    env_file.write("SECRET_TOKEN=secret")
 dotenv.load_dotenv()
 # Access the value of the SECRET_TOKEN variable
 secret_token = os.getenv("SECRET_TOKEN")
-# In[7]:
 import os
 #download for mecab
+# Check if unidic is installed
 os.system("python -m unidic download")
-
+
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 # NOTE: for streaming will require gradio audio streaming fix
 # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
 #Now you’re ready to install 🤗 Transformers with the following command:
-
-#For CPU-support only, Transformers and PyTorch with:
-os.system('pip install transformers[torch]')
 if not is_gpu:
+    #For CPU-support only, Transformers and PyTorch with:
+    os.system('pip install transformers[tf-cpu]')
+    #os.system('pip install transformers[torch] accelerate==0.26.1')
     #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
     os.system('pip install llama-cpp-python==0.2.11')
 else:
-
+    os.system('pip install transformers[torch]')
+    # we need to compile a CUBLAS version
     # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
     os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+clear_output()
 
-# In[8]:
 import textwrap
 from scipy.io.wavfile import write
 from pydub import AudioSegment
@@ -52,19 +89,15 @@ import numpy as np
 import torch
 import nltk # we'll use this to split into sentences
 nltk.download("punkt")
-
 import noisereduce as nr
 import subprocess
 import langid
 import uuid
 import emoji
 import pathlib
-
 import datetime
-
 from scipy.io.wavfile import write
 from pydub import AudioSegment
-
 import re
 import io, wave
 import librosa
@@ -73,23 +106,15 @@ from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
-
-
 import gradio as gr
 import os
 import time
-
 import gradio as gr
-from transformers import pipeline
 import numpy as np
-
+from transformers import pipeline
 from gradio_client import Client
 from huggingface_hub import InferenceClient
-
-
-
-# In[9]:
-
+clear_output()
 
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
@@ -98,16 +123,13 @@ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 print("XTTS downloaded")
-
 if is_gpu:
     use_deepspeed=True
 else:
     use_deepspeed=False
-
 print("Loading XTTS")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
-
 model = Xtts.init_from_config(config)
 model.load_checkpoint(
     config,
@@ -116,23 +138,11 @@ model.load_checkpoint(
     eval=True,
     use_deepspeed=use_deepspeed,
 )
-
-#if is_gpu:
-#    model.cuda()
-
 print("Done loading TTS")
-
-
-# In[60]:
-
-
 #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
-
 title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
-
 DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS"""
 css = """.toast-wrap { display: none !important } """
-
 from huggingface_hub import HfApi
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -174,40 +184,14 @@ pirate_system_message = f"You as {character_name}. {character_scenario} Print ou
 ROLE_PROMPTS["Pirate"]= pirate_system_message
 ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
 
-# In[49]:
-
-
-
-
-
-# In[15]:
-
-
 ### WILL USE LOCAL MISTRAL OR ZEPHYR
-import os
 from huggingface_hub import hf_hub_download
-
 print("Downloading LLM")
-
-#
-
-# Append the current directory to the zephyr_model_path
-zephyr_model_path = os.path.join(current_dir, "zephyr-7b-beta.Q5_K_M.gguf")
+print("Downloading Zephyr")
+# use new gguf format
+zephyr_model_path = "./zephyr-7b-beta.Q5_K_M.gguf"
 if not os.path.isfile(zephyr_model_path):
-
-    hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=current_dir, filename="zephyr-7b-beta.Q5_K_M.gguf")
-else:
-    print("Zephyr it is already downloaded")
-
-
-# In[ ]:
-
-
-
-
-
-# In[16]:
-
+    hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
 
 from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
@@ -223,17 +207,14 @@ LLAMA_VERBOSE=False
 
 llm_zephyr = Llama(model_path=zephyr_model_path,
                    n_gpu_layers=GPU_LAYERS,
-                   max_new_tokens=512,
-                   context_window=4096,
+                   max_new_tokens=512,
+                   context_window=4096,
                    n_ctx=4096,
                    n_batch=128,
-
-
+                   )
+llm_zephyr.verbose = LLAMA_VERBOSE
 print("Running LLM Zephyr")
-
-
-# In[17]:
-
+clear_output()
 
 def split_sentences(text, max_len):
     # Apply custom rules to enforce sentence breaks with double punctuation
@@ -251,7 +232,7 @@ def split_sentences(text, max_len):
             sentence_list.extend(wrapped)
         else:
             sentence_list.append(sent)
-
+
     return sentence_list
 
 
@@ -292,7 +273,7 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
 
     # Prepare the WAV file headers
     wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE') # 'RIFF' chunk descriptor
-    fmt_subchunk = struct.pack('<4sIHHIIHH',
+    fmt_subchunk = struct.pack('<4sIHHIIHH',
                                b'fmt ', fmt_subchunk_size, 1, channels,
                                sample_rate, sample_rate * channels * bit_depth // 8,
                                channels * bit_depth // 8, bit_depth)
@@ -301,12 +282,8 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
 
    return wav_header + fmt_subchunk + data_subchunk + pcm_data
 
-
-# In[23]:
-
-
 def generate_local_llm(
-    prompt,
+    prompt,
     history,
     system_message=None,
     temperature=0.8,
@@ -344,13 +321,13 @@ def generate_local_llm(
 
         if "<|user|>" in character:
             # end of context
-            return
-
+            return
+
         if emoji.is_emoji(character):
             # Bad emoji not a meaning messes chat from next lines
             return
-
-
+
+
         output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
         yield output
 
@@ -366,16 +343,6 @@ def generate_local_llm(
 
     return output
 
-
-# In[28]:
-
-
-get_ipython().system('pip install OpenAI')
-
-
-# In[103]:
-
-
 def generate_stream(prompt, model="mixtral-8x7b"):
     base_url = "https://ruslanmv-hf-llm-api.hf.space"
     api_key = "sk-xxxxx"
@@ -436,16 +403,12 @@ def generate_local(
     return output
 
 
-# In[ ]:
-
-
-# In[17]:
 
 
 def get_latents(speaker_wav,voice_cleanup=False):
     if (voice_cleanup):
         try:
-            cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
+            cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
             resample_filter="-ac 1 -ar 22050"
             out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
             #we will use newer ffmpeg as that has afftn denoise filter
@@ -459,7 +422,7 @@ def get_latents(speaker_wav,voice_cleanup=False):
             print("Error: failed filtering, use original microphone input")
    else:
        speaker_wav=speaker_wav
-
+
    # create as function as we can populate here with voice cleanup/filtering
    (
        gpt_cond_latent,
@@ -485,15 +448,15 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
 #Config will have more correct languages, they may be added before we append here
 ##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]
 
-xtts_supported_languages=config.languages
+xtts_supported_languages=config.languages
 def detect_language(prompt):
     # Fast language autodetection
     if len(prompt)>15:
         language_predicted=langid.classify(prompt)[0].strip() # strip need as there is space at end!
-        if language_predicted == "zh":
+        if language_predicted == "zh":
             #we use zh-cn on xtts
             language_predicted = "zh-cn"
-
+
         if language_predicted not in xtts_supported_languages:
             print(f"Detected a language not supported by xtts :{language_predicted}, switching to english for now")
             gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet' ")
@@ -508,10 +471,6 @@ def detect_language(prompt):
 
     return language
 
-
-# In[18]:
-
-
 def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     gpt_cond_latent, speaker_embedding = latent_tuple
 
@@ -559,16 +518,6 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
     except:
         return None
 
-
-# In[ ]:
-
-
-
-
-
-# In[19]:
-
-
 # Will be triggered on text submit (will send to generate_speech)
 def add_text(history, text):
     history = [] if history is None else history
@@ -593,7 +542,7 @@ def add_file(history, file):
 
 
 def get_sentence(history, chatbot_role):
-
+
     history = [["", None]] if history is None else history
 
     history[-1][1] = ""
@@ -606,14 +555,14 @@ def get_sentence(history, chatbot_role):
     stored_sentence_hash = None
 
     print(chatbot_role)
-
+
    for character in generate_local(history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role]):
        history[-1][1] = character.replace("<|assistant|>","")
        # It is coming word by word
 
        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
        if len(text_to_generate) > 1:
-
+
            dif = len(text_to_generate) - len(sentence_list)
 
            if dif == 1 and len(sentence_list) != 0:
@@ -632,22 +581,22 @@ def get_sentence(history, chatbot_role):
                    stored_sentence_hash = None
                else:
                    sentence = text_to_generate[len(sentence_list)]
-
+
                # too short sentence just append to next one if there is any
-               # this is for proper language detection
+               # this is for proper language detection
                if len(sentence)<=15 and stored_sentence_hash is None and stored_sentence is None:
                    if sentence[-1] in [".","!","?"]:
                        if stored_sentence_hash != hash(sentence):
                            stored_sentence = sentence
-                           stored_sentence_hash = hash(sentence)
+                           stored_sentence_hash = hash(sentence)
                            print("Storing:",stored_sentence)
                            continue
-
-
+
+
                sentence_hash = hash(sentence)
                if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
                    continue
-
+
                if sentence_hash not in sentence_hash_list:
                    sentence_hash_list.append(sentence_hash)
                    sentence_list.append(sentence)
@@ -663,28 +612,22 @@ def get_sentence(history, chatbot_role):
                last_sentence = stored_sentence + last_sentence
                stored_sentence = stored_sentence_hash = None
                print("Last Sentence with stored:",last_sentence)
-
+
            sentence_hash_list.append(sentence_hash)
            sentence_list.append(last_sentence)
            print("Last Sentence: ", last_sentence)
-
+
            yield (last_sentence, history)
        except:
            print("ERROR on last sentence history is :", history)
 
 
-# In[19]:
-
-
 from scipy.io.wavfile import write
 from pydub import AudioSegment
 
 second_of_silence = AudioSegment.silent() # use default
 second_of_silence.export("sil.wav", format='wav')
-
-
-
-# In[20]:
+clear_output()
 
 
 def generate_speech_from_history(history, chatbot_role, sentence):
@@ -692,7 +635,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
    # total_wav_bytestream = b""
    if len(sentence)==0:
        print("EMPTY SENTENCE")
-        return
+        return
    # Sometimes prompt </s> coming on output remove it
    # Some post process for speech only
    sentence = sentence.replace("</s>", "")
@@ -714,20 +657,20 @@ def generate_speech_from_history(history, chatbot_role, sentence):
    #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
    #    # just add a space
    #    sentence = sentence[:-1] + " " + sentence[-1]
-
+
    # regex does the job well
    sentence = re.sub("([^\x00-\x7F]|\w)([\.。?!]+)",r"\1 \2",sentence)
-
+
    print("Sentence for speech:", sentence)
 
    results = []
-
+
    try:
        if len(sentence) < SENTENCE_SPLIT_LENGTH:
            # no problem continue on
            sentence_list = [sentence]
        else:
-            # Until now nltk likely split sentences properly but we need additional
+            # Until now nltk likely split sentences properly but we need additional
            # check for longer sentence and split at last possible position
            # Do whatever necessary, first break at hypens then spaces and then even split very long words
            # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
@@ -738,7 +681,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
        if any(c.isalnum() for c in sentence):
            if language=="autodetect":
                #on first call autodetect, nexts sentence calls will use same language
-                language = detect_language(sentence)
+                language = detect_language(sentence)
            #exists at least 1 alphanumeric (utf-8)
 
            #print("Inserting data to get_voice_streaming:")
@@ -766,7 +709,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                continue
 
            # Filter output for better voice
-            filter_output=True
+            filter_output=True
            if filter_output:
                try:
                    data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
@@ -776,7 +719,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                    sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
                except:
                    print("failed to remove noise")
-
+
            # Directly encode the WAV bytestream to base64
            base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')
 
@@ -804,9 +747,6 @@ def generate_speech_from_history(history, chatbot_role, sentence):
    return results
 
 
-# In[21]:
-
-
 latent_map = {}
 try:
    # get the current working directory
@@ -822,16 +762,12 @@ try:
 
 except Exception as e:
    print("Error:", str(e))
-
-
-# In[ ]:
-
-
+
 # Define the main function for the API endpoint that takes the input text and chatbot role
 def generate_story_and_speech(secret_token, input_text, chatbot_role):
    if secret_token != SECRET_TOKEN:
        raise gr.Error(
-            f'Invalid secret token.
+            f'Invalid secret token. Secret Token: secret')
    # Initialize a list of lists for history with the user input as the first entry
    history = [[input_text, None]]
    story_sentences = get_sentence(history, chatbot_role) # get_sentence function generates text
@@ -849,7 +785,7 @@ def generate_story_and_speech(secret_token, input_text, chatbot_role):
        # Convert the list of lists back into a list of tuples for the history
        history_tuples = [tuple(entry) for entry in last_history]
 
-        return generate_speech_from_history(history_tuples, chatbot_role, story_text)
+        return generate_speech_from_history(history_tuples, chatbot_role, story_text)
 
    else:
        return []
@@ -862,5 +798,4 @@ demo = gr.Interface(
 )
 
 demo.queue()
-demo.launch(debug=True)
-
+demo.launch(debug=True)
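Note: a minimal client-side sketch of how the updated API endpoint could be exercised once this commit is deployed. The Space URL, the /predict route, and the example prompt are assumptions (they depend on how the Space is deployed); the three positional arguments mirror generate_story_and_speech(secret_token, input_text, chatbot_role) from the diff above, "secret" is the default SECRET_TOKEN this commit writes to .env, and "Pirate" is one of the ROLE_PROMPTS keys.

# Hypothetical usage sketch (not part of the commit); Space URL and api_name are assumptions.
from gradio_client import Client

client = Client("https://ruslanmv-ai-story-server.hf.space")  # assumed deployment URL
results = client.predict(
    "secret",                                    # default SECRET_TOKEN written to .env above
    "Tell me a short story about a lighthouse",  # input_text
    "Pirate",                                    # chatbot_role, a key of ROLE_PROMPTS
    api_name="/predict",                         # assumed default route of the gr.Interface
)
print(type(results))  # shape depends on the Interface's output components (not shown in this diff)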
requirements.txt
CHANGED
@@ -20,4 +20,6 @@ noisereduce==3.0.0
 #deepspeed
 #deepspeed==0.12.6
 deepspeed==0.10.0
-ipython
+ipython
+python-dotenv
+OpenAI
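Note on the new requirements entries: ipython backs the clear_output() calls added to app.py, python-dotenv backs the .env handling, and OpenAI replaces the old in-notebook get_ipython().system('pip install OpenAI') cell, since generate_stream() talks to an OpenAI-compatible endpoint. A minimal sketch of that usage follows, assuming the endpoint exposes the usual /v1 path; the base URL, placeholder api_key, and model name come from generate_stream() in app.py, while the path suffix and message payload are assumptions.

# Sketch only: the "/v1" suffix and the message payload are assumptions.
from openai import OpenAI

client = OpenAI(base_url="https://ruslanmv-hf-llm-api.hf.space/v1", api_key="sk-xxxxx")
stream = client.chat.completions.create(
    model="mixtral-8x7b",
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)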