ruslanmv committed
Commit
8018b87
1 Parent(s): 69f52c4
Files changed (2)
  1. app.py +103 -168
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,49 +1,86 @@
  from __future__ import annotations
- from IPython.display import clear_output
- from IPython import get_ipython
  import os
- #os.system('pip install -r requirements.txt')
  #os.system('pip install gradio==3.48.0')
  os.system('pip install python-dotenv')
- # In[1]:
- #Use GPU
- import subprocess
- gpu_info = subprocess.getoutput('nvidia-smi')
- if 'failed' in gpu_info:
      print('Not connected to a GPU')
      is_gpu = False
- else:
-     print(gpu_info)
-     is_gpu = True
- # In[2]:
- # In[3]:
  import os
  import dotenv
  # Load the environment variables from the .env file
  dotenv.load_dotenv()
  # Access the value of the SECRET_TOKEN variable
  secret_token = os.getenv("SECRET_TOKEN")
- # In[7]:
  import os
  #download for mecab
  os.system("python -m unidic download")
- # In[5]:
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
  os.environ["COQUI_TOS_AGREED"] = "1"
  # NOTE: for streaming will require gradio audio streaming fix
  # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
  #Now you’re ready to install 🤗 Transformers with the following command:
- #For CPU-support only, Transformers and PyTorch with:
- os.system('pip install transformers[torch]')
  if not is_gpu:
      #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
      os.system('pip install llama-cpp-python==0.2.11')
  else:
-     # we need to compile a CUBLAS version
      # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
      os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')

- # In[8]:
  import textwrap
  from scipy.io.wavfile import write
  from pydub import AudioSegment
@@ -52,19 +89,15 @@ import numpy as np
  import torch
  import nltk  # we'll use this to split into sentences
  nltk.download("punkt")
-
  import noisereduce as nr
  import subprocess
  import langid
  import uuid
  import emoji
  import pathlib
-
  import datetime
-
  from scipy.io.wavfile import write
  from pydub import AudioSegment
-
  import re
  import io, wave
  import librosa
@@ -73,23 +106,15 @@ from TTS.api import TTS
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
  from TTS.utils.generic_utils import get_user_data_dir
-
-
  import gradio as gr
  import os
  import time
-
  import gradio as gr
- from transformers import pipeline
  import numpy as np
-
  from gradio_client import Client
  from huggingface_hub import InferenceClient
-
-
-
- # In[9]:
-

  # This will trigger downloading model
  print("Downloading if not downloaded Coqui XTTS V2")
@@ -98,16 +123,13 @@ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
  ModelManager().download_model(model_name)
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
  print("XTTS downloaded")
-
  if is_gpu:
      use_deepspeed = True
  else:
      use_deepspeed = False
-
  print("Loading XTTS")
  config = XttsConfig()
  config.load_json(os.path.join(model_path, "config.json"))
-
  model = Xtts.init_from_config(config)
  model.load_checkpoint(
      config,
@@ -116,23 +138,11 @@ model.load_checkpoint(
      eval=True,
      use_deepspeed=use_deepspeed,
  )
-
- #if is_gpu:
- #    model.cuda()
-
  print("Done loading TTS")
-
-
- # In[60]:
-
-
  #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
-
  title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
-
  DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS"""
  css = """.toast-wrap { display: none !important } """
-
  from huggingface_hub import HfApi

  HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -174,40 +184,14 @@ pirate_system_message = f"You as {character_name}. {character_scenario} Print ou
  ROLE_PROMPTS["Pirate"] = pirate_system_message
  ##"You are an AI assistant with the Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS. The user will give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps; your answers should be clear and short sentences."

- # In[49]:
-
-
-
-
-
- # In[15]:
-
-
  ### WILL USE LOCAL MISTRAL OR ZEPHYR
- import os
  from huggingface_hub import hf_hub_download
-
  print("Downloading LLM")
-
- # Get the current directory
- current_dir = os.getcwd()
- # Append the current directory to the zephyr_model_path
- zephyr_model_path = os.path.join(current_dir, "zephyr-7b-beta.Q5_K_M.gguf")
  if not os.path.isfile(zephyr_model_path):
-     print("Downloading Zephyr")
-     hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=current_dir, filename="zephyr-7b-beta.Q5_K_M.gguf")
- else:
-     print("Zephyr is already downloaded")
-
-
- # In[ ]:
-
-
-
-
-
- # In[16]:
-

  from llama_cpp import Llama
  # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in
@@ -223,17 +207,14 @@ LLAMA_VERBOSE=False

  llm_zephyr = Llama(model_path=zephyr_model_path,
      n_gpu_layers=GPU_LAYERS,
-     max_new_tokens=512,
-     context_window=4096,
      n_ctx=4096,
      n_batch=128,
-     verbose=LLAMA_VERBOSE)
-
  print("Running LLM Zephyr")
-
-
- # In[17]:
-

  def split_sentences(text, max_len):
      # Apply custom rules to enforce sentence breaks with double punctuation
@@ -251,7 +232,7 @@ def split_sentences(text, max_len):
              sentence_list.extend(wrapped)
          else:
              sentence_list.append(sent)
-
      return sentence_list

@@ -292,7 +273,7 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):

      # Prepare the WAV file headers
      wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE')  # 'RIFF' chunk descriptor
-     fmt_subchunk = struct.pack('<4sIHHIIHH',
          b'fmt ', fmt_subchunk_size, 1, channels,
          sample_rate, sample_rate * channels * bit_depth // 8,
          channels * bit_depth // 8, bit_depth)
@@ -301,12 +282,8 @@ def pcm_to_wav(pcm_data, sample_rate=24000, channels=1, bit_depth=16):

      return wav_header + fmt_subchunk + data_subchunk + pcm_data

-
- # In[23]:
-
-
  def generate_local_llm(
-     prompt,
      history,
      system_message=None,
      temperature=0.8,
@@ -344,13 +321,13 @@ def generate_local_llm(

          if "<|user|>" in character:
              # end of context
-             return
-
          if emoji.is_emoji(character):
              # A bad emoji with no meaning messes up the chat for the following lines
              return
-
-
          output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
          yield output

@@ -366,16 +343,6 @@ def generate_local_llm(

      return output

-
- # In[28]:
-
-
- get_ipython().system('pip install OpenAI')
-
-
- # In[103]:
-
-
  def generate_stream(prompt, model="mixtral-8x7b"):
      base_url = "https://ruslanmv-hf-llm-api.hf.space"
      api_key = "sk-xxxxx"
@@ -436,16 +403,12 @@ def generate_local(
      return output


- # In[ ]:
-
-
- # In[17]:


  def get_latents(speaker_wav, voice_cleanup=False):
      if (voice_cleanup):
          try:
-             cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
              resample_filter="-ac 1 -ar 22050"
              out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  #ffmpeg needs the extension to know the output format
              #we will use a newer ffmpeg as that has the afftdn denoise filter
@@ -459,7 +422,7 @@ def get_latents(speaker_wav,voice_cleanup=False):
              print("Error: failed filtering, use original microphone input")
      else:
          speaker_wav = speaker_wav
-
      # kept as a function so we can add more voice cleanup/filtering here
      (
          gpt_cond_latent,
@@ -485,15 +448,15 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
  #Config will have more correct languages, they may be added before we append here
  ##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]

- xtts_supported_languages = config.languages
  def detect_language(prompt):
      # Fast language autodetection
      if len(prompt) > 15:
          language_predicted = langid.classify(prompt)[0].strip()  # strip needed as there is a space at the end!
-         if language_predicted == "zh":
              #we use zh-cn on xtts
              language_predicted = "zh-cn"
-
          if language_predicted not in xtts_supported_languages:
              print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
              gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet'")
@@ -508,10 +471,6 @@ def detect_language(prompt):

      return language

-
- # In[18]:
-
-
  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
      gpt_cond_latent, speaker_embedding = latent_tuple

@@ -559,16 +518,6 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
      except:
          return None

-
- # In[ ]:
-
-
-
-
-
- # In[19]:
-
-
  # Will be triggered on text submit (will send to generate_speech)
  def add_text(history, text):
      history = [] if history is None else history
@@ -593,7 +542,7 @@ def add_file(history, file):


  def get_sentence(history, chatbot_role):
-
      history = [["", None]] if history is None else history

      history[-1][1] = ""
@@ -606,14 +555,14 @@ def get_sentence(history, chatbot_role):
      stored_sentence_hash = None

      print(chatbot_role)
-
      for character in generate_local(history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role]):
          history[-1][1] = character.replace("<|assistant|>","")
          # It is coming word by word

          text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
          if len(text_to_generate) > 1:
-
              dif = len(text_to_generate) - len(sentence_list)

              if dif == 1 and len(sentence_list) != 0:
@@ -632,22 +581,22 @@ def get_sentence(history, chatbot_role):
                      stored_sentence_hash = None
                  else:
                      sentence = text_to_generate[len(sentence_list)]
-
                  # too short sentence, just append to the next one if there is any
-                 # this is for proper language detection
                  if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
                      if sentence[-1] in [".","!","?"]:
                          if stored_sentence_hash != hash(sentence):
                              stored_sentence = sentence
-                             stored_sentence_hash = hash(sentence)
                              print("Storing:", stored_sentence)
                              continue
-
-
                  sentence_hash = hash(sentence)
                  if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
                      continue
-
                  if sentence_hash not in sentence_hash_list:
                      sentence_hash_list.append(sentence_hash)
                      sentence_list.append(sentence)
@@ -663,28 +612,22 @@ def get_sentence(history, chatbot_role):
                  last_sentence = stored_sentence + last_sentence
                  stored_sentence = stored_sentence_hash = None
                  print("Last Sentence with stored:", last_sentence)
-
              sentence_hash_list.append(sentence_hash)
              sentence_list.append(last_sentence)
              print("Last Sentence: ", last_sentence)
-
              yield (last_sentence, history)
      except:
          print("ERROR on last sentence, history is:", history)


- # In[19]:
-
-
  from scipy.io.wavfile import write
  from pydub import AudioSegment

  second_of_silence = AudioSegment.silent()  # use default
  second_of_silence.export("sil.wav", format='wav')
-
-
-
- # In[20]:


  def generate_speech_from_history(history, chatbot_role, sentence):
@@ -692,7 +635,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
      # total_wav_bytestream = b""
      if len(sentence) == 0:
          print("EMPTY SENTENCE")
-         return
      # Sometimes the prompt </s> token comes through in the output; remove it
      # Some post processing for speech only
      sentence = sentence.replace("</s>", "")
@@ -714,20 +657,20 @@ def generate_speech_from_history(history, chatbot_role, sentence):
      #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
      #    # just add a space
      #    sentence = sentence[:-1] + " " + sentence[-1]
-
      # regex does the job well
      sentence = re.sub("([^\x00-\x7F]|\w)([\.。?!]+)", r"\1 \2", sentence)
-
      print("Sentence for speech:", sentence)

      results = []
-
      try:
          if len(sentence) < SENTENCE_SPLIT_LENGTH:
              # no problem, continue on
              sentence_list = [sentence]
          else:
-             # Until now nltk likely split sentences properly but we need an additional
              # check for longer sentences and split at the last possible position
              # Do whatever necessary, first break at hyphens, then spaces, and then even split very long words
              # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)
@@ -738,7 +681,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
              if any(c.isalnum() for c in sentence):
                  if language == "autodetect":
                      #on first call autodetect; next sentence calls will use the same language
-                     language = detect_language(sentence)
                  #exists at least 1 alphanumeric (utf-8)

                  #print("Inserting data to get_voice_streaming:")
@@ -766,7 +709,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                      continue

                  # Filter output for better voice
-                 filter_output = True
                  if filter_output:
                      try:
                          data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)
@@ -776,7 +719,7 @@ def generate_speech_from_history(history, chatbot_role, sentence):
                          sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
                      except:
                          print("failed to remove noise")
-
                  # Directly encode the WAV bytestream to base64
                  base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')

@@ -804,9 +747,6 @@ def generate_speech_from_history(history, chatbot_role, sentence):
      return results


- # In[21]:
-
-
  latent_map = {}
  try:
      # get the current working directory
@@ -822,16 +762,12 @@ try:

  except Exception as e:
      print("Error:", str(e))
-
-
- # In[ ]:
-
-
  # Define the main function for the API endpoint that takes the input text and chatbot role
  def generate_story_and_speech(secret_token, input_text, chatbot_role):
      if secret_token != SECRET_TOKEN:
          raise gr.Error(
-             f'Invalid secret token. Please fork the original space if you want to use it for yourself.')
      # Initialize a list of lists for history with the user input as the first entry
      history = [[input_text, None]]
      story_sentences = get_sentence(history, chatbot_role)  # get_sentence function generates text
@@ -849,7 +785,7 @@ def generate_story_and_speech(secret_token, input_text, chatbot_role):
      # Convert the list of lists back into a list of tuples for the history
      history_tuples = [tuple(entry) for entry in last_history]

-         return generate_speech_from_history(history_tuples, chatbot_role, story_text)

      else:
          return []
@@ -862,5 +798,4 @@ demo = gr.Interface(
  )

  demo.queue()
- demo.launch(debug=True)
-
  from __future__ import annotations
+ # Downloading files for the server
  import os
+ import requests
+ def download_file(url, save_path):
+     response = requests.get(url)
+     with open(save_path, 'wb') as file:
+         file.write(response.content)
+ file_names = [
+     'cloee-1.wav',
+     'julian-bedtime-style-1.wav',
+     'julian-bedtime-style-2.wav',
+     'pirate_by_coqui.wav',
+     'thera-1.wav'
+ ]
+ base_url = 'https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/voices/'
+ save_folder = 'voices/'
+ if not os.path.exists(save_folder):
+     os.makedirs(save_folder)
+ for file_name in file_names:
+     url = base_url + file_name
+     save_path = os.path.join(save_folder, file_name)
+     download_file(url, save_path)
+     print(f'Downloaded {file_name}')
+ requirements_url = 'https://raw.githubusercontent.com/ruslanmv/ai-story-server/main/requirements.txt'
+ save_path = 'requirements.txt'
+ download_file(requirements_url, save_path)
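
The helper above buffers each response fully in memory and ignores HTTP errors. A more defensive variant, sketched here under the same requests dependency (chunk size and timeout values are illustrative), streams to disk and fails loudly on bad status codes:

import requests

def download_file_streaming(url, save_path, chunk_size=8192):
    # Stream the body to disk instead of buffering it, and raise on
    # HTTP errors instead of silently writing an error page to the file.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                file.write(chunk)
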
  #os.system('pip install gradio==3.48.0')
+ os.system('pip install -r requirements.txt')
  os.system('pip install python-dotenv')
+ os.system('pip install ipython')
+ from IPython.display import clear_output
+ clear_output()
+ import os
+ import shutil
+ from IPython.display import clear_output
+ # Use GPU
+ def is_nvidia_smi_available():
+     return shutil.which("nvidia-smi") is not None
+ if is_nvidia_smi_available():
+     gpu_info = os.popen("nvidia-smi").read()
+     if gpu_info.find('failed') >= 0:
+         print('Not connected to a GPU')
+         is_gpu = False
+     else:
+         print(gpu_info)
+         is_gpu = True
+ else:
+     print('nvidia-smi command not found')
      print('Not connected to a GPU')
      is_gpu = False
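
Shelling out to nvidia-smi and parsing its output works, but since torch is installed anyway, the same check can be done in-process. An alternative sketch, not what the commit does:

import torch

# Covers both "no driver" and "no device"; no output parsing needed.
is_gpu = torch.cuda.is_available()
print('Connected to a GPU' if is_gpu else 'Not connected to a GPU')
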
  import os
  import dotenv
  # Load the environment variables from the .env file
+ # You can change the default secret
+ with open(".env", "w") as env_file:
+     env_file.write("SECRET_TOKEN=secret")
  dotenv.load_dotenv()
  # Access the value of the SECRET_TOKEN variable
  secret_token = os.getenv("SECRET_TOKEN")
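
Note that the block above rewrites .env on every start, clobbering any token the operator already set. A guarded variant (an assumption, not part of this commit) would only seed the default when the file is missing:

import os

# Keep a user-provided SECRET_TOKEN; only write the default once.
if not os.path.exists(".env"):
    with open(".env", "w") as env_file:
        env_file.write("SECRET_TOKEN=secret")
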
  import os
  #download for mecab
+ # Check if unidic is installed
  os.system("python -m unidic download")
+
  # By using XTTS you agree to CPML license https://coqui.ai/cpml
  os.environ["COQUI_TOS_AGREED"] = "1"
  # NOTE: for streaming will require gradio audio streaming fix
  # pip install --upgrade -y gradio==0.50.2 git+https://github.com/gorkemgoknar/gradio.git@patch-1
  #Now you’re ready to install 🤗 Transformers with the following command:
  if not is_gpu:
+     #For CPU-support only, Transformers and PyTorch with:
+     os.system('pip install transformers[tf-cpu]')
+     #os.system('pip install transformers[torch] accelerate==0.26.1')
      #pip install 'transformers[tf-cpu]' #Transformers and TensorFlow 2.0:
      os.system('pip install llama-cpp-python==0.2.11')
  else:
+     os.system('pip install transformers[torch]')
+     # we need to compile a CUBLAS version
      # Or get it from https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/
      os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11')
+ clear_output()
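
os.system ignores failures, and the CMAKE_ARGS=... prefix inside the shell string only works on POSIX shells. A checked-install sketch using subprocess (illustrative, not part of the commit):

import os
import subprocess
import sys

# Pass CMAKE_ARGS via the environment so it works outside POSIX shells,
# and let a non-zero exit status raise instead of passing silently.
env = dict(os.environ, CMAKE_ARGS="-DLLAMA_CUBLAS=on")
subprocess.run(
    [sys.executable, "-m", "pip", "install", "llama-cpp-python==0.2.11"],
    env=env,
    check=True,
)
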

  import textwrap
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  import torch
  import nltk  # we'll use this to split into sentences
  nltk.download("punkt")
  import noisereduce as nr
  import subprocess
  import langid
  import uuid
  import emoji
  import pathlib
  import datetime
  from scipy.io.wavfile import write
  from pydub import AudioSegment
  import re
  import io, wave
  import librosa
  from TTS.tts.configs.xtts_config import XttsConfig
  from TTS.tts.models.xtts import Xtts
  from TTS.utils.generic_utils import get_user_data_dir
  import gradio as gr
  import os
  import time
  import gradio as gr
  import numpy as np
+ from transformers import pipeline
  from gradio_client import Client
  from huggingface_hub import InferenceClient
+ clear_output()

  # This will trigger downloading model
  print("Downloading if not downloaded Coqui XTTS V2")
  ModelManager().download_model(model_name)
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
  print("XTTS downloaded")
  if is_gpu:
      use_deepspeed = True
  else:
      use_deepspeed = False
  print("Loading XTTS")
  config = XttsConfig()
  config.load_json(os.path.join(model_path, "config.json"))
  model = Xtts.init_from_config(config)
  model.load_checkpoint(
      config,
      eval=True,
      use_deepspeed=use_deepspeed,
  )
  print("Done loading TTS")
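
The old revision carried a commented-out model.cuda(); after load_checkpoint the model still sits on the CPU. A hedged sketch of making the device explicit, assuming the model object loaded above and torch's usual module semantics:

# Move the model to the GPU only when one was detected earlier.
if is_gpu:
    model.cuda()
else:
    model.cpu()
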

  #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
  title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
  DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS"""
  css = """.toast-wrap { display: none !important } """
  from huggingface_hub import HfApi

  HF_TOKEN = os.environ.get("HF_TOKEN")

  ROLE_PROMPTS["Pirate"] = pirate_system_message
  ##"You are an AI assistant with the Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS. The user will give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps; your answers should be clear and short sentences."

  ### WILL USE LOCAL MISTRAL OR ZEPHYR
  from huggingface_hub import hf_hub_download
  print("Downloading LLM")
+ print("Downloading Zephyr")
+ # use new gguf format
+ zephyr_model_path = "./zephyr-7b-beta.Q5_K_M.gguf"
  if not os.path.isfile(zephyr_model_path):
+     hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
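
hf_hub_download returns the local path it downloaded to, so using the return value keeps the hard-coded "./..." path and the actual download destination from drifting apart; a small sketch of that variant:

import os
from huggingface_hub import hf_hub_download

zephyr_model_path = "./zephyr-7b-beta.Q5_K_M.gguf"
if not os.path.isfile(zephyr_model_path):
    # hf_hub_download returns the path of the file it placed on disk.
    zephyr_model_path = hf_hub_download(
        repo_id="TheBloke/zephyr-7B-beta-GGUF",
        local_dir=".",
        filename="zephyr-7b-beta.Q5_K_M.gguf",
    )
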

  from llama_cpp import Llama
  # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in

  llm_zephyr = Llama(model_path=zephyr_model_path,
      n_gpu_layers=GPU_LAYERS,
+     max_new_tokens=512,
+     context_window=4096,
      n_ctx=4096,
      n_batch=128,
+     )
+ llm_zephyr.verbose = LLAMA_VERBOSE
  print("Running LLM Zephyr")
+ clear_output()
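
max_new_tokens and context_window look like generation-time settings rather than Llama constructor parameters; in llama-cpp-python the context size is the n_ctx argument already passed, and the token budget is normally given per call as max_tokens. A sketch of the call-site form (the prompt string is illustrative):

# The generation budget is usually passed per call:
response = llm_zephyr(
    "<|system|>You are a storyteller.</s><|user|>Tell me a tale.</s><|assistant|>",
    max_tokens=512,    # cap on newly generated tokens
    temperature=0.8,
)
print(response["choices"][0]["text"])
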

  def split_sentences(text, max_len):
      # Apply custom rules to enforce sentence breaks with double punctuation
              sentence_list.extend(wrapped)
          else:
              sentence_list.append(sent)
+
      return sentence_list

      # Prepare the WAV file headers
      wav_header = struct.pack('<4sI4s', b'RIFF', chunk_size, b'WAVE')  # 'RIFF' chunk descriptor
+     fmt_subchunk = struct.pack('<4sIHHIIHH',
          b'fmt ', fmt_subchunk_size, 1, channels,
          sample_rate, sample_rate * channels * bit_depth // 8,
          channels * bit_depth // 8, bit_depth)

      return wav_header + fmt_subchunk + data_subchunk + pcm_data
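
pcm_to_wav hand-packs the canonical 44-byte RIFF header: the byte-rate field is sample_rate * channels * bit_depth // 8 and block-align is channels * bit_depth // 8, exactly as in the struct.pack call above. The standard library produces the same container with less room for error; an equivalent sketch under the same 24 kHz mono 16-bit assumptions:

import io
import wave

def pcm_to_wav_stdlib(pcm_data, sample_rate=24000, channels=1, bit_depth=16):
    # The wave module writes the RIFF/fmt/data headers for us.
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(bit_depth // 8)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)
    return buffer.getvalue()
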

  def generate_local_llm(
+     prompt,
      history,
      system_message=None,
      temperature=0.8,

          if "<|user|>" in character:
              # end of context
+             return
+
          if emoji.is_emoji(character):
              # A bad emoji with no meaning messes up the chat for the following lines
              return
+
+
          output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
          yield output

      return output
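
generate_local_llm yields the accumulated text so far rather than deltas, so a consumer that wants word-by-word output prints only the new suffix. A usage sketch (the prompt and empty history are illustrative):

previous = ""
for output in generate_local_llm("Tell me a short story.", history=[]):
    # Each yield is the full text so far; print only the newly added part.
    print(output[len(previous):], end="", flush=True)
    previous = output
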

  def generate_stream(prompt, model="mixtral-8x7b"):
      base_url = "https://ruslanmv-hf-llm-api.hf.space"
      api_key = "sk-xxxxx"
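
The body of generate_stream is elided in this diff, but the requirements now pull in OpenAI and the base_url above points at an OpenAI-compatible endpoint, so the call plausibly resembles this sketch (openai v1 SDK; the message content is illustrative):

from openai import OpenAI

client = OpenAI(base_url="https://ruslanmv-hf-llm-api.hf.space", api_key="sk-xxxxx")
stream = client.chat.completions.create(
    model="mixtral-8x7b",
    messages=[{"role": "user", "content": "Tell me a story."}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries an incremental piece of the completion.
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
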

      return output


  def get_latents(speaker_wav, voice_cleanup=False):
      if (voice_cleanup):
          try:
+             cleanup_filter="lowpass=8000,highpass=75,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
              resample_filter="-ac 1 -ar 22050"
              out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"  #ffmpeg needs the extension to know the output format
              #we will use a newer ffmpeg as that has the afftdn denoise filter

              print("Error: failed filtering, use original microphone input")
      else:
          speaker_wav = speaker_wav
+
      # kept as a function so we can add more voice cleanup/filtering here
      (
          gpt_cond_latent,
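
The excerpt breaks off here, but the cleanup_filter chain is worth unpacking: it low-passes at 8 kHz, high-passes at 75 Hz, and trims silence from both ends via the reverse/silenceremove/reverse trick. The elided lines presumably hand it to ffmpeg; a hedged sketch of such an invocation (the exact flag layout is an assumption):

import subprocess

speaker_wav = "voices/cloee-1.wav"          # illustrative input
out_filename = "voices/cloee-1-clean.wav"   # illustrative output
cleanup_filter = ("lowpass=8000,highpass=75,areverse,"
                  "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
                  "areverse,"
                  "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02")
# -af applies the audio filter chain; -ac/-ar resample to mono 22.05 kHz.
subprocess.run(
    ["ffmpeg", "-y", "-i", speaker_wav, "-af", cleanup_filter,
     "-ac", "1", "-ar", "22050", out_filename],
    check=True,
)
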

  #Config will have more correct languages, they may be added before we append here
  ##["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn","ja"]

+ xtts_supported_languages = config.languages
  def detect_language(prompt):
      # Fast language autodetection
      if len(prompt) > 15:
          language_predicted = langid.classify(prompt)[0].strip()  # strip needed as there is a space at the end!
+         if language_predicted == "zh":
              #we use zh-cn on xtts
              language_predicted = "zh-cn"
+
          if language_predicted not in xtts_supported_languages:
              print(f"Detected a language not supported by xtts: {language_predicted}, switching to English for now")
              gr.Warning(f"Language detected '{language_predicted}' can not be spoken properly 'yet'")

      return language
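
langid.classify returns a (language, score) pair, which is why the code above indexes [0]; for example:

import langid

# classify returns (language_code, score); only the code is used here.
print(langid.classify("Bonjour tout le monde"))            # e.g. ('fr', -42.8)
print(langid.classify("This is an English sentence.")[0])  # 'en'
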

  def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
      gpt_cond_latent, speaker_embedding = latent_tuple

      except:
          return None
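
The elided body wraps XTTS's streaming synthesis, which in the Coqui TTS API is Xtts.inference_stream and yields audio chunks as they are produced. A hedged sketch of its core, assuming the model loaded above and that get_latents returns the (gpt_cond_latent, speaker_embedding) pair it computes:

gpt_cond_latent, speaker_embedding = get_latents("voices/cloee-1.wav")
chunks = model.inference_stream(
    "Hello there, this is streaming speech.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)
for i, chunk in enumerate(chunks):
    # Each chunk is a torch tensor of 24 kHz PCM samples.
    print(f"chunk {i}: {chunk.shape[-1]} samples")
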

  # Will be triggered on text submit (will send to generate_speech)
  def add_text(history, text):
      history = [] if history is None else history


  def get_sentence(history, chatbot_role):
+
      history = [["", None]] if history is None else history

      history[-1][1] = ""

      stored_sentence_hash = None

      print(chatbot_role)
+
      for character in generate_local(history[-1][0], history[:-1], system_message=ROLE_PROMPTS[chatbot_role]):
          history[-1][1] = character.replace("<|assistant|>","")
          # It is coming word by word

          text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
          if len(text_to_generate) > 1:
+
              dif = len(text_to_generate) - len(sentence_list)

              if dif == 1 and len(sentence_list) != 0:

                      stored_sentence_hash = None
                  else:
                      sentence = text_to_generate[len(sentence_list)]
+
                  # too short sentence, just append to the next one if there is any
+                 # this is for proper language detection
                  if len(sentence) <= 15 and stored_sentence_hash is None and stored_sentence is None:
                      if sentence[-1] in [".","!","?"]:
                          if stored_sentence_hash != hash(sentence):
                              stored_sentence = sentence
+                             stored_sentence_hash = hash(sentence)
                              print("Storing:", stored_sentence)
                              continue
+
+
                  sentence_hash = hash(sentence)
                  if stored_sentence_hash is not None and sentence_hash == stored_sentence_hash:
                      continue
+
                  if sentence_hash not in sentence_hash_list:
                      sentence_hash_list.append(sentence_hash)
                      sentence_list.append(sentence)

                  last_sentence = stored_sentence + last_sentence
                  stored_sentence = stored_sentence_hash = None
                  print("Last Sentence with stored:", last_sentence)
+
              sentence_hash_list.append(sentence_hash)
              sentence_list.append(last_sentence)
              print("Last Sentence: ", last_sentence)
+
              yield (last_sentence, history)
      except:
          print("ERROR on last sentence, history is:", history)
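
get_sentence only emits a sentence once nltk has started tokenizing a later one, which keeps half-finished sentences from being spoken; a quick illustration of why that works:

import nltk

# While the model is still streaming, the last tokenized sentence is unstable:
print(nltk.sent_tokenize("Once upon a time. There was a"))
# ['Once upon a time.', 'There was a']  <- only the first entry is safe to speak
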


  from scipy.io.wavfile import write
  from pydub import AudioSegment

  second_of_silence = AudioSegment.silent()  # use default
  second_of_silence.export("sil.wav", format='wav')
+ clear_output()


  def generate_speech_from_history(history, chatbot_role, sentence):
      # total_wav_bytestream = b""
      if len(sentence) == 0:
          print("EMPTY SENTENCE")
+         return
      # Sometimes the prompt </s> token comes through in the output; remove it
      # Some post processing for speech only
      sentence = sentence.replace("</s>", "")

      #if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
      #    # just add a space
      #    sentence = sentence[:-1] + " " + sentence[-1]
+
      # regex does the job well
      sentence = re.sub("([^\x00-\x7F]|\w)([\.。?!]+)", r"\1 \2", sentence)
+
      print("Sentence for speech:", sentence)

      results = []
+
      try:
          if len(sentence) < SENTENCE_SPLIT_LENGTH:
              # no problem, continue on
              sentence_list = [sentence]
          else:
+             # Until now nltk likely split sentences properly but we need an additional
              # check for longer sentences and split at the last possible position
              # Do whatever necessary, first break at hyphens, then spaces, and then even split very long words
              # sentence_list=textwrap.wrap(sentence,SENTENCE_SPLIT_LENGTH)

              if any(c.isalnum() for c in sentence):
                  if language == "autodetect":
                      #on first call autodetect; next sentence calls will use the same language
+                     language = detect_language(sentence)
                  #exists at least 1 alphanumeric (utf-8)

                  #print("Inserting data to get_voice_streaming:")

                      continue

                  # Filter output for better voice
+                 filter_output = True
                  if filter_output:
                      try:
                          data_s16 = np.frombuffer(sentence_wav_bytestream, dtype=np.int16, count=len(sentence_wav_bytestream)//2, offset=0)

                          sentence_wav_bytestream = sentence_wav_bytestream.tobytes()
                      except:
                          print("failed to remove noise")
+
                  # Directly encode the WAV bytestream to base64
                  base64_audio = base64.b64encode(pcm_to_wav(sentence_wav_bytestream)).decode('utf8')

      return results
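
The filter step above converts the byte stream to int16 samples and back; the elided middle presumably runs them through noisereduce. A hedged sketch of that round trip (parameter choices are illustrative):

import numpy as np
import noisereduce as nr

def denoise_pcm16(pcm_bytes, sample_rate=24000):
    # bytes -> int16 samples -> spectral-gating noise reduction -> bytes
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    reduced = nr.reduce_noise(y=samples.astype(np.float32), sr=sample_rate)
    return reduced.astype(np.int16).tobytes()
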


  latent_map = {}
  try:
      # get the current working directory

  except Exception as e:
      print("Error:", str(e))
+
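
The elided try-body presumably precomputes conditioning latents for the voice files downloaded at the top of the script, roughly as follows (an assumed reconstruction; the names mirror the downloaded files):

# Precompute speaker latents once at startup so each request only runs synthesis.
for file_name in file_names:
    voice_name = file_name.replace(".wav", "")
    latent_map[voice_name] = get_latents(os.path.join("voices", file_name))
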

  # Define the main function for the API endpoint that takes the input text and chatbot role
  def generate_story_and_speech(secret_token, input_text, chatbot_role):
      if secret_token != SECRET_TOKEN:
          raise gr.Error(
+             f'Invalid secret token. Secret Token: secret')
      # Initialize a list of lists for history with the user input as the first entry
      history = [[input_text, None]]
      story_sentences = get_sentence(history, chatbot_role)  # get_sentence function generates text

      # Convert the list of lists back into a list of tuples for the history
      history_tuples = [tuple(entry) for entry in last_history]

+         return generate_speech_from_history(history_tuples, chatbot_role, story_text)

      else:
          return []
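
Once the Space is running, the endpoint can be exercised with gradio_client, which is already imported above; a usage sketch with an assumed Space URL (the default token written to .env is "secret"):

from gradio_client import Client

client = Client("https://ruslanmv-ai-story-server.hf.space")  # URL is illustrative
result = client.predict("secret", "Tell me a pirate story.", "Pirate", api_name="/predict")
print(result)
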

  )

  demo.queue()
+ demo.launch(debug=True)
requirements.txt CHANGED
@@ -20,4 +20,6 @@ noisereduce==3.0.0
  #deepspeed
  #deepspeed==0.12.6
  deepspeed==0.10.0
- ipython
+ ipython
+ python-dotenv
+ OpenAI