Siddhant committed on
Commit
bd1d7fa
1 Parent(s): e066930

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -6
app.py CHANGED
@@ -60,17 +60,19 @@ from transformers import (
60
  AutoTokenizer,
61
  pipeline,
62
  )
63
- from melo.api import TTS
64
 
65
  # LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
66
  chat = Chat(2)
67
  chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
68
  user_role = "user"
69
 
70
- tts_model = TTS(language="EN_NEWEST", device="auto")
71
- speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
72
  blocksize = 512
73
- tts_model.tts_to_file("text", speaker_id, quiet=True)
 
 
74
  dummy_input = torch.randn(
75
  (3000),
76
  dtype=getattr(torch, "float16"),
@@ -192,9 +194,11 @@ def transcribe(stream, new_chunk):
192
  chat.append({"role": "assistant", "content": generated_text})
193
  text_str=generated_text
194
  # import pdb;pdb.set_trace()
195
- audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
 
196
  audio_chunk = (audio_chunk * 32768).astype(np.int16)
197
- audio_output=(44100, audio_chunk)
 
198
  print("--- %s seconds ---" % (time.time() - start_time))
199
  # else:
200
  # audio_output=None
 
60
  AutoTokenizer,
61
  pipeline,
62
  )
63
+ # from melo.api import TTS
64
 
65
  # LM_model, LM_tokenizer = load("mlx-community/SmolLM-360M-Instruct")
66
  chat = Chat(2)
67
  chat.init_chat({"role": "system", "content": "You are a helpful and friendly AI assistant. You are polite, respectful, and aim to provide concise responses of less than 20 words."})
68
  user_role = "user"
69
 
70
+ # tts_model = TTS(language="EN_NEWEST", device="auto")
71
+ # speaker_id = tts_model.hps.data.spk2id["EN-Newest"]
72
  blocksize = 512
73
+ with torch.no_grad():
74
+ wav = text2speech("Sid")["wav"]
75
+ # tts_model.tts_to_file("text", speaker_id, quiet=True)
76
  dummy_input = torch.randn(
77
  (3000),
78
  dtype=getattr(torch, "float16"),
 
194
  chat.append({"role": "assistant", "content": generated_text})
195
  text_str=generated_text
196
  # import pdb;pdb.set_trace()
197
+ audio_chunk = text2speech(text_str)["wav"]
198
+ # audio_chunk = tts_model.tts_to_file(text_str, speaker_id, quiet=True)
199
  audio_chunk = (audio_chunk * 32768).astype(np.int16)
200
+ print(text2speech.fs)
201
+ audio_output=(text2speech.fs, audio_chunk)
202
  print("--- %s seconds ---" % (time.time() - start_time))
203
  # else:
204
  # audio_output=None