speech-to-speech-translation-test

Sleeping

App Files Files Community

juangtzi committed on Oct 10

Commit

f75821a

•

1 Parent(s): 7294493

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -36

app.py CHANGED Viewed

@@ -2,9 +2,9 @@ import gradio as gr
 import numpy as np
 import torch
 from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer
-# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
 from transformers import WhisperTokenizer, GenerationConfig
-from transformers import BarkModel, AutoProcessor
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -28,17 +28,18 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium
 # ---------------- Speech generator  specht5_tts --------------------------#
-# model = SpeechT5ForTextToSpeech.from_pretrained(
-#     "juangtzi/speecht5_finetuned_voxpopuli_es"
-# )
-# checkpoint = "microsoft/speecht5_tts"
-# processor = SpeechT5Processor.from_pretrained(checkpoint)
-# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# speaker_embeddings2 = np.load('speaker_embeddings.npy')
-# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
-# print(speaker_embeddings2)
-# lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
 # ---------------- Speech generator  bark--------------------------#
@@ -46,8 +47,8 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium
 #model = BarkModel.from_pretrained("suno/bark-small")
 #processor = BarkProcessor.from_pretrained("suno/bark-small")
-processor = AutoProcessor.from_pretrained("suno/bark-small")
-model = BarkModel.from_pretrained("suno/bark-small")
 def language_detector(text):
@@ -62,35 +63,35 @@ def translate(audio):
     return outputs["text"]
-def synthesise(text):
-    inputs = processor(text=text, voice_preset="v2/es_speaker_8")
-    speech_output = model.generate(**inputs).cpu()
-    return speech_output
-def speech_to_speech_translation(audio):
-    translated_text = translate(audio)
-    synthesised_speech = synthesise(translated_text)
-    sample_rate = model.generation_config.sample_rate
-    synthesised_speech = synthesised_speech.numpy().squeeze()
-    return sample_rate, synthesised_speech
-# def synthesise(text):  speecht5_tts
-#     inputs = processor(text=text, return_tensors="pt")
-#     output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
-#     return output
-# def speech_to_speech_translation(audio):  speecht5_tts
-#     translated_text = translate(audio)
-#     synthesised_speech = synthesise(translated_text)
-#     audio_data = synthesised_speech.cpu().numpy()
-#     audio_data = np.squeeze(audio_data)
-#     audio_data = audio_data / np.max(np.abs(audio_data))
-#     sample_rate = 16000
-#     return (sample_rate, audio_data)
 title = "Cascaded STST"
 description = """

 import numpy as np
 import torch
 from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
 from transformers import WhisperTokenizer, GenerationConfig
+#from transformers import BarkModel, AutoProcessor
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 # ---------------- Speech generator  specht5_tts --------------------------#
+model = SpeechT5ForTextToSpeech.from_pretrained(
+    "juangtzi/speecht5_finetuned_voxpopuli_es"
+)
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+speaker_embeddings2 = np.load('speaker_embeddings.npy')
+speaker_embeddings2 = torch.tensor(speaker_embeddings2)
+print(speaker_embeddings2)
+#lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
 # ---------------- Speech generator  bark--------------------------#
 #model = BarkModel.from_pretrained("suno/bark-small")
 #processor = BarkProcessor.from_pretrained("suno/bark-small")
+# processor = AutoProcessor.from_pretrained("suno/bark-small")
+# model = BarkModel.from_pretrained("suno/bark-small")
 def language_detector(text):
     return outputs["text"]
+# def synthesise(text):
+#     inputs = processor(text=text, voice_preset="v2/es_speaker_8")
+#     speech_output = model.generate(**inputs).cpu()
+#     return speech_output
+# def speech_to_speech_translation(audio):
+#     translated_text = translate(audio)
+#     synthesised_speech = synthesise(translated_text)
+#     sample_rate = model.generation_config.sample_rate
+#     synthesised_speech = synthesised_speech.numpy().squeeze()
+#     return sample_rate, synthesised_speech
+def synthesise(text):
+    inputs = processor(text=text, return_tensors="pt")
+    output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
+    return output
+def speech_to_speech_translation(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    audio_data = synthesised_speech.cpu().numpy()
+    audio_data = np.squeeze(audio_data)
+    audio_data = audio_data / np.max(np.abs(audio_data))
+    sample_rate = 16000
+    return (sample_rate, audio_data)
 title = "Cascaded STST"
 description = """