juangtzi committed
Commit f75821a
1 Parent(s): 7294493

Update app.py

Files changed (1)
  1. app.py +37 -36
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
 import numpy as np
 import torch
 from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer
-# from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
+from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
 from transformers import WhisperTokenizer, GenerationConfig
-from transformers import BarkModel, AutoProcessor
+#from transformers import BarkModel, AutoProcessor
 
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -28,17 +28,18 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium
 
 # ---------------- Speech generator specht5_tts --------------------------#
 
-# model = SpeechT5ForTextToSpeech.from_pretrained(
-# "juangtzi/speecht5_finetuned_voxpopuli_es"
-# )
-# checkpoint = "microsoft/speecht5_tts"
-# processor = SpeechT5Processor.from_pretrained(checkpoint)
-# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+model = SpeechT5ForTextToSpeech.from_pretrained(
+    "juangtzi/speecht5_finetuned_voxpopuli_es"
+)
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
-# speaker_embeddings2 = np.load('speaker_embeddings.npy')
-# speaker_embeddings2 = torch.tensor(speaker_embeddings2)
-# print(speaker_embeddings2)
-# lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
+speaker_embeddings2 = np.load('speaker_embeddings.npy')
+speaker_embeddings2 = torch.tensor(speaker_embeddings2)
+print(speaker_embeddings2)
+
+#lang_detector = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")
 
 # ---------------- Speech generator bark--------------------------#
 
@@ -46,8 +47,8 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium
 #model = BarkModel.from_pretrained("suno/bark-small")
 #processor = BarkProcessor.from_pretrained("suno/bark-small")
 
-processor = AutoProcessor.from_pretrained("suno/bark-small")
-model = BarkModel.from_pretrained("suno/bark-small")
+# processor = AutoProcessor.from_pretrained("suno/bark-small")
+# model = BarkModel.from_pretrained("suno/bark-small")
 
 
 def language_detector(text):
@@ -62,35 +63,35 @@ def translate(audio):
     return outputs["text"]
 
 
-def synthesise(text):
-    inputs = processor(text=text, voice_preset="v2/es_speaker_8")
-    speech_output = model.generate(**inputs).cpu()
-    return speech_output
+# def synthesise(text):
+# inputs = processor(text=text, voice_preset="v2/es_speaker_8")
+# speech_output = model.generate(**inputs).cpu()
+# return speech_output
 
-def speech_to_speech_translation(audio):
-    translated_text = translate(audio)
-    synthesised_speech = synthesise(translated_text)
+# def speech_to_speech_translation(audio):
+# translated_text = translate(audio)
+# synthesised_speech = synthesise(translated_text)
 
-    sample_rate = model.generation_config.sample_rate
+# sample_rate = model.generation_config.sample_rate
 
-    synthesised_speech = synthesised_speech.numpy().squeeze()
+# synthesised_speech = synthesised_speech.numpy().squeeze()
 
-    return sample_rate, synthesised_speech
+# return sample_rate, synthesised_speech
 
 
-# def synthesise(text): speecht5_tts
-# inputs = processor(text=text, return_tensors="pt")
-# output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
-# return output
+def synthesise(text):
+    inputs = processor(text=text, return_tensors="pt")
+    output = model.generate_speech(inputs["input_ids"], speaker_embeddings2, vocoder=vocoder)
+    return output
 
-# def speech_to_speech_translation(audio): speecht5_tts
-# translated_text = translate(audio)
-# synthesised_speech = synthesise(translated_text)
-# audio_data = synthesised_speech.cpu().numpy()
-# audio_data = np.squeeze(audio_data)
-# audio_data = audio_data / np.max(np.abs(audio_data))
-# sample_rate = 16000
-# return (sample_rate, audio_data)
+def speech_to_speech_translation(audio):
+    translated_text = translate(audio)
+    synthesised_speech = synthesise(translated_text)
+    audio_data = synthesised_speech.cpu().numpy()
+    audio_data = np.squeeze(audio_data)
+    audio_data = audio_data / np.max(np.abs(audio_data))
+    sample_rate = 16000
+    return (sample_rate, audio_data)
 
 title = "Cascaded STST"
 description = """
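
Note: the diff is cut off inside the multi-line description string, so the Gradio interface that consumes speech_to_speech_translation is not visible in this commit view. The sketch below is only an assumption about how a cascaded STST function that returns a (sample_rate, numpy array) tuple is typically wired into a Gradio demo; the demo variable and the Audio component settings are illustrative and not part of this commit.

import gradio as gr

# Hypothetical wiring (not shown in the diff above): pass microphone or
# uploaded audio through the cascaded translate/synthesise pipeline and
# play back the generated Spanish speech.
demo = gr.Interface(
    fn=speech_to_speech_translation,             # defined in app.py above
    inputs=gr.Audio(type="filepath"),             # the Whisper ASR pipeline accepts a file path
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)
demo.launch()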