ykirpichev committed on
Commit
1025309
1 Parent(s): a5c0a2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -12,17 +12,17 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
- # processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
- # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
- # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
 
21
  model = VitsModel.from_pretrained("Matthijs/mms-tts-deu").to(device)
22
  tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")
23
 
24
- # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
25
- # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
26
 
27
 
28
  def translate(audio):
@@ -37,10 +37,11 @@ def synthesise(text):
37
 
38
  with torch.no_grad():
39
  outputs = model(input_ids)
40
- return outputs['audio'].cpu()
41
- # inputs = processor(text=text, return_tensors="pt")
42
- # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
43
- # return speech.cpu()
 
44
 
45
 
46
  def speech_to_speech_translation(audio):
 
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
13
 
14
  # load text-to-speech checkpoint and speaker embeddings
15
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
 
21
  model = VitsModel.from_pretrained("Matthijs/mms-tts-deu").to(device)
22
  tokenizer = VitsTokenizer.from_pretrained("Matthijs/mms-tts-deu")
23
 
24
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
25
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
26
 
27
 
28
  def translate(audio):
 
37
 
38
  with torch.no_grad():
39
  outputs = model(input_ids)
40
+ print(outputs)
41
+ inputs = processor(text=text, return_tensors="pt")
42
+ speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
43
+ print(speech)
44
+ return speech.cpu()
45
 
46
 
47
  def speech_to_speech_translation(audio):