leo-kwan committed on
Commit
ca21522
1 Parent(s): 5285dde

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -13
app.py CHANGED
@@ -18,17 +18,17 @@ tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="fr
18
  whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
19
  forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
20
 
21
- # load text-to-speech checkpoint and speaker embeddings
22
- # processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
23
 
24
- # model = SpeechT5ForTextToSpeech.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt").to(device)
25
- # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
26
 
27
- # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
28
- # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
29
 
30
- bark_model = BarkModel.from_pretrained("suno/bark-small")
31
- processor = BarkProcessor.from_pretrained("suno/bark-small")
32
 
33
  def translate(audio):
34
  # load speech translation checkpoint
@@ -45,18 +45,18 @@ def translate(audio):
45
 
46
  def synthesise(text):
47
 
48
- inputs = processor(text, voice_preset="v2/fr_speaker_1")
49
- speech = bark_model.generate(**inputs).cpu()
50
 
51
- # inputs = processor(text=text, return_tensors="pt")
52
- # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
53
  return speech.cpu()
54
 
55
 
56
  def speech_to_speech_translation(audio):
57
  translated_text = translate(audio)
58
  synthesised_speech = synthesise(translated_text)
59
- # synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
60
  return 16000, synthesised_speech
61
 
62
 
 
18
  whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
19
  forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
20
 
21
+ # load text-to-speech checkpoint and speaker embeddings
22
+ processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
23
 
24
+ model = SpeechT5ForTextToSpeech.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt").to(device)
25
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
26
 
27
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
28
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
29
 
30
+ # bark_model = BarkModel.from_pretrained("suno/bark-small")
31
+ # processor = BarkProcessor.from_pretrained("suno/bark-small")
32
 
33
  def translate(audio):
34
  # load speech translation checkpoint
 
45
 
46
  def synthesise(text):
47
 
48
+ # inputs = processor(text, voice_preset="v2/fr_speaker_1")
49
+ # speech = bark_model.generate(**inputs).cpu()
50
 
51
+ inputs = processor(text=text, return_tensors="pt")
52
+ speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
53
  return speech.cpu()
54
 
55
 
56
  def speech_to_speech_translation(audio):
57
  translated_text = translate(audio)
58
  synthesised_speech = synthesise(translated_text)
59
+ synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
60
  return 16000, synthesised_speech
61
 
62