TRIT0N committed on
Commit
c162e37
1 Parent(s): a1416ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -13
app.py CHANGED
@@ -3,10 +3,10 @@ import numpy as np
3
  import torch
4
  from datasets import load_dataset
5
 
6
- #from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
- from transformers import pipeline
8
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
9
- from transformers import BarkModel, BarkProcessor
10
 
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
 
@@ -14,12 +14,12 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
14
  asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h", device=device)
15
 
16
  # load text-to-speech checkpoint and speaker embeddings
17
- #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
18
- #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
19
- #vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
20
 
21
- model = BarkModel.from_pretrained("suno/bark-small")
22
- processor = BarkProcessor.from_pretrained("suno/bark-small")
23
 
24
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
25
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
@@ -32,8 +32,8 @@ def translate(audio):
32
 
33
  def synthesise(text):
34
  inputs = processor(text=text, return_tensors="pt")
35
- #speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
36
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device))
37
  return speech.cpu()
38
 
39
 
@@ -46,8 +46,7 @@ def speech_to_speech_translation(audio):
46
 
47
  title = "Cascaded STST"
48
  description = """
49
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
50
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
51
 
52
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
53
  """
 
3
  import torch
4
  from datasets import load_dataset
5
 
6
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
+ #from transformers import pipeline
8
+ #from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
9
+ #from transformers import BarkModel, BarkProcessor
10
 
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
 
 
14
  asr_pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h", device=device)
15
 
16
  # load text-to-speech checkpoint and speaker embeddings
17
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
18
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
19
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
20
 
21
+ #model = BarkModel.from_pretrained("suno/bark-small")
22
+ #processor = BarkProcessor.from_pretrained("suno/bark-small")
23
 
24
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
25
  speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
32
 
33
  def synthesise(text):
34
  inputs = processor(text=text, return_tensors="pt")
35
+ speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
36
+ #speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device))
37
  return speech.cpu()
38
 
39
 
 
46
 
47
  title = "Cascaded STST"
48
  description = """
49
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in English to target speech in English. Demo uses the facebook/wav2vec2-base-100h model for speech recognition, and the microsoft/speecht5_tts model for text-to-speech:
 
50
 
51
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
52
  """