leo-kwan committed on
Commit
5285dde
1 Parent(s): f1d52a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -10
app.py CHANGED
@@ -12,6 +12,12 @@ from transformers import (
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
 
 
 
 
 
 
 
15
  # load text-to-speech checkpoint and speaker embeddings
16
  # processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
17
 
@@ -21,17 +27,14 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
21
  # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
22
  # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
23
 
 
 
24
 
25
  def translate(audio):
26
  # load speech translation checkpoint
27
- feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
28
- tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="french", task="translate")
29
-
30
- model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
31
- forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
32
  asr_pipe = pipeline(
33
  "automatic-speech-recognition",
34
- model=model,
35
  feature_extractor=feature_extractor,
36
  tokenizer=tokenizer,
37
  device=device
@@ -41,10 +44,9 @@ def translate(audio):
41
 
42
 
43
  def synthesise(text):
44
- model = BarkModel.from_pretrained("suno/bark-small")
45
- processor = BarkProcessor.from_pretrained("suno/bark-small")
46
  inputs = processor(text, voice_preset="v2/fr_speaker_1")
47
- speech = model.generate(**inputs).cpu()
48
 
49
  # inputs = processor(text=text, return_tensors="pt")
50
  # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
@@ -54,7 +56,7 @@ def synthesise(text):
54
  def speech_to_speech_translation(audio):
55
  translated_text = translate(audio)
56
  synthesised_speech = synthesise(translated_text)
57
- synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
58
  return 16000, synthesised_speech
59
 
60
 
 
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
 
14
 
15
+ feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
16
+ tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="french", task="translate")
17
+
18
+ whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
19
+ forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
20
+
21
  # load text-to-speech checkpoint and speaker embeddings
22
  # processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
23
 
 
27
  # embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
28
  # speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
29
 
30
+ bark_model = BarkModel.from_pretrained("suno/bark-small")
31
+ processor = BarkProcessor.from_pretrained("suno/bark-small")
32
 
33
  def translate(audio):
34
  # load speech translation checkpoint
 
 
 
 
 
35
  asr_pipe = pipeline(
36
  "automatic-speech-recognition",
37
+ model=whisper_model,
38
  feature_extractor=feature_extractor,
39
  tokenizer=tokenizer,
40
  device=device
 
44
 
45
 
46
  def synthesise(text):
47
+
 
48
  inputs = processor(text, voice_preset="v2/fr_speaker_1")
49
+ speech = bark_model.generate(**inputs).cpu()
50
 
51
  # inputs = processor(text=text, return_tensors="pt")
52
  # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
 
56
  def speech_to_speech_translation(audio):
57
  translated_text = translate(audio)
58
  synthesised_speech = synthesise(translated_text)
59
+ # synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
60
  return 16000, synthesised_speech
61
 
62