leo-kwan committed
Commit f1d52a5
1 Parent(s): ad11d3d

Update app.py

Files changed (1)
  1. app.py +29 -21
app.py CHANGED
@@ -3,43 +3,51 @@ import numpy as np
 import torch
 from datasets import load_dataset
 
-from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, pipeline
+from transformers import (
+    SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, WhisperForConditionalGeneration, WhisperFeatureExtractor, WhisperTokenizer, pipeline,
+    BarkModel, BarkProcessor
+)
 
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-# load speech translation checkpoint
-feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
-tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="french", task="translate")
-
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
-forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
-asr_pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    feature_extractor=feature_extractor,
-    tokenizer=tokenizer,
-    device=device
-)
 
 # load text-to-speech checkpoint and speaker embeddings
-processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
+# processor = SpeechT5Processor.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt")
 
-model = SpeechT5ForTextToSpeech.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt").to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
+# model = SpeechT5ForTextToSpeech.from_pretrained("leo-kwan/speecht5_finetuned_voxpopuli_lt").to(device)
+# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
 def translate(audio):
+    # load speech translation checkpoint
+    feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
+    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base", language="french", task="translate")
+
+    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
+    forced_decoder_ids = tokenizer.get_decoder_prompt_ids(language="french", task="translate")
+    asr_pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        feature_extractor=feature_extractor,
+        tokenizer=tokenizer,
+        device=device
+    )
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "forced_decoder_ids": forced_decoder_ids})
     return outputs["text"]
 
 
 def synthesise(text):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+    model = BarkModel.from_pretrained("suno/bark-small")
+    processor = BarkProcessor.from_pretrained("suno/bark-small")
+    inputs = processor(text, voice_preset="v2/fr_speaker_1")
+    speech = model.generate(**inputs).cpu()
+
+    # inputs = processor(text=text, return_tensors="pt")
+    # speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
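
For context: a Space's app.py like this one usually chains translate() and synthesise() into a single speech-to-speech translation function served through Gradio. The diff above does not show that wiring, so the sketch below is illustrative glue code only, not part of this commit. The function name speech_to_speech_translation, the Gradio interface, and the 24 kHz output rate (Bark's nominal sample rate; the earlier SpeechT5 setup would have been 16 kHz) are all assumptions.

# Hypothetical glue code, not part of this commit: one common way to chain
# the two functions above into a Gradio demo. All names here are assumed.
import numpy as np
import gradio as gr

def speech_to_speech_translation(audio):
    translated_text = translate(audio)                # Whisper: speech -> French text
    synthesised_speech = synthesise(translated_text)  # Bark: text -> waveform tensor
    # Flatten the (1, num_samples) tensor and scale float32 [-1, 1] to int16.
    waveform = (synthesised_speech.numpy().squeeze() * 32767).astype(np.int16)
    # 24_000 Hz is Bark's nominal output rate (assumption, not shown in the diff).
    return 24_000, waveform

demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(type="filepath"),     # asr_pipe accepts a file path directly
    outputs=gr.Audio(label="Translated speech"),
)
demo.launch()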