ckandemir committed
Commit: a9ccbef
1 Parent(s): 6700c91

Update app.py

Files changed (1):
  app.py (+31 -2)
app.py CHANGED
@@ -9,7 +9,7 @@ from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("ckandemir/speecht5_finetuned_voxpopuli_fr")
@@ -21,12 +21,41 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
+replacements = [
+    ("á", "a"),
+    ("ç", "c"),
+    ("è", "e"),
+    ("ì", "i"),
+    ("í", "i"),
+    ("ò", "o"),
+    ("ó", "o"),
+    ("ù", "u"),
+    ("ú", "u"),
+    ("š", "s"),
+    ("ï", "i"),
+    ("ñ", "n"),
+    ("ü", "u"),
+]
+
+def cleanup_text(text):
+    for src, dst in replacements:
+        text = text.replace(src, dst)
+    return text
+
+def synthesize_speech(text):
+    text = cleanup_text(text)
+    inputs = processor(text=text, return_tensors="pt")
+    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
+
+    return gr.Audio.update(value=(16000, speech.cpu().numpy()))
+
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "fr"})
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "french"})
     return outputs["text"]
 
 
 def synthesise(text):
+    text = cleanup_text(text)
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
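A note on the replacements table added here: the SpeechT5 checkpoint's tokenizer has a limited character vocabulary, so accented characters it never saw during fine-tuning are presumably mapped to the unknown token and silently dropped from the synthesis; the table rewrites them to plain ASCII first. A broader alternative is Unicode decomposition, sketched below under the assumption that stripping every combining mark is acceptable (strip_accents is a hypothetical helper, not part of this commit):

    import unicodedata

    def strip_accents(text: str) -> str:
        # Decompose each character into its base letter plus combining
        # marks (e.g. "è" -> "e" + U+0300), then drop the marks.
        decomposed = unicodedata.normalize("NFKD", text)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    print(strip_accents("çà et là, naïveté"))  # -> "ca et la, naivete"

Unlike this sketch, the commit's explicit table deliberately leaves common French characters such as "é" and "à" untouched, so the hand-written list is the safer choice when only specific characters are known to be missing from the tokenizer's vocabulary.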
 
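Taken together, translate() and synthesise() are the two halves of a speech-to-speech translation demo: Whisper natively translates only into English, so running it with task "transcribe" and a forced French language token is the usual trick for getting French text out, which SpeechT5 then renders as a 16 kHz waveform. (The gr.Audio.update call in synthesize_speech is the Gradio 3.x update idiom; in Gradio 4+ a handler returns the (sample_rate, array) tuple directly.) Below is a minimal sketch of how such functions are typically chained into a Gradio app; the speech_to_speech_translation wrapper and the interface wiring are illustrative assumptions, not part of this commit:

    import numpy as np
    import gradio as gr

    target_dtype = np.int16
    max_range = np.iinfo(target_dtype).max  # scale float waveform to int16 PCM

    def speech_to_speech_translation(audio):
        # Whisper: source speech -> French text (forced language token).
        translated_text = translate(audio)
        # SpeechT5: French text -> 16 kHz waveform (float32 CPU tensor).
        synthesised_speech = synthesise(translated_text)
        pcm = (synthesised_speech.numpy() * max_range).astype(target_dtype)
        return 16000, pcm

    demo = gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(type="filepath"),
        outputs=gr.Audio(label="Translated speech"),
    )
    demo.launch()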