ckandemir committed on
Commit
6a27ecf
1 Parent(s): 835f922

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -26
app.py CHANGED
@@ -11,7 +11,7 @@ device = "cuda:0" if torch.cuda.is_available() else "cpu"
11
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
12
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
- model_id = "Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish" # update with your model id
15
  # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
  model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
@@ -21,44 +21,31 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze
21
  processor = SpeechT5Processor.from_pretrained(model_id)
22
 
23
  replacements = [
24
- ("á", "a"),
25
- ("ç", "c"),
26
- ("è", "e"),
27
- ("ì", "i"),
28
- ("í", "i"),
29
- ("ò", "o"),
30
- ("ó", "o"),
31
- ("ù", "u"),
32
- ("ú", "u"),
33
- ("š", "s"),
34
- ("ï", "i"),
35
- ("ñ", "n"),
36
- ("ü", "u"),
37
  ]
38
 
 
39
  def cleanup_text(text):
40
  for src, dst in replacements:
41
  text = text.replace(src, dst)
42
  return text
43
 
44
- def synthesize_speech(text):
45
- text = cleanup_text(text)
46
- inputs = processor(text=text, return_tensors="pt")
47
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
48
-
49
- return gr.Audio.update(value=(16000, speech.cpu().numpy()))
50
-
51
  def translate(audio):
52
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "spanish"})
53
  return outputs["text"]
54
 
55
 
 
56
  def synthesise(text):
57
  text = cleanup_text(text)
58
  inputs = processor(text=text, return_tensors="pt")
59
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
60
- return speech.cpu()
61
-
62
 
63
  def speech_to_speech_translation(audio):
64
  translated_text = translate(audio)
@@ -69,8 +56,8 @@ def speech_to_speech_translation(audio):
69
 
70
  title = "Cascaded STST"
71
  description = """
72
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and [Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish](https://huggingface.co/Sandiago21/speecht5_finetuned_facebook_voxpopuli_spanish) checkpoint for text-to-speech, which is based on Microsoft's
73
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned in Spanish Audio dataset:
74
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
75
  """
76
 
 
11
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2", device=device)
12
 
13
  # load text-to-speech checkpoint and speaker embeddings
14
+ model_id = "ckandemir/speecht5_finetuned_voxpopuli_fr" # update with your model id
15
  # pipe = pipeline("automatic-speech-recognition", model=model_id)
16
  model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
21
  processor = SpeechT5Processor.from_pretrained(model_id)
22
 
23
  replacements = [
24
+ ("à", "a"), ("â", "a"),
25
+ ("ç", "c"),
26
+ ("é", "e"), ("è", "e"), ("ê", "e"), ("ë", "e"),
27
+ ("î", "i"), ("ï", "i"),
28
+ ("ô", "o"),
29
+ ("ù", "u"), ("û", "u"),
 
 
 
 
 
 
 
30
  ]
31
 
32
+
33
  def cleanup_text(text):
34
  for src, dst in replacements:
35
  text = text.replace(src, dst)
36
  return text
37
 
 
 
 
 
 
 
 
38
  def translate(audio):
39
+ outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "french"})
40
  return outputs["text"]
41
 
42
 
43
+
44
  def synthesise(text):
45
  text = cleanup_text(text)
46
  inputs = processor(text=text, return_tensors="pt")
47
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
48
+ return gr.Audio.update(value=(16000, speech.cpu().numpy()))
 
49
 
50
  def speech_to_speech_translation(audio):
51
  translated_text = translate(audio)
 
56
 
57
  title = "Cascaded STST"
58
  description = """
59
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses OpenAI's [Whisper Large v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and [ckandemir/speecht5_finetuned_voxpopuli_fr](https://huggingface.co/ckandemir/speecht5_finetuned_voxpopuli_fr) checkpoint for text-to-speech, which is based on Microsoft's
60
+ [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech, fine-tuned on a French audio dataset:
61
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
62
  """
63