hewliyang committed on
Commit
0323180
1 Parent(s): dbfdf1a

use whisper-large-v3 & mms-tts-zlm

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .venv
README.md CHANGED
@@ -9,4 +9,11 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
 
 
 
 
 
 
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
9
  pinned: false
10
  ---
11
 
12
+ Part of the HuggingFace Audio Processing course.
13
+
14
+ This is a Gradio wrapper around a (X -> Malay) speech2speech pipeline, where **X** is any language supported by
15
`openai/whisper-large-v3`.
16
+
17
+ The TTS model used is `facebook/mms-tts-zlm`, a pretrained checkpoint for speech in Malay which is part of their **Massively Multilingual Speech** project. The underlying architecture is based on VITS, which generates waveforms directly and does not need a separate vocoder.
18
+
19
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,47 +1,62 @@
 
1
  import gradio as gr
2
  import numpy as np
3
- import torch
4
- from datasets import load_dataset
5
 
6
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 
 
 
 
7
 
8
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 
10
 
11
  # load speech translation checkpoint
12
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 
 
 
 
13
 
14
- # load text-to-speech checkpoint and speaker embeddings
15
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
18
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
19
 
20
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
21
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
22
 
 
 
 
23
 
24
- def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
26
- return outputs["text"]
27
 
 
 
28
 
29
- def synthesise(text):
30
- inputs = processor(text=text, return_tensors="pt")
31
- speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
32
- return speech.cpu()
 
 
 
 
33
 
34
 
35
  def speech_to_speech_translation(audio):
36
  translated_text = translate(audio)
37
  synthesised_speech = synthesise(translated_text)
38
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
39
- return 16000, synthesised_speech
40
 
41
 
42
  title = "Cascaded STST"
43
  description = """
44
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
45
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
46
 
47
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
@@ -61,9 +76,10 @@ file_translate = gr.Interface(
61
  fn=speech_to_speech_translation,
62
  inputs=gr.Audio(source="upload", type="filepath"),
63
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
64
- examples=[["./example.wav"]],
65
  title=title,
66
  description=description,
 
67
  )
68
 
69
  with demo:
 
1
+ import torch
2
  import gradio as gr
3
  import numpy as np
 
 
4
 
5
+ from transformers import (
6
+ VitsModel,
7
+ VitsTokenizer,
8
+ pipeline,
9
+ )
10
 
11
 
12
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
14
+ print(f"Using {device} with fp {torch_dtype}")
15
 
16
  # load speech translation checkpoint
17
+ asr_pipe = pipeline( # noqa: F821
18
+ "automatic-speech-recognition",
19
+ model="openai/whisper-large-v3",
20
+ device=device,
21
+ torch_dtype=torch_dtype,
22
+ )
23
 
24
+ # load text-to-speech checkpoint
 
25
 
26
+ model = VitsModel.from_pretrained("facebook/mms-tts-zlm")
27
+ tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-zlm")
28
 
 
 
29
 
30
+ def synthesise(text):
31
+ inputs = tokenizer(text=text, return_tensors="pt")
32
+ input_ids = inputs["input_ids"]
33
 
34
+ with torch.no_grad():
35
+ outputs = model(input_ids)
 
36
 
37
+ speech = outputs["waveform"]
38
+ return speech
39
 
40
+
41
+ def translate(audio):
42
+ outputs = asr_pipe(
43
+ audio,
44
+ max_new_tokens=256,
45
+ generate_kwargs={"task": "transcribe", "language": "ms"},
46
+ )
47
+ return outputs["text"]
48
 
49
 
50
  def speech_to_speech_translation(audio):
51
  translated_text = translate(audio)
52
  synthesised_speech = synthesise(translated_text)
53
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
54
+ return 16000, synthesised_speech.T
55
 
56
 
57
  title = "Cascaded STST"
58
  description = """
59
+ Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in **Malay**. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
60
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
61
 
62
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
 
76
  fn=speech_to_speech_translation,
77
  inputs=gr.Audio(source="upload", type="filepath"),
78
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
79
+ examples="./examples",
80
  title=title,
81
  description=description,
82
+ live=True,
83
  )
84
 
85
  with demo:
dev.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
examples/anime-jap.mp3 ADDED
Binary file (608 kB). View file
 
examples/english-1.wav ADDED
Binary file (465 kB). View file
 
examples/english-2.wav ADDED
Binary file (426 kB). View file
 
examples/spanish.wav ADDED
Binary file (263 kB). View file
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  torch
2
- git+https://github.com/huggingface/transformers
3
  datasets
4
  sentencepiece
 
 
1
  torch
 
2
  datasets
3
  sentencepiece
4
+ transformers