crowbarmassage committed
Commit 147fb27
1 Parent(s): 316ede8

Upload app.py

Files changed (1): app.py (+99 -36)
app.py CHANGED
@@ -1,72 +1,135 @@
- import gradio as gr
- import numpy as np
- import torch
  from datasets import load_dataset
- from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline

- device = "cuda:0" if torch.cuda.is_available() else "cpu"

- # load speech translation checkpoint
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

- # load text-to-speech checkpoint and speaker embeddings
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


- def translate(audio):
-     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-     return outputs["text"]


  def synthesise(text):
-     inputs = processor(text=text, return_tensors="pt")
-     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
-     return speech.cpu()


  def speech_to_speech_translation(audio):
-     translated_text = translate(audio)
      synthesised_speech = synthesise(translated_text)
-     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
      return 16000, synthesised_speech


- title = "Cascaded STST"
- description = """
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
- [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
-
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
- """

  demo = gr.Blocks()

  mic_translate = gr.Interface(
-     fn=speech_to_speech_translation,
      inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     title=title,
-     description=description,
  )

  file_translate = gr.Interface(
-     fn=speech_to_speech_translation,
      inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
-     examples=[["./example.wav"]],
-     title=title,
-     description=description,
  )

  with demo:
      gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

- demo.launch()
 
+ # -*- coding: utf-8 -*-
+ """app.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/143eWt9oxUTcF59OBiVybOgKXJB3QOTsK
+ """
+
+ # Beginning of Unit 7
+ #!pip install git+https://github.com/huggingface/transformers.git
+ !pip install torch accelerate torchaudio datasets gradio sentencepiece
+ !pip install -U transformers
+ #!pip install sacremoses
+ #!pip install -Uqq datasets[audio]
+ #!pip install git+https://github.com/huggingface/transformers
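A note on the install lines above: `!pip ...` is IPython/Colab shell syntax, so a plain app.py executed by the Python interpreter (as on a Hugging Face Space) would raise a SyntaxError on those lines. A hedged alternative is to list the dependencies in a requirements.txt, or, as a minimal in-script sketch:

# Sketch (assumption): programmatic install when no requirements.txt is provided.
import subprocess, sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U",
                       "transformers", "torch", "torchaudio", "datasets",
                       "gradio", "sentencepiece", "accelerate"])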
+
+ from transformers.models.markuplm.tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
+ import torch, torchaudio
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import sentencepiece
+ from transformers import MarianMTModel, MarianTokenizer
  from datasets import load_dataset
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+ from IPython.display import Audio
+ import numpy as np
+
+ target_dtype = np.int16
+ max_range = np.iinfo(target_dtype).max
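target_dtype and max_range are defined here but not referenced again; the int16 conversion further down multiplies by the literal 32767, which is the same value (np.iinfo(np.int16).max). A minimal sketch of that conversion written in terms of these constants, assuming a float waveform in [-1, 1], would be:

# Sketch (assumption): float waveform in [-1, 1] -> 16-bit PCM, using the constants above.
def to_int16(waveform: np.ndarray) -> np.ndarray:
    return (waveform * max_range).astype(target_dtype)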
+
+ # Load Spanish Audio
+
+ def transcribe(audio):
+     model_id_asr = "openai/whisper-small"
+     processor_asr = WhisperProcessor.from_pretrained(model_id_asr)
+     model_asr = WhisperForConditionalGeneration.from_pretrained(model_id_asr)
+     model_asr.config.forced_decoder_ids = None
+
+     input_features = processor_asr(audio["audio"]["array"], sampling_rate=audio["audio"]["sampling_rate"], return_tensors="pt").input_features
+
+     predicted_ids = model_asr.generate(input_features)
+
+     # decode token ids to text
+     transcription = processor_asr.batch_decode(predicted_ids, skip_special_tokens=True)
+     return transcription[0]
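With forced_decoder_ids set to None, Whisper auto-detects the input language. For reference, the language and task can also be pinned explicitly; a hedged sketch using the processor's get_decoder_prompt_ids (parameter values shown are illustrative for the Spanish input this app expects):

# Sketch (assumption): force Spanish transcription instead of relying on language auto-detection.
forced_ids = processor_asr.get_decoder_prompt_ids(language="spanish", task="transcribe")
predicted_ids = model_asr.generate(input_features, forced_decoder_ids=forced_ids)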
+
+ # Run inference on Spanish Audio vector
+
+ def translate(text):
+     model_id_mt = "Helsinki-NLP/opus-mt-es-fr"
+     tokenizer_mt = MarianTokenizer.from_pretrained(model_id_mt)
+     model_mt = MarianMTModel.from_pretrained(model_id_mt)
+     # Tokenize the input text
+     input_ids = tokenizer_mt.encode(text, return_tensors="pt")
+
+     # Generate translation
+     with torch.no_grad():
+         translated_ids = model_mt.generate(input_ids)
+
+     # Decode the translated text
+     translated_text = tokenizer_mt.decode(translated_ids[0], skip_special_tokens=True)
+
+     return translated_text


  def synthesise(text):
+     processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+
+     model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
+     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+
+     inputs = processor_tts(text=text, return_tensors="pt")
+     speech = model_tts.generate_speech(
+         inputs["input_ids"], speaker_embeddings, vocoder=vocoder
+     )
+     return speech
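As written, transcribe, translate and synthesise each download and instantiate their models on every call. A sketch of module-level caching for the TTS components, in the same spirit as the previous version of app.py, might look like:

# Sketch (assumption): load the TTS components once at import time and reuse them per request.
processor_tts = SpeechT5Processor.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
model_tts = SpeechT5ForTextToSpeech.from_pretrained("crowbarmassage/speecht5_finetuned_voxpopuli_fr")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

def synthesise(text):
    inputs = processor_tts(text=text, return_tensors="pt")
    return model_tts.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)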

  def speech_to_speech_translation(audio):
+     transcribed_text = transcribe(audio)
+     translated_text = translate(transcribed_text)
      synthesised_speech = synthesise(translated_text)
      return 16000, synthesised_speech
+
+ def adjusted_speech_to_speech_translation(audio_filepath):
+     # Load the audio file
+     waveform, sampling_rate = torchaudio.load(audio_filepath)
+
+     if sampling_rate != 16000:
+         resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
+         waveform = resampler(waveform)
+         sampling_rate = 16000
+     # Convert the waveform to a numpy array and construct the expected dictionary format
+     audio_dict = {
+         "audio": {
+             "array": waveform.numpy(),
+             "sampling_rate": sampling_rate
+         }
+     }
+
+     transcribed_text = transcribe(audio_dict)
+     translated_text = translate(transcribed_text)
+     #print(transcribed_text)
+     #print(translated_text)
+     synthesised_speech = synthesise(translated_text)
+     #print(synthesised_speech)
+     #print(torch.min(synthesised_speech), torch.max(synthesised_speech))
+     synthesised_speech = (synthesised_speech * 32767).numpy().astype(np.int16)
+     #print(synthesised_speech)
+     #print(np.min(synthesised_speech), np.max(synthesised_speech))
+     return 16000, synthesised_speech
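One caveat on adjusted_speech_to_speech_translation: torchaudio.load returns a (channels, samples) tensor, while the Whisper feature extractor used in transcribe generally expects a mono 1-D array. A hedged sketch of a mono mixdown that could sit just before audio_dict is built (assuming possibly stereo uploads):

# Sketch (assumption): collapse multi-channel audio to a mono 1-D array for the Whisper processor.
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
mono_array = waveform.squeeze(0).numpy()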

+ import gradio as gr

  demo = gr.Blocks()

  mic_translate = gr.Interface(
+     fn=adjusted_speech_to_speech_translation,
      inputs=gr.Audio(source="microphone", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
  )

  file_translate = gr.Interface(
+     fn=adjusted_speech_to_speech_translation,
      inputs=gr.Audio(source="upload", type="filepath"),
      outputs=gr.Audio(label="Generated Speech", type="numpy"),
  )

  with demo:
      gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

+ demo.launch(debug=True, share=False)