juulaii committed on
Commit
64a4879
1 Parent(s): 1b7e0be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -3
app.py CHANGED
@@ -2,11 +2,34 @@ import gradio as gr
2
 
3
  #Get models
4
  #ASR model for input speech
5
- speech2text = gr.Interface.load("huggingface/facebook/hubert-large-ls960-ft",
6
- inputs=gr.inputs.Audio(label="Record Audio File", type="file", source = "microphone"))
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  #translates english to spanish text
9
  translator = gr.Interface.load("huggingface/Helsinki-NLP/opus-mt-en-es",
 
10
  outputs=gr.outputs.Textbox(label="English to Spanish Translated Text"))
11
  #TTS model for output speech
12
  text2speech = gr.Interface.load("huggingface/facebook/tts_transformer-es-css10",
 
2
 
3
  #Get models
4
  #ASR model for input speech
5
+ import torch
6
+ from transformers import Wav2Vec2Processor, HubertForCTC
7
+ from datasets import load_dataset
8
+ import soundfile as sf
9
+
10
+ processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
11
+ model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
12
+
13
+ def map_to_array(batch):
14
+ speech, _ = sf.read(batch["file"])
15
+ batch["speech"] = speech
16
+ return batch
17
+
18
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
19
+ ds = ds.map(map_to_array)
20
+
21
+ input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
22
+ logits = model(input_values).logits
23
+ predicted_ids = torch.argmax(logits, dim=-1)
24
+ transcription = processor.decode(predicted_ids[0])
25
+
26
+ #speech2text = gr.Interface.load("huggingface/facebook/hubert-large-ls960-ft",
27
+ # inputs=gr.inputs.Audio(label="Record Audio File", type="file", source = "microphone"))
28
+ speech2text = gr.Interface.(transcription,
29
+ inputs=gr.inputs.Audio(label="Record Audio File", type="file", source = "microphone"))
30
  #translates english to spanish text
31
  translator = gr.Interface.load("huggingface/Helsinki-NLP/opus-mt-en-es",
32
+ input=transcription
33
  outputs=gr.outputs.Textbox(label="English to Spanish Translated Text"))
34
  #TTS model for output speech
35
  text2speech = gr.Interface.load("huggingface/facebook/tts_transformer-es-css10",