Ahsen Khaliq committed
Commit aba08c5
1 Parent(s): c16e1a2

Update app.py

Files changed (1)
  1. app.py +23 -25
app.py CHANGED
@@ -1,29 +1,29 @@
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import soundfile as sf
-import torch
-import gradio as gr
-
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import soundfile as sf
+import torch
+import gradio as gr
 
-# load model and processor
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
-model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
+
+# load model and processor
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
 
-# define function to read in sound file
-def map_to_array(file):
-    speech, _ = sf.read(file)
-    return speech
+# define function to read in sound file
+def map_to_array(file):
+    speech, _ = sf.read(file)
+    return speech
 
-# tokenize
-def inference(audio):
-    input_values = processor(map_to_array('/content/sample_data/sample2.flac'), return_tensors="pt", padding="longest").input_values  # Batch size 1
+# tokenize
+def inference(audio):
+    input_values = processor(map_to_array(audio.name), return_tensors="pt", padding="longest").input_values  # Batch size 1
 
-    # retrieve logits
-    logits = model(input_values).logits
+    # retrieve logits
+    logits = model(input_values).logits
 
-    # take argmax and decode
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)
-    return transcription[0]
+    # take argmax and decode
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)
+    return transcription[0]
 
 inputs = gr.inputs.Audio(label="Input Audio", type="file")
 outputs = gr.outputs.Textbox(label="Output Text")
@@ -31,8 +31,6 @@ outputs = gr.outputs.Textbox(label="Output Text")
 title = "wav2vec 2.0"
 description = "demo for Facebook AI wav2vec 2.0 using Hugging Face transformers. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below."
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2006.11477'>wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations</a> | <a href='https://github.com/pytorch/fairseq'>Github Repo</a> | <a href='https://huggingface.co/facebook/wav2vec2-base-960h'>Hugging Face model</a></p>"
-examples = [
-    ["poem.wav"]
-]
 
-gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
+
+gr.Interface(inference, inputs, outputs, title=title, description=description, article=article).launch()
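
The substantive change is in inference(): the old version always transcribed a hardcoded Colab path (/content/sample_data/sample2.flac) no matter what the user uploaded, while the new version reads audio.name, the on-disk path of the tempfile-like object Gradio hands to the function for a type="file" audio input. Below is a minimal sketch of how the new code path could be smoke-tested locally; it is not part of the commit, and it assumes the definitions from app.py above plus a local poem.wav (the sample file the old version referenced).

    # hypothetical smoke test, not in the commit: any object with a .name
    # attribute pointing at an audio file stands in for a Gradio upload
    from types import SimpleNamespace

    fake_upload = SimpleNamespace(name="poem.wav")  # assumes poem.wav exists locally
    print(inference(fake_upload))  # prints the decoded transcription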