File size: 1,421 Bytes
6a2fe98
b8ca3c4
6a2fe98
a4f8f10
 
 
 
 
 
6a2fe98
 
a4f8f10
 
 
 
6a2fe98
 
 
 
 
a4f8f10
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import soundfile as sf
import gradio as gr
import torch
from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel

# Load the English-speech -> German-text translation model once at startup.
# BUGFIX: transformers exposes `SpeechEncoderDecoderModel`; the previously
# imported name `SpeechEncoderDecoder` does not exist and raised ImportError.
model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
def map_to_array(file):
    """Read an audio file from disk and return its raw samples.

    The sample rate reported by soundfile is discarded; the caller is
    expected to provide audio at the model's rate (presumably 16 kHz —
    TODO confirm, since no resampling happens here).
    """
    samples, _sample_rate = sf.read(file)
    return samples
 
def inference(audio):
    """Run the speech-translation model on an uploaded audio file.

    Parameters
    ----------
    audio : file-like object from Gradio's ``type="file"`` Audio input;
        only its ``.name`` attribute (path on disk) is used.

    Returns
    -------
    str
        The first decoded output sequence from the model.
    """
    # NOTE(review): the processor normalizes but does not resample —
    # assumes the upload is already 16 kHz; confirm for arbitrary files.
    inputs = processor(map_to_array(audio.name), sampling_rate=16_000, return_tensors="pt")
    # BUGFIX: the wav2vec2 feature extractor emits "input_values" (raw
    # waveform), not "input_features", so the old lookup raised KeyError.
    # Also, `generate` takes the encoder input via `inputs=`; `input_ids=`
    # is for decoder token ids and would misroute the waveform.
    generated_ids = model.generate(
        inputs=inputs["input_values"],
        attention_mask=inputs["attention_mask"],
    )
    transcription = processor.batch_decode(generated_ids)
    return transcription[0]
# Demo metadata shown on the Gradio page.
demo_title = "Robust wav2vec 2.0"
demo_description = "Gradio demo for Robust wav2vec 2.0. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav and .flac files"
demo_article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.01027' target='_blank'>Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training</a> | <a href='https://github.com/pytorch/fairseq' target='_blank'>Github Repo</a></p>"

# Build and launch the web UI: one file-typed audio input, one text output.
gr.Interface(
    inference,
    gr.inputs.Audio(label="Input Audio", type="file"),
    gr.outputs.Textbox(label="Output Text"),
    title=demo_title,
    description=demo_description,
    article=demo_article,
    examples=[['poem.wav']],
).launch()