File size: 1,457 Bytes
85c00e6
e1a7f99
 
0c8bbcc
 
 
7e2b4a1
5c0879c
0c8bbcc
 
e442b97
0c8bbcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# NOTE(review): installing dependencies at import time is a Hugging Face
# Spaces workaround; a requirements.txt is the preferred mechanism.
import os
os.system('pip install transformers')
os.system('pip freeze')
import soundfile as sf
import gradio as gr
import torch
from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor

# Bug fix: the original called the undefined name `SpeechEncoderDecoder`,
# which raises NameError at startup — the imported class (above) is
# SpeechEncoderDecoderModel.
model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

def map_to_array(file):
    """Read an audio file from *file* (path or file-like) and return the raw
    sample array, discarding the sample rate soundfile also returns."""
    samples, _sample_rate = sf.read(file)
    return samples
 
def inference(audio):
    """Run English-speech -> German-text translation on an uploaded file.

    *audio* is the Gradio file object; its ``.name`` is the temp-file path.
    Returns the first decoded transcription string.
    """
    features = processor(
        map_to_array(audio.name),
        sampling_rate=16_000,
        return_tensors="pt",
    )
    # kwarg names follow the model card's published usage for this checkpoint
    generated = model.generate(
        input_ids=features["input_features"],
        attention_mask=features["attention_mask"],
    )
    decoded = processor.batch_decode(generated)
    return decoded[0]
# --- Gradio UI wiring ---------------------------------------------------
title = "Robust wav2vec 2.0"
description = "Gradio demo for Robust wav2vec 2.0. To use it, simply upload your audio, or click one of the examples to load them. Read more at the links below. Currently supports .wav and .flac files"
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2104.01027' target='_blank'>Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training</a> | <a href='https://github.com/pytorch/fairseq' target='_blank'>Github Repo</a></p>"

# Legacy gradio component API (gr.inputs / gr.outputs), kept to match the
# gradio version this demo targets.
inputs = gr.inputs.Audio(label="Input Audio", type="file")
outputs = gr.outputs.Textbox(label="Output Text")

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
)
demo.launch()