from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
import gradio as gr

# load model and processor
processor = Wav2Vec2Processor.from_pretrained("h4d35/Wav2Vec2-hi")
model = Wav2Vec2ForCTC.from_pretrained("h4d35/Wav2Vec2-hi")

# define function to read in sound file
def map_to_array(file):
    speech, sampling_rate = sf.read(file)
    return speech, sampling_rate

# tokenize
def inference(audio):
    speech, sampling_rate = map_to_array(audio)
    # the model expects 16 kHz audio; passing the file's actual rate lets the
    # processor raise a clear error on a mismatch instead of silently
    # transcribing resampled garbage
    input_values = processor(
        speech, sampling_rate=sampling_rate, return_tensors="pt", padding="longest"
    ).input_values  # Batch size 1
    # retrieve logits (no gradient tracking needed at inference time)
    with torch.no_grad():
        logits = model(input_values).logits
    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]

# gr.inputs/gr.outputs were removed in recent Gradio releases; the top-level
# components below are the current equivalents. type="filepath" hands
# inference() a plain path string, so no .name attribute access is needed.
inputs = gr.Audio(label="Input Audio", type="filepath")
outputs = gr.Textbox(label="Output Text")
title = "HindiASR"
description = "HindiASR using Wav2Vec2.0"
#examples=[['poem.wav']]

gr.Interface(inference, inputs, outputs, title=title, description=description).launch()
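
# A minimal sanity check, assuming a local 16 kHz mono WAV file at
# "sample_hi.wav" (a hypothetical path, not shipped with this repo); it
# bypasses the Gradio UI and calls inference() directly. Kept commented out
# because launch() above blocks until the server is stopped:
#
#   print(inference("sample_hi.wav"))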