import gradio as gr
import torch
import torchaudio
from torchaudio.transforms import Resample
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pretrained Nepali wav2vec2 model and its processor once at startup.
processor = Wav2Vec2Processor.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")
model = Wav2Vec2ForCTC.from_pretrained("gagan3012/wav2vec2-xlsr-nepali")


def transcribe_audio(audio_file):
    # Load the audio and resample it to the 16 kHz rate the model expects.
    waveform, sampling_rate = torchaudio.load(audio_file)
    resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
    waveform = resampler(waveform)
    # Downmix multi-channel audio to mono so the processor sees a 1-D array.
    input_arr = waveform.mean(dim=0).numpy()

    inputs = processor(input_arr, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    # Greedy CTC decoding: take the most likely token at each frame, then
    # collapse repeats and blanks via the processor's decoder.
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_words = processor.batch_decode(predicted_ids)
    return predicted_words[0]


audio_input = gr.Audio(sources=["upload"], type="filepath")
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs="textbox",
    title="Speech To Text",
    description="Upload an audio file and hit the 'Submit' button",
)
iface.launch(inline=False)
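
# A minimal sanity check, assuming a local audio file exists at the
# hypothetical path "sample.wav"; this bypasses the Gradio UI and calls
# the transcription function directly:
#
#     print(transcribe_audio("sample.wav"))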