import gradio as gr
import torch
import torchaudio
from noisereduce.torchgate import TorchGate as TG
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the ASR pipeline once at startup rather than on every request.
recognizer = pipeline(
    "automatic-speech-recognition",
    model="Harveenchadha/vakyansh-wav2vec2-nepali-nem-130",
)


def transcribe_audio(audio_file):
    input_arr, sampling_rate = torchaudio.load(audio_file)

    # Apply spectral-gating noise reduction; fall back to the raw signal
    # if the gate fails on this input.
    tg = TG(sr=sampling_rate, nonstationary=True).to(device)
    try:
        input_arr = tg(input_arr.to(device)).cpu()
    except Exception:
        pass

    # The model expects 16 kHz audio.
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        input_arr = resampler(input_arr)

    input_arr = input_arr.squeeze().numpy()

    # Chunk long recordings with overlapping strides so audio longer than
    # the model's context window is transcribed in pieces.
    prediction = recognizer(input_arr, chunk_length_s=5, stride_length_s=(2, 1))
    return str(prediction["text"]).strip()


audio_input = gr.Audio(source="upload", type="filepath")

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=["textbox"],
    title="Speech To Text",
    description="Upload an audio file and hit the 'Submit' button",
)

iface.launch(inline=False)
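
# Optional local smoke test (a sketch, not part of the original app): useful
# for iterating on the noise gate or resampling steps without launching the
# UI. "sample.wav" is a hypothetical placeholder path; point it at any short
# Nepali recording. To use it, uncomment the line below and remove the
# iface.launch() call above so the script transcribes instead of serving:
#
# print(transcribe_audio("sample.wav"))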