import gradio as gr
import librosa
import torch
from noisereduce.torchgate import TorchGate as TG
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

MODEL_NAME = "rikeshsilwalekg/whisper-small-wer35-ekg"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the model and processor once at startup instead of on every request.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    # return_timestamps=True for sentence-level timestamps,
    # return_timestamps="word" for word-level timestamps
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe_audio(audio_file):
    # Load at the native sampling rate; resampling to 16 kHz happens below.
    input_arr, sampling_rate = librosa.load(audio_file, sr=None)

    # Denoise with a spectral gate. TorchGate expects a (batch, time) tensor,
    # so wrap the 1-D librosa array before applying it.
    tg = TG(sr=sampling_rate, nonstationary=True).to(device)
    try:
        waveform = torch.from_numpy(input_arr).unsqueeze(0).to(device)
        input_arr = tg(waveform).squeeze(0).cpu().numpy()
    except Exception:
        pass  # fall back to the raw signal if denoising fails

    # Whisper expects 16 kHz input.
    if sampling_rate != 16000:
        input_arr = librosa.resample(input_arr, orig_sr=sampling_rate, target_sr=16000)

    prediction = pipe(input_arr)
    return prediction["text"]


# Gradio 4.x API; older releases used gr.inputs.Audio(source="upload", ...).
audio_input = gr.Audio(sources=["upload"], type="filepath")

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs="textbox",
    title="Nepali Speech To Text",
    description="Upload an audio file and hit the 'Submit' button",
)

iface.launch(inline=False)
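# Quick sanity check without the web UI (a sketch; "sample_ne.wav" is a
# hypothetical local recording, not part of this script). Uncomment and run
# before launch(), since launch() blocks:
#
#   print(transcribe_audio("sample_ne.wav"))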