# Spaces: Runtime error
import gradio as gr | |
import torch | |
import torchaudio | |
from datasets import load_dataset | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline | |
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
from noisereduce.torchgate import TorchGate as TG | |
import re | |
from pydub import AudioSegment | |
# Hub checkpoint shared by the processor and the CTC model below.
# Alternative checkpoint (disabled): "RikeshSilwal/wav2vec2-nepali"
_WAV2VEC2_CHECKPOINT = "RikeshSilwal/wav2vec2-nepali-rikeshsilwal"

# Both objects are created once, at import time. In the visible part of this
# file they are only referenced by the commented-out transcribe_audio variant.
processor = Wav2Vec2Processor.from_pretrained(_WAV2VEC2_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_WAV2VEC2_CHECKPOINT)
from torchaudio.transforms import Resample | |
import numpy as np | |
# def transcribe_audio(audio_file): | |
# input_arr, sampling_rate =torchaudio.load(audio_file) | |
# resampler = Resample(orig_freq=sampling_rate, new_freq=16000) | |
# input_arr = resampler(input_arr).squeeze().numpy() | |
# sampling_rate = 16000 | |
# inputs = processor(input_arr, sampling_rate=16_000, return_tensors="pt", padding=True) | |
# with torch.no_grad(): | |
# logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits | |
# predicted_ids = torch.argmax(logits, dim=-1) | |
# predicted_words= processor.batch_decode(predicted_ids) | |
# return predicted_words[0] | |
# Hub id of the speech-recognition model used for transcription.
_ASR_MODEL_ID = "Harveenchadha/vakyansh-wav2vec2-nepali-nem-130"
# Sampling rate (Hz) the waveform is converted to before recognition.
_TARGET_SAMPLE_RATE = 16000
# Lazily created pipeline, shared by every request.
_recognizer = None


def _get_recognizer():
    """Return the shared ASR pipeline, building it on first use."""
    global _recognizer
    if _recognizer is None:
        _recognizer = pipeline("automatic-speech-recognition", model=_ASR_MODEL_ID)
    return _recognizer


def transcribe_audio(audio_file):
    """Transcribe an audio file to text.

    The file is decoded with ``torchaudio``, passed through a best-effort
    noise gate, resampled to 16 kHz when needed, mixed down to one channel
    and handed to the speech-recognition pipeline.

    Args:
        audio_file: Path of the audio file to transcribe (the Gradio input
            is configured with ``type="filepath"``).

    Returns:
        The recognised text with every ``<s>`` marker removed.
    """
    import logging  # local import: keeps this block self-contained

    waveform, sampling_rate = torchaudio.load(audio_file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gate = TG(sr=sampling_rate, nonstationary=True).to(device)
    try:
        # The gate lives on `device`, so the signal has to be moved there too;
        # the result is brought back to the CPU for resampling and NumPy.
        waveform = gate(waveform.to(device)).detach().cpu()
    except Exception:
        # Denoising is optional: keep the untouched waveform, but leave a
        # trace instead of hiding the failure.
        logging.getLogger(__name__).warning(
            "Noise reduction failed; transcribing the unfiltered audio.",
            exc_info=True,
        )

    if sampling_rate != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sampling_rate, new_freq=_TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    # torchaudio.load yields (channels, frames); averaging over the channel
    # axis gives the 1-D array passed to the pipeline on every code path,
    # including input that was already at 16 kHz.
    samples = waveform.mean(dim=0).numpy()

    # Single whole-clip inference pass. For long recordings, chunking can be
    # enabled here via chunk_length_s / stride_length_s.
    prediction = _get_recognizer()(samples)

    # Remove the literal "<s>" marker only; a character class such as
    # "[<s>]" would also delete every '<', 's' and '>' in the transcript.
    return str(prediction["text"]).replace("<s>", "")
# Audio input delivered to transcribe_audio as a path on disk. gr.Audio is
# used instead of the deprecated gr.inputs.Audio namespace. The source option
# is left at its default because its keyword differs between Gradio major
# versions (`source` in 3.x, `sources` in 4.x).
audio_input = gr.Audio(type="filepath")

# The description is a single literal: a backslash continuation inside the
# string would embed the next line's indentation in the displayed text.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=["textbox"],
    title="Speech To Text",
    description="Upload an audio file and hit the 'Submit' button",
)
iface.launch(inline=False)