Vakyansh-STT / app.py
Bishan's picture
Update app.py
26c4803
raw
history blame
No virus
6.23 kB
import functools
import os
import subprocess
import tempfile

import gradio as gr
import soundfile as sf
import sox
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
# Hugging Face Hub checkpoint used for each language accepted by parse().
_MODEL_IDS = {
    "Hindi": "Harveenchadha/vakyansh-wav2vec2-hindi-him-4200",
    "Odia": "Harveenchadha/vakyansh-wav2vec2-odia-orm-100",
    "Assamese": "Harveenchadha/vakyansh-wav2vec2-assamese-asm-8",
    "Sanskrit": "Harveenchadha/vakyansh-wav2vec2-sanskrit-sam-60",
    "Punjabi": "Harveenchadha/vakyansh-wav2vec2-punjabi-pam-10",
    "Urdu": "Harveenchadha/vakyansh-wav2vec2-urdu-urm-60",
    "Rajasthani": "Harveenchadha/vakyansh-wav2vec2-rajasthani-raj-45",
    "Marathi": "Harveenchadha/vakyansh-wav2vec2-marathi-mrm-100",
    "Malayalam": "Harveenchadha/vakyansh-wav2vec2-malayalam-mlm-8",
    "Maithili": "Harveenchadha/vakyansh-wav2vec2-maithili-maim-50",
    "Dogri": "Harveenchadha/vakyansh-wav2vec2-dogri-doi-55",
    "Bhojpuri": "Harveenchadha/vakyansh-wav2vec2-bhojpuri-bhom-60",
    "Tamil": "Harveenchadha/vakyansh-wav2vec2-tamil-tam-250",
    "Telugu": "Harveenchadha/vakyansh-wav2vec2-telugu-tem-100",
    "Nepali": "Harveenchadha/vakyansh-wav2vec2-nepali-nem-130",
    "Kannada": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560",
    "Gujarati": "Harveenchadha/vakyansh-wav2vec2-gujarati-gnm-100",
    "Bengali": "Harveenchadha/vakyansh-wav2vec2-bengali-bnm-200",
    "English": "Harveenchadha/vakyansh-wav2vec2-indian-english-enm-700",
}


def read_file_and_process(wav_file, processor=None):
    """Resample an audio file to 16 kHz mono and turn it into model inputs.

    Args:
        wav_file: Path of the audio file to transcribe.
        processor: Callable processor that converts the raw samples into
            model inputs. When omitted, a module-level ``processor`` is used
            if one exists (the behaviour the one-argument form relied on).

    Returns:
        Whatever ``processor`` returns for the samples, called with
        ``sampling_rate=16_000, return_tensors="pt", padding=True``.

    Raises:
        ValueError: If no processor is passed and none is defined globally.
        subprocess.CalledProcessError: If ffmpeg fails to convert the file.
    """
    if processor is None:
        # Backward compatibility with callers that set a global processor.
        processor = globals().get("processor")
    if processor is None:
        raise ValueError(
            "read_file_and_process() needs a processor: pass one explicitly "
            "or define a module-level `processor`."
        )

    # Write the resampled copy into a private scratch directory instead of
    # deriving a sibling name from the input path: that broke on paths with a
    # dot in a directory name, collided between requests and was never removed.
    with tempfile.TemporaryDirectory() as scratch_dir:
        resampled_path = os.path.join(scratch_dir, "resampled_16k.wav")
        resampler(wav_file, resampled_path)
        speech, _ = sf.read(resampled_path)

    return processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)


def resampler(input_file_path, output_file_path):
    """Convert an audio file to a 16 kHz, single-channel file with ffmpeg.

    Args:
        input_file_path: Path of the source audio file.
        output_file_path: Path to write the converted audio to; an existing
            file at that path is overwritten.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and prevents them from being run as commands.
    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel", "panic",
        "-nostdin",  # never wait for interactive input inside a server
        "-y",  # overwrite the output instead of refusing or prompting
        "-i", str(input_file_path),
        "-ar", "16000",
        "-ac", "1",
        "-bits_per_raw_sample", "16",
        "-vn",
        str(output_file_path),
    ]
    # Fail here on a bad conversion rather than later with an obscure read error.
    subprocess.run(command, check=True)


def parse_transcription(logits, processor):
    """Greedy-decode CTC logits into text.

    Args:
        logits: Model output scores; the arg-max over the last dimension is
            taken and the first item along the leading dimension is decoded.
        processor: Object whose ``decode`` method maps token ids to a string.

    Returns:
        The decoded transcription, with special tokens skipped.
    """
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription


@functools.lru_cache(maxsize=1)
def _load_model(model_id):
    """Load the processor and CTC model for ``model_id``.

    Only the most recently used checkpoint is kept, so repeated requests for
    the same language skip reloading while memory stays bounded.
    """
    processor = Wav2Vec2Processor.from_pretrained(model_id)
    model = Wav2Vec2ForCTC.from_pretrained(model_id)
    return processor, model


def parse(wav_file, language):
    """Transcribe an audio file with the model for the chosen language.

    Args:
        wav_file: Path of the audio file to transcribe.
        language: One of the keys of ``_MODEL_IDS`` (e.g. ``"Hindi"``).

    Returns:
        The transcription produced by ``parse_transcription``.

    Raises:
        ValueError: If ``language`` is not supported or no file was given.
    """
    # Validate before any expensive work so a missing selection gives a
    # clear message instead of an unbound-variable error.
    if language not in _MODEL_IDS:
        raise ValueError(
            f"Unsupported language {language!r}; choose one of: "
            f"{', '.join(_MODEL_IDS)}"
        )
    if not wav_file:
        raise ValueError("No audio file was provided.")

    processor, model = _load_model(_MODEL_IDS[language])
    # Pass the language-specific processor explicitly; it is local to this
    # call and is not visible to the helper otherwise.
    input_values = read_file_and_process(wav_file, processor)
    with torch.no_grad():
        logits = model(**input_values).logits
    return parse_transcription(logits, processor)
# Languages offered in the dropdown; each entry must be a name parse() accepts.
options = [
    'Hindi',
    'Odia',
    'Assamese',
    'Sanskrit',
    'Punjabi',
    'Urdu',
    'Rajasthani',
    'Marathi',
    'Malayalam',
    'Maithili',
    'Dogri',
    'Bhojpuri',
    'Tamil',
    'Telugu',
    'Nepali',
    'Kannada',
    'Gujarati',
    'Bengali',
    'English',
]

# UI components: language selector, uploaded audio (handed to parse() as a
# file path), and the text box that shows the transcription.
language = gr.Dropdown(options, label="Select language")
input_ = gr.Audio(source="upload", type="filepath")
txtbox = gr.Textbox(label="Output from model will appear here:", lines=5)

# Wire parse(audio_path, language) -> text into an interface and start it.
interface = gr.Interface(
    parse,
    inputs=[input_, language],
    outputs=txtbox,
    streaming=True,
    interactive=True,
    analytics_enabled=False,
    show_tips=False,
    enable_queue=True,
)
interface.launch(inline=False)