# Harveenchadha: "Update app.py" (commit 7738eb6)
# File viewer residue: raw / history / blame; no virus; 2.78 kB
import os

import gradio as gr
import soundfile as sf
import sox
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
def convert(inputfile, outfile):
    """Re-encode ``inputfile`` as a mono, 16 kHz, 16-bit signed-integer WAV.

    Args:
        inputfile: Path of the audio file to read.
        outfile: Path the converted WAV is written to.
    """
    target_format = {
        "file_type": "wav",
        "channels": 1,
        "encoding": "signed-integer",
        "rate": 16000,
        "bits": 16,
    }
    transformer = sox.Transformer()
    transformer.set_output_format(**target_format)
    transformer.build(inputfile, outfile)
# Language codes used for translation: source text is 'hi', target is 'en'.
inlang, outlang = 'hi', 'en'

# The translation model and its tokenizer are loaded from the same checkpoint.
model_translate, tokenizer_translate = (
    loader.from_pretrained("facebook/m2m100_418M")
    for loader in (M2M100ForConditionalGeneration, M2M100Tokenizer)
)

# Tell the tokenizer which language the incoming text is written in.
tokenizer_translate.src_lang = inlang
def translate(text):
    """Translate ``text`` into the ``outlang`` language.

    The source language is whatever ``tokenizer_translate.src_lang`` is set
    to at module level.

    Args:
        text: The string to translate.

    Returns:
        The first decoded sequence produced by the model, with special
        tokens stripped.
    """
    # Forcing the first generated token to the target-language id selects
    # the output language.
    target_lang_id = tokenizer_translate.get_lang_id(outlang)
    model_inputs = tokenizer_translate(text, return_tensors="pt")
    output_ids = model_translate.generate(
        **model_inputs,
        forced_bos_token_id=target_lang_id,
    )
    decoded = tokenizer_translate.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
# Speech-recognition processor and CTC model, both loaded from the same checkpoint.
processor, model = (
    loader.from_pretrained("Harveenchadha/vakyansh-wav2vec2-hindi-him-4200")
    for loader in (Wav2Vec2Processor, Wav2Vec2ForCTC)
)
def parse_transcription(wav_file):
    """Transcribe a recorded clip and translate the transcript.

    The recording is first re-encoded to 16 kHz mono WAV via ``convert``,
    then run through the CTC speech model; the greedy (argmax) decoding is
    passed to ``translate``.

    Args:
        wav_file: Object whose ``name`` attribute is the path of the
            recording, or ``None`` when nothing was recorded.

    Returns:
        A ``(transcription, translation)`` tuple of strings. Both are empty
        when ``wav_file`` is ``None``.
    """
    if wav_file is None:
        # Nothing was recorded: return empty outputs instead of failing on
        # ``None.name``.
        return "", ""

    source_path = wav_file.name
    # splitext strips only the final extension, so dots in directory names
    # (or a leading "./") no longer truncate the path.
    resampled_path = os.path.splitext(source_path)[0] + "16k.wav"
    try:
        convert(source_path, resampled_path)
        speech, _ = sf.read(resampled_path)
    finally:
        # The resampled copy is only an intermediate; remove it so files do
        # not accumulate across requests.
        try:
            os.remove(resampled_path)
        except FileNotFoundError:
            # convert() failed before producing any output.
            pass

    input_values = processor(speech, sampling_rate=16_000, return_tensors="pt").input_values
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription, translate(transcription)
# Microphone recording is handed to the callback as a file object.
input_ = gr.inputs.Audio(source="microphone", type="file")

# One textbox per value returned by parse_transcription.
output1 = gr.outputs.Textbox(label="Hindi Output from ASR")
output2 = gr.outputs.Textbox(label="English Translated Output")

# NOTE: an earlier single-output variant (outputs="text") was superseded by
# the two-textbox interface below.
gr.Interface(
    fn=parse_transcription,
    inputs=input_,
    outputs=[output1, output2],
    analytics_enabled=False,
    show_tips=False,
    theme='huggingface',
    layout='vertical',
    title="Vakyansh: Speech To text for Indic Languages",
    description="This is a live demo for Speech to Text Translation. Models used: vakyansh wav2vec2 hindi + m2m100",
    enable_queue=True,
).launch(inline=False)