# audio-translate / app.py
# Hugging Face Space by vumichien.
# Commit 1f6bb18: "Update app.py" (1.99 kB).
import gradio as gr
import librosa
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Checkpoint names passed to `from_pretrained` below.
ASR_CHECKPOINT = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
MT_CHECKPOINT = "icon-it-tdtu/mt-en-vi-optimum"

# Speech recognition: feature processor and CTC model share one checkpoint.
processor = Wav2Vec2Processor.from_pretrained(ASR_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(ASR_CHECKPOINT)

# Translation: tokenizer and ONNX Runtime seq2seq model share one checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MT_CHECKPOINT)
model_lm = ORTModelForSeq2SeqLM.from_pretrained(MT_CHECKPOINT)
def process_audio_file(file):
    """Load an audio file and convert it into inputs for the speech model.

    Args:
        file: Path to the audio file, passed straight to ``librosa.load``.

    Returns:
        The result of calling ``processor`` on the 16 kHz waveform, with
        PyTorch tensors (``return_tensors="pt"``).
    """
    target_sr = 16000
    # Resample while loading. Previously the file was loaded at librosa's
    # default rate and then resampled again with positional rate arguments,
    # which newer librosa releases reject (the rates are keyword-only) and
    # which needlessly resampled the signal twice.
    data, _ = librosa.load(file, sr=target_sr)
    print(data.shape)
    inputs = processor(data, sampling_rate=target_sr, return_tensors="pt", padding=True)
    return inputs
def interpret(file, uploaded_file=None):
    """Transcribe an audio recording and translate the transcript.

    The interface in this file declares two audio inputs, so this function
    is called with two arguments; the second one is optional so existing
    single-argument callers keep working.

    Args:
        file: Path to the first audio input, or ``None``/empty when unused.
        uploaded_file: Path to the second audio input; used only when
            ``file`` is empty.

    Returns:
        The translated text, or an empty string when no audio was supplied.
    """
    audio_path = file or uploaded_file
    if not audio_path:
        # Nothing was recorded or uploaded; there is nothing to transcribe.
        return ""

    inputs = process_audio_file(audio_path)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output_logit = model(inputs.input_values).logits
    # Greedy CTC decoding: take the most likely token at every frame.
    pred_ids = torch.argmax(output_logit, dim=-1)
    text = processor.batch_decode(pred_ids)[0].lower()
    print(text)
    return translate(text)
def translate(text):
    """Translate a single string with the ``model_lm`` seq2seq model.

    Args:
        text: The source sentence to translate.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    encoded = tokenizer([text], return_tensors="pt")
    output_ids = model_lm.generate(**encoded)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
# Build the web UI: two audio inputs (microphone recording and file upload)
# are passed to `interpret`, and its result is shown in a single text box.
microphone_input = gr.Audio(source="microphone", type="filepath", streaming=False)
upload_input = gr.Audio(source="upload", type="filepath", optional=True)
result_box = gr.outputs.Textbox(label="Interpreted text")

iface = gr.Interface(
    fn=interpret,
    inputs=[microphone_input, upload_input],
    outputs=result_box,
    title="Interpret English to Vietnamese",
    description="A simple interface to interpret from spoken English to Vietnamese.",
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
)

# Start the app with request queueing and debug output enabled.
iface.launch(enable_queue=True, debug=True)