# audio-translate / app.py
import gradio as gr
import librosa
import torch
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the English speech-recognition model and its processor
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Load the English-to-Vietnamese translation model exported to ONNX Runtime
tokenizer = AutoTokenizer.from_pretrained("icon-it-tdtu/mt-en-vi-optimum")
model_lm = ORTModelForSeq2SeqLM.from_pretrained("icon-it-tdtu/mt-en-vi-optimum")


def process_audio_file(file):
    """Load an audio file and prepare Wav2Vec2 input features at 16 kHz."""
    # Keep the native sampling rate so the resampling check below is meaningful
    data, sr = librosa.load(file, sr=None)
    if sr != 16000:
        data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    inputs = processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    return inputs


def transcribe(file):
    """Transcribe the recorded audio and translate the transcript to Vietnamese."""
    inputs = process_audio_file(file)
    with torch.no_grad():
        output_logit = model(inputs.input_values).logits
    # Greedy CTC decoding: take the most likely token at each frame
    pred_ids = torch.argmax(output_logit, dim=-1)
    text = processor.batch_decode(pred_ids)[0].lower()
    return translate(text)


def translate(text):
    """Translate English text to Vietnamese with the ONNX seq2seq model."""
    batch = tokenizer([text], return_tensors="pt")
    generated_ids = model_lm.generate(**batch)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


# Record from the microphone, transcribe, translate, and display the result
gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=False),
    ],
    outputs=[
        "textbox",
    ],
    live=True,
).launch(debug=True)