# Repository page header captured together with the file (not part of the program):
#   vietnamese_s2t / app.py
#   aiface's picture
#   Update app.py
#   c220a06
import importlib
import importlib.util
import os
import subprocess
import sys

# (importable module name, pip requirement) pairs, installed in this order.
# kenlm is installed from the upstream source archive.
_RUNTIME_REQUIREMENTS = (
    ("transformers", "transformers"),
    ("kenlm", "https://github.com/kpu/kenlm/archive/master.zip"),
    ("pyctcdecode", "pyctcdecode"),
    ("gradio", "gradio"),
    ("librosa", "librosa"),
    ("torch", "torch"),
)


def _ensure_installed(module_name, requirement):
    """Install *requirement* with pip unless *module_name* is already importable.

    pip is run as ``<this interpreter> -m pip`` so the package lands in the
    environment that is executing this script, rather than in whichever
    ``pip`` executable happens to be first on PATH.

    Args:
        module_name: Top-level module name used to detect an existing install.
        requirement: Requirement specifier or URL handed to ``pip install``.
    """
    if importlib.util.find_spec(module_name) is not None:
        # Already available: skip the (potentially slow) pip invocation.
        return

    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", requirement],
        check=False,
    )
    if result.returncode != 0:
        # Stay best-effort like the original os.system() calls, but make the
        # failure visible instead of only surfacing later as an ImportError.
        print(
            f"WARNING: 'pip install {requirement}' exited with code "
            f"{result.returncode}",
            file=sys.stderr,
        )

    # Make sure the import system notices packages installed a moment ago.
    importlib.invalidate_caches()


for _module_name, _requirement in _RUNTIME_REQUIREMENTS:
    _ensure_installed(_module_name, _requirement)
import gradio as gr
import librosa
import torch
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import Wav2Vec2ProcessorWithLM
# Hub repository from which the processor, model, feature extractor and
# tokenizer below are all loaded.
repo_name = "aiface/vietnamese_s2t"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# SECURITY: the Hub access token must not live in source control. It is read
# from the HF_TOKEN environment variable instead (configure it as a secret of
# the deployment). Any token that was previously committed to this file should
# be considered leaked and revoked.
# An unset or empty variable becomes None, i.e. no explicit token is passed.
_hf_token = os.environ.get("HF_TOKEN") or None

processor = Wav2Vec2ProcessorWithLM.from_pretrained(repo_name, token=_hf_token)
model = Wav2Vec2ForCTC.from_pretrained(repo_name, token=_hf_token).to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(repo_name, token=_hf_token)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, token=_hf_token)
def process_audio_file(file, sampling_rate=16_000):
    """Load an audio file and return its samples at the requested rate.

    Args:
        file: Audio source accepted by ``librosa.load`` (a file path in this
            application).
        sampling_rate: Target sampling rate in Hz passed to ``librosa.load``
            as ``sr``. Defaults to 16000, the rate the rest of this script
            passes to the processor.

    Returns:
        The audio samples returned by ``librosa.load``; the sampling rate it
        also returns is discarded because it equals ``sampling_rate``.
    """
    waveform, _ = librosa.load(file, sr=sampling_rate)
    return waveform
def transcribe(file_mic, file_upload):
    """Transcribe a microphone recording or an uploaded audio file to text.

    Args:
        file_mic: Audio recorded from the microphone, or None when nothing
            was recorded.
        file_upload: Uploaded audio file, or None when nothing was uploaded.

    Returns:
        The decoded transcription of the selected audio. When both inputs are
        supplied, the microphone recording is used and the text is prefixed
        with a warning line. When neither is supplied, an error message is
        returned instead of a transcription.
    """
    if file_mic is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if file_mic is not None and file_upload is not None:
        warn_output = "WARNING: You've uploaded an audio file and used the microphone. The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"

    # The microphone recording always takes precedence over the upload.
    file = file_mic if file_mic is not None else file_upload

    input_values = process_audio_file(file)
    input_dict = processor(input_values, sampling_rate=16_000, return_tensors="pt", padding=True)

    # Inference only: disabling autograd avoids building a graph for the
    # forward pass, so the logits can be converted to NumPy directly.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(device)).logits

    # The processor decodes straight from the logits; no argmax is needed.
    pres = processor.batch_decode(logits.cpu().numpy()).text
    return warn_output + str(pres[0])
# Two optional audio sources, both delivered to transcribe() as file paths;
# transcribe() decides which one is used when both are present.
_microphone_input = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
_upload_input = gr.inputs.Audio(source="upload", type="filepath", optional=True)

# Assemble the web UI around transcribe() and start serving it.
iface = gr.Interface(
    fn=transcribe,
    inputs=[_microphone_input, _upload_input],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Speech to text MMS With Language Model",
    description="Demo đơn giản speech to text",
)

# share=True asks Gradio for a public share link in addition to the local server.
iface.launch(share=True)