Spaces:
Running
Running
import gradio as gr | |
import numpy as np | |
from difflib import Differ | |
import librosa | |
# import spaces #[uncomment to use ZeroGPU] | |
import torch | |
# ################ CHANGE THIS TO CHANGE THE LANGUAGE ###################### # | |
from TaiwaneseHokkien import TaiwaneseHokkien | |
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"

# Half precision on the GPU, full precision on the CPU.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Language-specific backend used below for ASR and transcript comparison.
language = TaiwaneseHokkien(torch_dtype=torch_dtype, device=device)
# ########################################################################## # | |
# @spaces.GPU #[uncomment to use ZeroGPU] | |
def infer(
    audio,
    target
):
    """Transcribe a recording and compare it with the practice sentence.

    Args:
        audio: ``(sampling_rate, samples)`` tuple. ``samples`` is an array
            of audio samples; when it has more than one dimension it is
            averaged over axis 1 to obtain a single channel.
        target: The practice sentence (a string) to compare against.

    Returns:
        ``(user_pron, d_toks)`` where ``user_pron`` is the output of
        ``language.asr`` and ``d_toks`` the output of ``language.compare``.
        ``[None, None]`` is returned when ``audio`` is not a tuple,
        ``target`` is not a string, or the recording contains no samples.
    """
    if not isinstance(audio, tuple) or not isinstance(target, str):
        return [None, None]

    # preprocess
    sampling_rate, wav = audio
    wav = np.asarray(wav)
    if wav.size == 0:
        # Nothing was recorded; np.max would raise on an empty array.
        return [None, None]
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)

    # Peak-normalise, but leave an all-zero (silent) signal untouched:
    # dividing by a zero peak would fill the waveform with NaN.
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav /= peak

    wav = librosa.resample(y=wav, orig_sr=sampling_rate, target_sr=16_000)
    user_pron = language.asr(wav)

    # compare texts
    d_toks = language.compare(target, user_pron)
    return (user_pron, d_toks)
# Stylesheet passed to gr.Blocks: centres the column tagged with the
# ``col-container`` element id and caps its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    # The elem_id ties this column to the #col-container rule in ``css``;
    # without an element carrying that id the stylesheet has no effect.
    with gr.Column(elem_id="col-container"):
        gr.Markdown(" # PhonoLearn")
        target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
        input_audio = gr.Audio(
            sources=["microphone", "upload"]
        )
        output = gr.Textbox(label='Your Pronunciation')
        diff = gr.HighlightedText(
            label='Comparison',
            combine_adjacent=True,
            show_legend=True,
            color_map=language.compare_colors
        )

    # Run inference whenever the user records or uploads audio; the
    # transcript goes to ``output`` and the comparison to ``diff``.
    input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])

if __name__ == "__main__":
    demo.launch()