Spaces:
Running
Running
File size: 1,815 Bytes
365d3a1 eb7096f e772b38 365d3a1 912ccda 365d3a1 ee8c172 365d3a1 912ccda 365d3a1 eb7096f 365d3a1 0f4cad5 77ced63 912ccda 298d6a8 e772b38 912ccda eb7096f 912ccda eb7096f 365d3a1 4f40b30 365d3a1 7038cf7 eb7096f 7038cf7 365d3a1 eb7096f 912ccda eb7096f e91b2cd 365d3a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import gradio as gr
import numpy as np
from difflib import Differ
import librosa
# import spaces #[uncomment to use ZeroGPU]
import torch
# ################ CHANGE THIS TO CHANGE THE LANGUAGE ###################### #
from TaiwaneseHokkien import TaiwaneseHokkien

# Run on GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"
# Half precision on GPU for speed/memory; full precision on CPU, where
# float16 is poorly supported.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Language backend providing ASR and text comparison for the app below.
language = TaiwaneseHokkien(device=device, torch_dtype=torch_dtype)
# ########################################################################## #
# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(
    audio,
    target
):
    """Transcribe the user's recording and compare it with the target text.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sampling_rate, waveform)`` as produced by ``gr.Audio``; the
        waveform may be mono (1-D) or multi-channel (2-D). Gradio passes
        ``None`` before any recording exists.
    target : str
        The practice sentence to compare the transcription against.

    Returns
    -------
    tuple
        ``(transcription, diff_tokens)`` for the output Textbox and
        HighlightedText, or ``(None, None)`` when inputs are unusable.
    """
    # Guard: Gradio may invoke this with placeholder/None values.
    # isinstance() is the idiomatic type check (and the original
    # [None, None] is unified with the tuple returned on success).
    if not isinstance(audio, tuple) or not isinstance(target, str):
        return (None, None)
    # preprocess: downmix, normalize, resample to the model's 16 kHz rate
    sampling_rate, wav = audio
    if wav.ndim > 1:
        wav = wav.mean(axis=1)  # average channels -> mono
    wav = wav.astype(np.float32)
    peak = np.max(np.abs(wav))
    if peak > 0:
        # Peak-normalize; skipping on silence avoids a 0/0 -> NaN waveform.
        wav /= peak
    wav = librosa.resample(y=wav, orig_sr=sampling_rate, target_sr=16_000)
    user_pron = language.asr(wav)
    # compare texts token-by-token for the highlighted diff
    d_toks = language.compare(target, user_pron)
    return (user_pron, d_toks)
# Center the main column and cap its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

# UI layout — creation order below fixes the on-page component order.
with gr.Blocks(css=css) as demo:
    gr.Markdown(" # PhonoLearn")
    sentence_box = gr.Textbox(label='Practice Sentence (Tâi-lô)')
    mic = gr.Audio(sources=["microphone", "upload"])
    transcript_box = gr.Textbox(label='Your Pronunciation')
    diff_view = gr.HighlightedText(
        label='Comparison',
        combine_adjacent=True,
        show_legend=True,
        color_map=language.compare_colors
    )
    # Re-run ASR + comparison whenever a new recording or upload arrives.
    mic.input(
        fn=infer,
        inputs=[mic, sentence_box],
        outputs=[transcript_box, diff_view]
    )

if __name__ == "__main__":
    demo.launch()
|