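"""PhonoLearn: a Gradio app for pronunciation practice.

The user types a target sentence in Tâi-lô romanization, records (or uploads)
themselves reading it, and the app transcribes the audio with an ASR model and
highlights where the transcription diverges from the target.
"""
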
import gradio as gr
import numpy as np
import librosa

# import spaces #[uncomment to use ZeroGPU]
import torch

# ################ CHANGE THIS TO CHANGE THE LANGUAGE ###################### #
from TaiwaneseHokkien import TaiwaneseHokkien 
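# Interface assumed from how `language` is used below:
#   .asr(wav)             -> str: transcription of 16 kHz mono float32 audio
#   .compare(target, hyp) -> list of (token, label) pairs for HighlightedText
#   .compare_colors       -> dict mapping labels to highlight colors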

device = "cuda" if torch.cuda.is_available() else "cpu"
# Fine-tuned XLS-R ASR checkpoint; presumably loaded inside TaiwaneseHokkien
# rather than referenced directly in this file.
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"

# Half precision halves memory on GPU; CPUs generally need float32.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

language = TaiwaneseHokkien(device=device, torch_dtype=torch_dtype)
# ########################################################################## #

# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(audio, target):
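    """Transcribe the user's recording and diff it against the target sentence.

    `audio` is the (sampling_rate, waveform) tuple delivered by
    gr.Audio(type="numpy"); `target` is the practice sentence in Tâi-lô.
    Returns (transcription, diff_tokens) for the Textbox and HighlightedText
    outputs, or (None, None) when either input is missing.
    """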
    if not isinstance(audio, tuple) or not isinstance(target, str):
        return (None, None)
    
    # Preprocess: downmix to mono, convert to float32, peak-normalize,
    # then resample to the 16 kHz rate the ASR model expects.
    sampling_rate, wav = audio
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)
    peak = np.max(np.abs(wav))
    if peak > 0:  # avoid dividing by zero on silent recordings
        wav /= peak
    wav = librosa.resample(y=wav, orig_sr=sampling_rate, target_sr=16_000)
    
    # Transcribe, then diff the transcription against the target text.
    user_pron = language.asr(wav)
    d_toks = language.compare(target, user_pron)
    return (user_pron, d_toks)

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    # The column picks up the #col-container rule above, centering the UI.
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# PhonoLearn")
        target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
        input_audio = gr.Audio(
            sources=["microphone", "upload"],
            type="numpy",  # infer() expects a (sampling_rate, waveform) tuple
        )
        output = gr.Textbox(label='Your Pronunciation')
        diff = gr.HighlightedText(
            label='Comparison',
            combine_adjacent=True,
            show_legend=True,
            color_map=language.compare_colors,
        )
        # Re-run inference whenever a new recording or upload arrives.
        input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])
    
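# Entry point: start the Gradio server when the script is run directly.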
if __name__ == "__main__":
    demo.launch()