import string
import torch
from transformers import pipeline
import gradio as gr

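# PhoWhisper-large is VinAI's Whisper model fine-tuned on Vietnamese speech;
# BATCH_SIZE sets how many audio chunks are decoded per forward pass.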
MODEL_NAME = "vinai/PhoWhisper-large"
BATCH_SIZE = 8

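# Run on the first GPU when available, otherwise fall back to the CPU.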
device = 0 if torch.cuda.is_available() else "cpu"

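# chunk_length_s=30 enables transformers' chunked long-form transcription:
# audio longer than Whisper's 30-second context is split into chunks,
# decoded in batches, and stitched back together.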
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
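    # e.g. format_timestamp(3661.5) -> "01:01:01.500"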
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        # we have a malformed timestamp so just return it as is
        return seconds


def transcribe(file, phrase, return_timestamps):
    outputs = pipe(file, batch_size=BATCH_SIZE, return_timestamps=return_timestamps)
    text = outputs["text"]
    # Grade against the plain transcription: strip punctuation and ignore case
    # so formatting differences don't count against the speaker.
    text_nopunc = text.translate(str.maketrans("", "", string.punctuation))
    if text_nopunc.strip().lower() == phrase.strip().lower():
        grade = "good!"
    else:
        grade = "could use some work..."
    if return_timestamps:
        text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in outputs["chunks"]
        )
    return text, grade


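# Two Interfaces share the same transcribe function; they are presented as
# "microphone" and "file upload" tabs inside a single Blocks app below.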
demo = gr.Blocks()

# Shared description for both tabs; Gradio renders it as Markdown.
DESCRIPTION = (
    "This space transcribes Vietnamese words, phrases, and sentences from a microphone "
    "recording or an uploaded audio file, then compares your text input to what the "
    "language model hears. You are then given a grade telling you how well your spoken "
    f"audio matches the text you entered. [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) "
    "is a Vietnamese speech-to-text model and powers the analysis of the audio."
)

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath", label="Microphone"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=DESCRIPTION,
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=DESCRIPTION,
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
    cache_examples=True,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Pronounce via Microphone", "Pronounce via Audio File"],
    )

# Queue requests so long transcriptions don't time out.
demo.queue().launch()