import string

import gradio as gr
import torch
from transformers import pipeline
MODEL_NAME = "vinai/PhoWhisper-large"
BATCH_SIZE = 8
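# Use the first CUDA GPU when available; transformers pipelines accept an
# integer device index or "cpu".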
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
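
# Illustrative sketch (not part of the app): the pipeline can also be called
# directly. "sample.wav" is a placeholder path assumed for this example.
#
#   result = pipe("sample.wav", batch_size=BATCH_SIZE, return_timestamps=True)
#   result["text"]    # full transcription string
#   result["chunks"]  # [{"timestamp": (start, end), "text": "..."}, ...]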
# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        # we have a malformed timestamp so just return it as is
        return seconds
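
# For reference, a couple of expected outputs:
#   format_timestamp(5.0)    -> "00:05.000"
#   format_timestamp(3661.5) -> "01:01:01.500"  (hours appear once nonzero)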
def transcribe(file, target_text, return_timestamps):
    # NB: this parameter was originally named `string`, which shadowed the
    # imported `string` module and broke the punctuation stripping below.
    outputs = pipe(file, batch_size=BATCH_SIZE, return_timestamps=return_timestamps)
    text = outputs["text"]

    # Grade against the plain transcription before any timestamp formatting,
    # otherwise the bracketed timestamps could never match the target text.
    text_nopunc = text.translate(str.maketrans("", "", string.punctuation))
    if text_nopunc.strip().lower() == target_text.strip().lower():
        grade = "good!"
    else:
        grade = "could use some work..."

    if return_timestamps:
        text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in outputs["chunks"]
        )

    return text, grade
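
# Illustrative local test (assumes a recording at "./example.flac", as in the
# examples below, and "xin chào" as a hypothetical target phrase):
#
#   text, grade = transcribe("./example.flac", "xin chào", return_timestamps=False)
#   print(text, grade)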
demo = gr.Blocks()
DESCRIPTION = (
    "This Space transcribes Vietnamese words, phrases, and sentences from a microphone "
    "recording or an uploaded audio file, then compares what the model hears to the text "
    "you entered. You are then given a grade telling you how closely your spoken audio "
    f"matches the target text. [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) is a "
    "Vietnamese speech-to-text model and powers the analysis of the audio."
)

# `gr.inputs.*`, the `source`/`optional`/`default` kwargs, and the `layout`/`theme`
# Interface kwargs are pre-Gradio-3; the components below are the current equivalents.
mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=DESCRIPTION,
    allow_flagging="never",
)
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Textbox(label="Word/Phrase"),
        gr.Checkbox(value=False, label="Return timestamps"),
    ],
    outputs=[gr.Textbox(label="What I heard..."), gr.Textbox(label="Grade")],
    title="Vietnamese Pronunciation Checker",
    description=DESCRIPTION,
    examples=[
        ["./example.flac", "transcribe", False],
        ["./example.flac", "transcribe", True],
    ],
    cache_examples=True,
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Pronounce via Microphone", "Pronounce via Audio File"],
    )

# `launch(enable_queue=True)` is the pre-Gradio-4 form; queue() is the current equivalent.
demo.queue().launch()