File size: 2,509 Bytes
16ff511
6a91da6
bae72eb
6a91da6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85eedc6
6a91da6
85eedc6
6a91da6
 
 
 
85eedc6
 
 
 
 
 
 
 
 
 
 
 
 
 
6a91da6
 
85eedc6
 
 
6a91da6
 
 
85eedc6
6a91da6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import spaces
import time
import os

import torch
import gradio as gr
from transformers.pipelines import pipeline
import utils

from config import (
    MODEL_PATHS,
    SUPPORTED_LANGUAGES,
    CUSTOM_CSS,
)

# Select the demo language here; must be one of config.SUPPORTED_LANGUAGES
# (currently: en, de, lb).
LANGUAGE = "lb"
if LANGUAGE not in SUPPORTED_LANGUAGES:
    # Fail fast at import time with a clear message rather than crashing
    # later on a KeyError when the model path is looked up.
    print(f"language ({LANGUAGE}) not supported. Use one of {SUPPORTED_LANGUAGES}")
    # `raise SystemExit` instead of `exit()`: exit() is a site.py convenience
    # that is absent under `python -S` or in embedded interpreters.
    raise SystemExit(1)
MODEL_PATH = MODEL_PATHS[LANGUAGE]

# Lazily-initialized ASR pipeline; created on first transcription request
# inside the GPU-allocated worker (see transcribe_gradio).
_asr_pipeline = None

@spaces.GPU
def transcribe_gradio(audio_path: str | None) -> str:
    """Transcribe the audio file at *audio_path* and report inference time.

    Returns the transcript followed by the elapsed wall-clock time, or a
    user-facing warning/error string when no audio was supplied or the
    pipeline raised.
    """
    global _asr_pipeline

    # Guard clause: Gradio passes None when nothing was recorded/uploaded.
    if not audio_path:
        return "⚠️  Please record something or choose a file first."

    # Build the ASR pipeline once and reuse it across calls; construction
    # (model download + load) dominates the first request's latency.
    if _asr_pipeline is None:
        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=MODEL_PATH,
            device=0 if torch.cuda.is_available() else -1,
            chunk_length_s=30,
            stride_length_s=(4, 2),
            batch_size=8,
            token=os.getenv("HF_TOKEN"),
        )

    t0 = time.time()
    try:
        output = _asr_pipeline(audio_path)
        transcript = output["text"] if isinstance(output, dict) else str(output)
    except Exception as err:  # surface the failure in the UI instead of crashing
        return f"❌ {err}"
    elapsed = time.time() - t0

    return f"{transcript}\n\nβŒ› Inference time: {elapsed:.2f} s"

# gradio interface: two-column layout (audio in / transcript out) plus a
# button row wiring transcribe_gradio to the widgets.
with gr.Blocks(title="Wave2Vec (Luxembourgish) ", theme="soft", css=CUSTOM_CSS) as demo:
    gr.Markdown("""
    # πŸŽ™οΈ Speech-to-Text Demo β€” Wave2Vec (Luxembourgish) 
    Use **Record** to capture speech live or **Upload** to select an audio file (.wav, .mp3, .flac).  
    Hit **Transcribe** to convert your recording into text, and **Clear** to reset both fields.
    """)

    # Left: microphone/file input; right: read-only transcript box.
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Input audio",
            autoplay=False,
        )
        output_text = gr.Textbox(
            label="Transcript",
            placeholder="Your transcript will appear here …",
            show_copy_button=True,
            lines=10,
        )

    # Button row (the unused `as row` alias was removed).
    with gr.Row(equal_height=True, elem_classes="centered-row"):
        transcribe_btn = gr.Button("Transcribe ✨", scale=0)
        # ClearButton resets both the audio input and the transcript box.
        clear_btn = gr.ClearButton(
            [audio_input, output_text], scale=0, elem_classes="clear-btn"
        )

    transcribe_btn.click(transcribe_gradio, inputs=audio_input, outputs=output_text)


if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    demo.launch()