Spaces:

Ganbatte
/

kyutaistt

Running

File size: 1,946 Bytes

f94df04

import gradio as gr
import subprocess
import os
import tempfile
import sys

HF_REPO = "kyutai/stt-2.6b-en"    # Model repo to load; passed to the moshi CLI via --hf-repo

def transcribe(audio_path: str) -> str:
    """Transcribe an audio file by running the moshi inference CLI.

    Runs ``python -m moshi.run_inference --hf-repo <HF_REPO> <audio_path>``
    with the current interpreter and returns the last non-empty line of its
    standard output.

    Args:
        audio_path: Path to the audio file to transcribe. ``None`` or an
            empty string (no audio supplied) yields an empty transcript.

    Returns:
        The last non-empty stdout line with surrounding whitespace removed,
        ``"(no output)"`` if the CLI printed nothing, or ``""`` when no
        audio path was given.

    Raises:
        RuntimeError: If the CLI exits with a non-zero status; the message
            carries the exit code and the captured stderr.
    """
    # Treat every "no audio" value alike: an empty path would otherwise be
    # handed to the CLI as a bogus positional argument.
    if not audio_path:
        return ""

    # Passed as an argument list (no shell), so the path is never
    # shell-interpreted.
    cmd = [
        sys.executable, "-m", "moshi.run_inference",
        "--hf-repo", HF_REPO,
        audio_path,
    ]
    # Capture all of stdout/stderr as text; stdout is parsed below.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Never raise with a blank message, even if the CLI wrote nothing
        # to stderr before failing.
        details = result.stderr.strip() or "(no stderr output)"
        raise RuntimeError(
            f"moshi.run_inference exited with code {result.returncode}: {details}"
        )

    # NOTE(review): per the original author's note, the CLI prints results
    # line by line and the last line is the complete transcript -- confirm
    # against moshi's actual output format.
    lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    return lines[-1] if lines else "(no output)"

# Gradio UI: a single audio input (file upload or microphone recording, handed
# to `transcribe` as a file path) and a text box that shows the transcript.
demo = gr.Interface(
    title="Kyutai STT-2.6B (Streaming ASR)",
    description=(
        "อัปโหลดหรืออัดเสียงภาษาอังกฤษ แล้วกด Submit เพื่อถอดเสียงด้วยโมเดลขนาด 2.6 B "
        "(ใช้ CLI ของ moshi ภายใน Space)"
    ),
    fn=transcribe,
    inputs=gr.Audio(
        label="Audio (16-32 kHz)",
        type="filepath",
        sources=["upload", "microphone"],
    ),
    outputs=gr.Textbox(label="Transcription"),
)

if __name__ == "__main__":
    # Launch with Gradio's default settings. Original author's note:
    # share=True would automatically create a public URL when running on a
    # "Community GPU" Space (it is not passed here).
    demo.launch()