Spaces:

Quartz4065
/

AudioTranscriber

Sleeping

App Files Files Community

Quartz4065 committed on Sep 18, 2025

Commit

54358d8

verified ·

1 Parent(s): bc616a3

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -59

app.py CHANGED Viewed

@@ -1,79 +1,98 @@
 import os
-import tempfile
-from typing import List, Optional
-from fastapi import FastAPI, File, Form, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from faster_whisper import WhisperModel
-APP_PORT = int(os.environ.get("PORT", "7860"))
-_models = {}
-def get_model(name: str):
-    if name not in _models:
-        _models[name] = WhisperModel(
-            name, compute_type="int8", cpu_threads=os.cpu_count() or 2
         )
-    return _models[name]
-class Segment(BaseModel):
-    start: float
-    end: float
-    text: str
 class TranscribeOut(BaseModel):
     text: str
-    segments: List[Segment]
     duration_sec: Optional[float] = None
-    words: Optional[int] = None
     wpm: Optional[float] = None
-    model: str
-app = FastAPI(title="Nuvia Free Transcriber")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"], allow_credentials=True,
-    allow_methods=["*"], allow_headers=["*"],
-)
-@app.get("/health")
 def health():
-    return {"ok": True}
 @app.post("/transcribe", response_model=TranscribeOut)
-def transcribe(file: UploadFile = File(...), model: str = Form("base.en")):
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
-        tmp.write(file.file.read())
-        tmp_path = tmp.name
     try:
-        m = get_model(model)
-        segments, info = m.transcribe(tmp_path, vad_filter=True)
-        segs = []
-        total_words = 0
-        for s in segments:
-            txt = s.text.strip()
-            segs.append(Segment(start=float(s.start), end=float(s.end), text=txt))
-            total_words += len(txt.split())
-        dur = float(info.duration) if getattr(info, "duration", None) else None
-        wpm = None
-        if dur and dur > 0:
-            wpm = round(total_words / (dur / 60.0), 2)
-        full_text = " ".join([s.text for s in segs]).strip()
-        return TranscribeOut(
-            text=full_text,
-            segments=segs,
-            duration_sec=dur,
-            words=total_words,
-            wpm=wpm,
-            model=model
-        )
-    finally:
-        try:
-            os.remove(tmp_path)
-        except Exception:
-            pass

+import io
 import os
+import math
+import subprocess
+from typing import Optional
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+# Optional CORS (safe default in Spaces)
+from fastapi.middleware.cors import CORSMiddleware
+# Transcription (CPU)
 from faster_whisper import WhisperModel
+import soundfile as sf
+# ---------- App ----------
+app = FastAPI(title="Nuvia Free Transcriber", version="1.0.0")
+# CORS is left wide open: every origin, method and header is accepted.
+# NOTE(review): the FastAPI CORS docs say wildcard origins/methods/headers
+# must not be combined with allow_credentials=True — confirm whether
+# credentials are needed at all, then drop the flag or list explicit origins.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"], allow_credentials=True,
+    allow_methods=["*"], allow_headers=["*"],
+)
+# ---------- Model load (CPU, small for free tier) ----------
+# You can switch to "base.en" if needed; "tiny.en" is faster.
+# The model name is read from the WHISPER_MODEL environment variable and
+# falls back to "tiny.en". The model object is built once, at import time,
+# and is the one the /transcribe handler calls.
+MODEL_NAME = os.environ.get("WHISPER_MODEL", "tiny.en")
+model = WhisperModel(MODEL_NAME, device="cpu", compute_type="int8")
+# ---------- Helpers ----------
+def ffprobe_duration(path: str) -> Optional[float]:
+    """Return the duration, in seconds, of the media file at *path*.
+
+    Runs ``ffprobe`` and parses the ``format=duration`` entry it prints.
+    The lookup is best-effort: ``None`` is returned when ffprobe cannot be
+    started, exits with an error, or prints something that is not a number.
+    """
+    try:
+        out = subprocess.check_output(
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+             "-of", "default=noprint_wrappers=1:nokey=1", path]
         )
+        return float(out.decode("utf-8").strip())
+    # OSError: ffprobe missing or not executable; SubprocessError: non-zero
+    # exit; ValueError: undecodable or non-numeric output.
+    except (OSError, subprocess.SubprocessError, ValueError):
+        return None
+def word_count(text: str) -> int:
+    """Return the number of whitespace-separated words in *text*."""
+    # str.split() with no argument already discards empty and
+    # whitespace-only pieces, so no extra filtering is required.
+    return len(text.split())
+# ---------- Schemas ----------
+class HealthOut(BaseModel):
+    """Response body of the ``/`` and ``/health`` routes."""
+    # Both visible handlers always set this to True.
+    ok: bool
 class TranscribeOut(BaseModel):
+    """Response body of ``POST /transcribe``."""
+    # Full transcript: the segment texts concatenated, then stripped.
     text: str
+    # Length of the converted audio in seconds as reported by ffprobe;
+    # None when ffprobe could not provide it.
     duration_sec: Optional[float] = None
+    # Words per minute, rounded to one decimal; None when the duration is
+    # missing or not positive.
     wpm: Optional[float] = None
+# ---------- Routes ----------
+# Returns the same body as /health so that a bare GET / is also answered.
+@app.get("/", response_model=HealthOut)
+def root():
+    """Root route so probes and GPT 'test connection' don’t 404."""
+    return HealthOut(ok=True)
+# Liveness probe: unconditionally reports ok=True and does not touch the model.
+@app.get("/health", response_model=HealthOut)
 def health():
+    return HealthOut(ok=True)
 @app.post("/transcribe", response_model=TranscribeOut)
+async def transcribe(file: UploadFile = File(...)):
+    """Transcribe an uploaded audio file to English text.
+
+    The upload is written to a private scratch directory, converted with
+    ffmpeg to 16 kHz mono WAV, measured with ffprobe and then passed to the
+    module-level Whisper model.
+
+    Returns a ``TranscribeOut`` with the transcript, the audio duration in
+    seconds (``None`` if ffprobe could not read it) and a best-effort
+    words-per-minute figure. When ffmpeg exits with an error, a 400 JSON
+    response ``{"error": ...}`` is returned instead.
+    """
+    # Imported here because the module's import block does not provide it.
+    import tempfile
+
+    # Read uploaded bytes
+    raw = await file.read()
+    # Keep original extension if present
+    ext = os.path.splitext(file.filename or "")[1].lower() or ".bin"
+    # One scratch directory per request: fixed /tmp file names would be
+    # shared by concurrent uploads (each overwriting the other's audio) and
+    # would never be removed. The directory is deleted on exit.
+    with tempfile.TemporaryDirectory(prefix="transcribe-") as workdir:
+        tmp_in = os.path.join(workdir, "infile" + ext)
+        tmp_wav = os.path.join(workdir, "in.wav")
+        with open(tmp_in, "wb") as f:
+            f.write(raw)
+        # Ensure we have a WAV for robust decode
+        # NOTE(review): ffmpeg, ffprobe and the model call below are blocking
+        # and run inside an async handler — consider moving them to a worker
+        # thread if the service must stay responsive during transcription.
+        try:
+            subprocess.check_call(["ffmpeg", "-y", "-i", tmp_in, "-ar", "16000", "-ac", "1", tmp_wav], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        except subprocess.CalledProcessError:
+            return JSONResponse(status_code=400, content={"error": "ffmpeg failed to decode the audio"})
+        # Duration via ffprobe (more accurate than guessing)
+        duration = ffprobe_duration(tmp_wav)
+        # Transcribe; the segments are consumed here, while the WAV still exists
+        segments, _ = model.transcribe(tmp_wav, language="en")
+        text = "".join(seg.text for seg in segments).strip()
+    # WPM (best-effort)
+    wpm = None
+    if duration and duration > 0:
+        wc = word_count(text)
+        wpm = round((wc / (duration / 60.0)), 1)
+    return TranscribeOut(text=text, duration_sec=duration, wpm=wpm)