Spaces:

VladRet2026
/

ConvertAudioToJSON

Sleeping

App Files Files

VladGeekPro committed 15 days ago

Commit

4b75cf2

2 Parent(s): 58d658a 392dfe4

Merge remote-tracking branch 'origin/main' into RevertedWhisperTurbuAndDeleteTmpDebug

Browse files

Files changed (2) hide show

Dockerfile +2 -2
app.py +21 -2

Dockerfile CHANGED Viewed

@@ -2,13 +2,13 @@ FROM python:3.11-slim
 ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH PORT=7860 \
-    WHISPER_MODEL=deepdml/faster-whisper-large-v3-turbo-ct2 \
     OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 \
     TOKENIZERS_PARALLELISM=false \
     WHISPER_CPU_THREADS=2 WHISPER_NUM_BEAMS=1 \
     WHISPER_VAD_FILTER=0 WHISPER_PRELOAD_ON_START=1 \
     WHISPER_BACKEND=auto WHISPER_REMOTE_PROVIDER=hf-inference \
-    WHISPER_REMOTE_MODEL=openai/whisper-large-v3-turbo \
     WHISPER_REMOTE_TIMEOUT=15 WHISPER_PREPROCESS_AUDIO=1
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \

 ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
     PATH=/home/user/.local/bin:$PATH PORT=7860 \
+    WHISPER_MODEL=deepdml/faster-whisper-large-v3-ct2 \
     OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 \
     TOKENIZERS_PARALLELISM=false \
     WHISPER_CPU_THREADS=2 WHISPER_NUM_BEAMS=1 \
     WHISPER_VAD_FILTER=0 WHISPER_PRELOAD_ON_START=1 \
     WHISPER_BACKEND=auto WHISPER_REMOTE_PROVIDER=hf-inference \
+    WHISPER_REMOTE_MODEL=openai/whisper-large-v3 \
     WHISPER_REMOTE_TIMEOUT=15 WHISPER_PREPROCESS_AUDIO=1
 RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \

app.py CHANGED Viewed

@@ -286,7 +286,7 @@ def transcribe_audio_remote(audio_path: str) -> tuple[str, float]:
     """Транскрибирует аудио через HF Inference."""
     started = time.time()
     client = get_hf_asr_client()
-    model_id = os.getenv("WHISPER_REMOTE_MODEL", "openai/whisper-large-v3-turbo")
     result = client.automatic_speech_recognition(audio=audio_path, model=model_id)
     text = (getattr(result, "text", None) or "").strip()
@@ -328,7 +328,7 @@ def get_whisper_model() -> Any:
     if _WHISPER_MODEL is None:
         from faster_whisper import WhisperModel
-        model_id = os.getenv("WHISPER_MODEL", "deepdml/faster-whisper-large-v3-turbo-ct2")
         cpu_threads = max(1, int(os.getenv("WHISPER_CPU_THREADS", "2")))
         _WHISPER_MODEL = WhisperModel(
@@ -374,6 +374,8 @@ class ExpenseTextExtractor:
         t0 = time.time()
         date_info = self.date_extractor.extract(text, reference_date=reference_date, debug=debug)
         timings["date_extractor"] = round(time.time() - t0, 3)
         t0 = time.time()
         supplier_info = self.supplier_extractor.extract(
@@ -382,6 +384,8 @@ class ExpenseTextExtractor:
             debug=debug,
         )
         timings["supplier_extractor"] = round(time.time() - t0, 3)
         t0 = time.time()
         user_info = self.user_extractor.extract(
@@ -391,6 +395,8 @@ class ExpenseTextExtractor:
             debug=debug,
         )
         timings["user_extractor"] = round(time.time() - t0, 3)
         t0 = time.time()
         amount_info = self.amount_extractor.extract(
@@ -400,6 +406,8 @@ class ExpenseTextExtractor:
             debug=debug,
         )
         timings["amount_extractor"] = round(time.time() - t0, 3)
         if debug:
             print(f"[TIMINGS] {timings}")
@@ -506,6 +514,13 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
     user_names = extract_names(context.get("users"))
     transcript, whisper_time = transcribe_audio_text(audio_path)
     if mode == "notes":
         notes = polish_notes_text(transcript)
@@ -531,6 +546,8 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
     print(f"[TIMINGS] pipeline_init: {pipeline_init_time}s")
     extracted = extractor.extract(transcript, reference_date=date.today().isoformat(), debug=debug)
     total_time = round(time.time() - total_start, 3)
     print(f"[TIMINGS] TOTAL: {total_time}s (whisper: {whisper_time}s)")
@@ -546,6 +563,8 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
     }
     if debug and extracted.get("debug"):
         payload["debug"] = extracted.get("debug")
     return payload

     """Транскрибирует аудио через HF Inference."""
     started = time.time()
     client = get_hf_asr_client()
+    model_id = os.getenv("WHISPER_REMOTE_MODEL", "openai/whisper-large-v3")
     result = client.automatic_speech_recognition(audio=audio_path, model=model_id)
     text = (getattr(result, "text", None) or "").strip()
     if _WHISPER_MODEL is None:
         from faster_whisper import WhisperModel
+        model_id = os.getenv("WHISPER_MODEL", "deepdml/faster-whisper-large-v3-ct2")
         cpu_threads = max(1, int(os.getenv("WHISPER_CPU_THREADS", "2")))
         _WHISPER_MODEL = WhisperModel(
         t0 = time.time()
         date_info = self.date_extractor.extract(text, reference_date=reference_date, debug=debug)
         timings["date_extractor"] = round(time.time() - t0, 3)
+        if debug:
+            print(f"[DEBUG][DATE] {date_info}")
         t0 = time.time()
         supplier_info = self.supplier_extractor.extract(
             debug=debug,
         )
         timings["supplier_extractor"] = round(time.time() - t0, 3)
+        if debug:
+            print(f"[DEBUG][SUPPLIER] {supplier_info}")
         t0 = time.time()
         user_info = self.user_extractor.extract(
             debug=debug,
         )
         timings["user_extractor"] = round(time.time() - t0, 3)
+        if debug:
+            print(f"[DEBUG][USER] {user_info}")
         t0 = time.time()
         amount_info = self.amount_extractor.extract(
             debug=debug,
         )
         timings["amount_extractor"] = round(time.time() - t0, 3)
+        if debug:
+            print(f"[DEBUG][AMOUNT] {amount_info}")
         if debug:
             print(f"[TIMINGS] {timings}")
     user_names = extract_names(context.get("users"))
     transcript, whisper_time = transcribe_audio_text(audio_path)
+    if debug:
+        print(f"[DEBUG][TRANSCRIPT] {transcript}")
+        print(
+            f"[DEBUG][CONTEXT] suppliers_count={len(supplier_names)}, users_count={len(user_names)}"
+        )
+        print(f"[DEBUG][SUPPLIERS] {supplier_names}")
+        print(f"[DEBUG][USERS] {user_names}")
     if mode == "notes":
         notes = polish_notes_text(transcript)
     print(f"[TIMINGS] pipeline_init: {pipeline_init_time}s")
     extracted = extractor.extract(transcript, reference_date=date.today().isoformat(), debug=debug)
+    if debug:
+        print(f"[DEBUG][EXTRACTED_RAW] {extracted}")
     total_time = round(time.time() - total_start, 3)
     print(f"[TIMINGS] TOTAL: {total_time}s (whisper: {whisper_time}s)")
     }
     if debug and extracted.get("debug"):
         payload["debug"] = extracted.get("debug")
+    if debug:
+        print(f"[DEBUG][RESPONSE_PAYLOAD] {payload}")
     return payload