Spaces:
Sleeping
Sleeping
Merge remote-tracking branch 'origin/main' into RevertedWhisperTurbuAndDeleteTmpDebug
Browse files
- Dockerfile +2 -2
- app.py +21 -2
Dockerfile
CHANGED
|
@@ -2,13 +2,13 @@ FROM python:3.11-slim
|
|
| 2 |
|
| 3 |
ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
|
| 4 |
PATH=/home/user/.local/bin:$PATH PORT=7860 \
|
| 5 |
-
WHISPER_MODEL=deepdml/faster-whisper-large-v3-
|
| 6 |
OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 \
|
| 7 |
TOKENIZERS_PARALLELISM=false \
|
| 8 |
WHISPER_CPU_THREADS=2 WHISPER_NUM_BEAMS=1 \
|
| 9 |
WHISPER_VAD_FILTER=0 WHISPER_PRELOAD_ON_START=1 \
|
| 10 |
WHISPER_BACKEND=auto WHISPER_REMOTE_PROVIDER=hf-inference \
|
| 11 |
-
WHISPER_REMOTE_MODEL=openai/whisper-large-v3
|
| 12 |
WHISPER_REMOTE_TIMEOUT=15 WHISPER_PREPROCESS_AUDIO=1
|
| 13 |
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
|
|
|
|
| 2 |
|
| 3 |
ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
|
| 4 |
PATH=/home/user/.local/bin:$PATH PORT=7860 \
|
| 5 |
+
WHISPER_MODEL=deepdml/faster-whisper-large-v3-ct2 \
|
| 6 |
OMP_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 \
|
| 7 |
TOKENIZERS_PARALLELISM=false \
|
| 8 |
WHISPER_CPU_THREADS=2 WHISPER_NUM_BEAMS=1 \
|
| 9 |
WHISPER_VAD_FILTER=0 WHISPER_PRELOAD_ON_START=1 \
|
| 10 |
WHISPER_BACKEND=auto WHISPER_REMOTE_PROVIDER=hf-inference \
|
| 11 |
+
WHISPER_REMOTE_MODEL=openai/whisper-large-v3 \
|
| 12 |
WHISPER_REMOTE_TIMEOUT=15 WHISPER_PREPROCESS_AUDIO=1
|
| 13 |
|
| 14 |
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
|
app.py
CHANGED
|
@@ -286,7 +286,7 @@ def transcribe_audio_remote(audio_path: str) -> tuple[str, float]:
|
|
| 286 |
"""Транскрибирует аудио через HF Inference."""
|
| 287 |
started = time.time()
|
| 288 |
client = get_hf_asr_client()
|
| 289 |
-
model_id = os.getenv("WHISPER_REMOTE_MODEL", "openai/whisper-large-v3
|
| 290 |
|
| 291 |
result = client.automatic_speech_recognition(audio=audio_path, model=model_id)
|
| 292 |
text = (getattr(result, "text", None) or "").strip()
|
|
@@ -328,7 +328,7 @@ def get_whisper_model() -> Any:
|
|
| 328 |
if _WHISPER_MODEL is None:
|
| 329 |
from faster_whisper import WhisperModel
|
| 330 |
|
| 331 |
-
model_id = os.getenv("WHISPER_MODEL", "deepdml/faster-whisper-large-v3-
|
| 332 |
cpu_threads = max(1, int(os.getenv("WHISPER_CPU_THREADS", "2")))
|
| 333 |
|
| 334 |
_WHISPER_MODEL = WhisperModel(
|
|
@@ -374,6 +374,8 @@ class ExpenseTextExtractor:
|
|
| 374 |
t0 = time.time()
|
| 375 |
date_info = self.date_extractor.extract(text, reference_date=reference_date, debug=debug)
|
| 376 |
timings["date_extractor"] = round(time.time() - t0, 3)
|
|
|
|
|
|
|
| 377 |
|
| 378 |
t0 = time.time()
|
| 379 |
supplier_info = self.supplier_extractor.extract(
|
|
@@ -382,6 +384,8 @@ class ExpenseTextExtractor:
|
|
| 382 |
debug=debug,
|
| 383 |
)
|
| 384 |
timings["supplier_extractor"] = round(time.time() - t0, 3)
|
|
|
|
|
|
|
| 385 |
|
| 386 |
t0 = time.time()
|
| 387 |
user_info = self.user_extractor.extract(
|
|
@@ -391,6 +395,8 @@ class ExpenseTextExtractor:
|
|
| 391 |
debug=debug,
|
| 392 |
)
|
| 393 |
timings["user_extractor"] = round(time.time() - t0, 3)
|
|
|
|
|
|
|
| 394 |
|
| 395 |
t0 = time.time()
|
| 396 |
amount_info = self.amount_extractor.extract(
|
|
@@ -400,6 +406,8 @@ class ExpenseTextExtractor:
|
|
| 400 |
debug=debug,
|
| 401 |
)
|
| 402 |
timings["amount_extractor"] = round(time.time() - t0, 3)
|
|
|
|
|
|
|
| 403 |
|
| 404 |
if debug:
|
| 405 |
print(f"[TIMINGS] {timings}")
|
|
@@ -506,6 +514,13 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
|
|
| 506 |
user_names = extract_names(context.get("users"))
|
| 507 |
|
| 508 |
transcript, whisper_time = transcribe_audio_text(audio_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
if mode == "notes":
|
| 511 |
notes = polish_notes_text(transcript)
|
|
@@ -531,6 +546,8 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
|
|
| 531 |
print(f"[TIMINGS] pipeline_init: {pipeline_init_time}s")
|
| 532 |
|
| 533 |
extracted = extractor.extract(transcript, reference_date=date.today().isoformat(), debug=debug)
|
|
|
|
|
|
|
| 534 |
|
| 535 |
total_time = round(time.time() - total_start, 3)
|
| 536 |
print(f"[TIMINGS] TOTAL: {total_time}s (whisper: {whisper_time}s)")
|
|
@@ -546,6 +563,8 @@ def process_voice_request(audio_path: str, mode: str, payload: dict[str, Any], d
|
|
| 546 |
}
|
| 547 |
if debug and extracted.get("debug"):
|
| 548 |
payload["debug"] = extracted.get("debug")
|
|
|
|
|
|
|
| 549 |
return payload
|
| 550 |
|
| 551 |
|
|
|
|
| 286 |
"""Транскрибирует аудио через HF Inference."""
|
| 287 |
started = time.time()
|
| 288 |
client = get_hf_asr_client()
|
| 289 |
+
model_id = os.getenv("WHISPER_REMOTE_MODEL", "openai/whisper-large-v3")
|
| 290 |
|
| 291 |
result = client.automatic_speech_recognition(audio=audio_path, model=model_id)
|
| 292 |
text = (getattr(result, "text", None) or "").strip()
|
|
|
|
| 328 |
if _WHISPER_MODEL is None:
|
| 329 |
from faster_whisper import WhisperModel
|
| 330 |
|
| 331 |
+
model_id = os.getenv("WHISPER_MODEL", "deepdml/faster-whisper-large-v3-ct2")
|
| 332 |
cpu_threads = max(1, int(os.getenv("WHISPER_CPU_THREADS", "2")))
|
| 333 |
|
| 334 |
_WHISPER_MODEL = WhisperModel(
|
|
|
|
| 374 |
t0 = time.time()
|
| 375 |
date_info = self.date_extractor.extract(text, reference_date=reference_date, debug=debug)
|
| 376 |
timings["date_extractor"] = round(time.time() - t0, 3)
|
| 377 |
+
if debug:
|
| 378 |
+
print(f"[DEBUG][DATE] {date_info}")
|
| 379 |
|
| 380 |
t0 = time.time()
|
| 381 |
supplier_info = self.supplier_extractor.extract(
|
|
|
|
| 384 |
debug=debug,
|
| 385 |
)
|
| 386 |
timings["supplier_extractor"] = round(time.time() - t0, 3)
|
| 387 |
+
if debug:
|
| 388 |
+
print(f"[DEBUG][SUPPLIER] {supplier_info}")
|
| 389 |
|
| 390 |
t0 = time.time()
|
| 391 |
user_info = self.user_extractor.extract(
|
|
|
|
| 395 |
debug=debug,
|
| 396 |
)
|
| 397 |
timings["user_extractor"] = round(time.time() - t0, 3)
|
| 398 |
+
if debug:
|
| 399 |
+
print(f"[DEBUG][USER] {user_info}")
|
| 400 |
|
| 401 |
t0 = time.time()
|
| 402 |
amount_info = self.amount_extractor.extract(
|
|
|
|
| 406 |
debug=debug,
|
| 407 |
)
|
| 408 |
timings["amount_extractor"] = round(time.time() - t0, 3)
|
| 409 |
+
if debug:
|
| 410 |
+
print(f"[DEBUG][AMOUNT] {amount_info}")
|
| 411 |
|
| 412 |
if debug:
|
| 413 |
print(f"[TIMINGS] {timings}")
|
|
|
|
| 514 |
user_names = extract_names(context.get("users"))
|
| 515 |
|
| 516 |
transcript, whisper_time = transcribe_audio_text(audio_path)
|
| 517 |
+
if debug:
|
| 518 |
+
print(f"[DEBUG][TRANSCRIPT] {transcript}")
|
| 519 |
+
print(
|
| 520 |
+
f"[DEBUG][CONTEXT] suppliers_count={len(supplier_names)}, users_count={len(user_names)}"
|
| 521 |
+
)
|
| 522 |
+
print(f"[DEBUG][SUPPLIERS] {supplier_names}")
|
| 523 |
+
print(f"[DEBUG][USERS] {user_names}")
|
| 524 |
|
| 525 |
if mode == "notes":
|
| 526 |
notes = polish_notes_text(transcript)
|
|
|
|
| 546 |
print(f"[TIMINGS] pipeline_init: {pipeline_init_time}s")
|
| 547 |
|
| 548 |
extracted = extractor.extract(transcript, reference_date=date.today().isoformat(), debug=debug)
|
| 549 |
+
if debug:
|
| 550 |
+
print(f"[DEBUG][EXTRACTED_RAW] {extracted}")
|
| 551 |
|
| 552 |
total_time = round(time.time() - total_start, 3)
|
| 553 |
print(f"[TIMINGS] TOTAL: {total_time}s (whisper: {whisper_time}s)")
|
|
|
|
| 563 |
}
|
| 564 |
if debug and extracted.get("debug"):
|
| 565 |
payload["debug"] = extracted.get("debug")
|
| 566 |
+
if debug:
|
| 567 |
+
print(f"[DEBUG][RESPONSE_PAYLOAD] {payload}")
|
| 568 |
return payload
|
| 569 |
|
| 570 |
|