VladRet2026 committed on
Commit
369d759
·
1 Parent(s): bd1a487

OptimizedWhisperWorking

Browse files
Files changed (2) hide show
  1. Dockerfile +7 -2
  2. app.py +73 -5
Dockerfile CHANGED
@@ -2,7 +2,12 @@ FROM python:3.11-slim
2
 
3
  ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
4
  PATH=/home/user/.local/bin:$PATH PORT=7860 \
5
- WHISPER_MODEL=openai/whisper-large-v3-turbo
 
 
 
 
 
6
 
7
  RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
8
  && rm -rf /var/lib/apt/lists/* \
@@ -18,4 +23,4 @@ COPY --chown=user app.py ./
18
  COPY --chown=user extractors/ ./extractors/
19
 
20
  EXPOSE 7860
21
- CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "4", "--timeout", "120", "app:app"]
 
2
 
3
  ENV PYTHONUNBUFFERED=1 PIP_NO_CACHE_DIR=1 HOME=/home/user \
4
  PATH=/home/user/.local/bin:$PATH PORT=7860 \
5
+ WHISPER_MODEL=openai/whisper-large-v3-turbo \
6
+ OMP_NUM_THREADS=2 MKL_NUM_THREADS=2 OPENBLAS_NUM_THREADS=2 NUMEXPR_NUM_THREADS=2 \
7
+ TOKENIZERS_PARALLELISM=false \
8
+ WHISPER_CPU_THREADS=2 WHISPER_CPU_INTEROP_THREADS=1 \
9
+ WHISPER_CHUNK_LENGTH_S=30 WHISPER_BATCH_SIZE=8 WHISPER_NUM_BEAMS=1 \
10
+ WHISPER_ENABLE_PROMPT=0 WHISPER_PRELOAD_ON_START=1
11
 
12
  RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
13
  && rm -rf /var/lib/apt/lists/* \
 
23
  COPY --chown=user extractors/ ./extractors/
24
 
25
  EXPOSE 7860
26
+ CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--worker-class", "sync", "--workers", "1", "--timeout", "180", "--preload", "app:app"]
app.py CHANGED
@@ -31,6 +31,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
31
 
32
  _WHISPER_MODEL: Optional[Any] = None
33
  _WHISPER_PROCESSOR: Optional[Any] = None
 
34
 
35
 
36
  app = Flask(__name__)
@@ -140,18 +141,48 @@ TEST_PHRASES = [
140
  ]
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def get_whisper_pipeline() -> Any:
144
  """Возвращает Whisper pipeline (ленивая загрузка)."""
145
  global _WHISPER_MODEL, _WHISPER_PROCESSOR
 
146
 
147
  if _WHISPER_MODEL is None:
148
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
149
 
150
  model_id = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
 
151
 
152
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
153
  model_id,
154
- dtype=torch.float32,
155
  low_cpu_mem_usage=True,
156
  use_safetensors=True,
157
  )
@@ -159,13 +190,18 @@ def get_whisper_pipeline() -> Any:
159
 
160
  _WHISPER_PROCESSOR = AutoProcessor.from_pretrained(model_id)
161
 
 
 
 
162
  _WHISPER_MODEL = pipeline(
163
  "automatic-speech-recognition",
164
  model=model,
165
  tokenizer=_WHISPER_PROCESSOR.tokenizer,
166
  feature_extractor=_WHISPER_PROCESSOR.feature_extractor,
167
- dtype=torch.float32,
168
  device="cpu",
 
 
169
  )
170
 
171
  return _WHISPER_MODEL
@@ -306,22 +342,38 @@ def transcribe_audio_text(audio_path: str, suppliers: list[str] | None = None, u
306
 
307
  try:
308
  t0 = time.time()
 
309
  pipe = get_whisper_pipeline()
 
310
 
311
  generate_kwargs = {
312
  "language": "russian",
313
  "task": "transcribe",
 
 
 
314
  }
315
 
316
- prompt = build_whisper_prompt(suppliers or [], users or [])
317
- if prompt and _WHISPER_PROCESSOR is not None:
 
 
 
 
 
318
  try:
319
  generate_kwargs["prompt_ids"] = _WHISPER_PROCESSOR.get_prompt_ids(prompt, return_tensors="pt")
320
  print(f"[TIMINGS] whisper_prompt_enabled: suppliers={len(suppliers or [])}, users={len(users or [])}")
321
  except Exception as prompt_error:
322
  print(f"[WARN] Whisper prompt disabled: {prompt_error}")
 
 
 
 
 
 
 
323
 
324
- result = pipe(audio_path, generate_kwargs=generate_kwargs)
325
  text = result.get("text", "").strip()
326
  elapsed = round(time.time() - t0, 3)
327
  print(f"[TIMINGS] whisper_transcribe: {elapsed}s")
@@ -401,6 +453,22 @@ def parse_context(raw: str | None) -> dict[str, Any]:
401
  return {}
402
 
403
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
  # ============================================================================
405
  # ENDPOINTS
406
  # ============================================================================
 
31
 
32
  _WHISPER_MODEL: Optional[Any] = None
33
  _WHISPER_PROCESSOR: Optional[Any] = None
34
+ _TORCH_CPU_CONFIGURED = False
35
 
36
 
37
  app = Flask(__name__)
 
141
  ]
142
 
143
 
144
def env_flag(name: str, default: bool = False) -> bool:
    """Read a boolean flag from an environment variable.

    The values "1", "true", "yes" and "on" (case-insensitive, surrounding
    whitespace ignored) count as True; any other set value counts as False.
    When the variable is unset, *default* is returned.
    """
    value = os.getenv(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")
150
+
151
+
152
def configure_torch_for_cpu() -> None:
    """Configure torch thread pools for CPU inference (runs once per process).

    Reads WHISPER_CPU_THREADS and WHISPER_CPU_INTEROP_THREADS from the
    environment, clamps them to sane ranges and applies them to torch.
    Malformed env values fall back to the defaults instead of crashing
    worker startup.
    """
    global _TORCH_CPU_CONFIGURED
    if _TORCH_CPU_CONFIGURED:
        return

    cpu_count = max(1, os.cpu_count() or 1)

    # int() on a garbage env value would otherwise raise ValueError and
    # kill the worker before it can serve a single request.
    try:
        num_threads = int(os.getenv("WHISPER_CPU_THREADS", str(cpu_count)))
    except ValueError:
        num_threads = cpu_count
    num_threads = max(1, min(num_threads, cpu_count))

    try:
        interop_threads = int(os.getenv("WHISPER_CPU_INTEROP_THREADS", "1"))
    except ValueError:
        interop_threads = 1
    interop_threads = max(1, interop_threads)

    torch.set_num_threads(num_threads)
    try:
        # torch only allows the interop pool size to be set before any
        # inter-op parallel work has started; tolerate a late call.
        torch.set_num_interop_threads(interop_threads)
    except RuntimeError as thread_error:
        print(f"[WARN] could not set interop threads: {thread_error}")
    torch.backends.mkldnn.enabled = True
    _TORCH_CPU_CONFIGURED = True
    print(f"[INFO] torch cpu threads configured: intra={num_threads}, interop={interop_threads}")
170
+
171
+
172
  def get_whisper_pipeline() -> Any:
173
  """Возвращает Whisper pipeline (ленивая загрузка)."""
174
  global _WHISPER_MODEL, _WHISPER_PROCESSOR
175
+ configure_torch_for_cpu()
176
 
177
  if _WHISPER_MODEL is None:
178
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
179
 
180
  model_id = os.getenv("WHISPER_MODEL", "openai/whisper-large-v3-turbo")
181
+ torch_dtype = torch.float32
182
 
183
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
184
  model_id,
185
+ torch_dtype=torch_dtype,
186
  low_cpu_mem_usage=True,
187
  use_safetensors=True,
188
  )
 
190
 
191
  _WHISPER_PROCESSOR = AutoProcessor.from_pretrained(model_id)
192
 
193
+ chunk_length_s = int(os.getenv("WHISPER_CHUNK_LENGTH_S", "30"))
194
+ batch_size = int(os.getenv("WHISPER_BATCH_SIZE", "8"))
195
+
196
  _WHISPER_MODEL = pipeline(
197
  "automatic-speech-recognition",
198
  model=model,
199
  tokenizer=_WHISPER_PROCESSOR.tokenizer,
200
  feature_extractor=_WHISPER_PROCESSOR.feature_extractor,
201
+ torch_dtype=torch_dtype,
202
  device="cpu",
203
+ chunk_length_s=max(0, chunk_length_s),
204
+ batch_size=max(1, batch_size),
205
  )
206
 
207
  return _WHISPER_MODEL
 
342
 
343
  try:
344
  t0 = time.time()
345
+ pipeline_t0 = time.time()
346
  pipe = get_whisper_pipeline()
347
+ print(f"[TIMINGS] whisper_pipeline_ready: {round(time.time() - pipeline_t0, 3)}s")
348
 
349
  generate_kwargs = {
350
  "language": "russian",
351
  "task": "transcribe",
352
+ "num_beams": int(os.getenv("WHISPER_NUM_BEAMS", "1")),
353
+ "do_sample": False,
354
+ "condition_on_prev_text": False,
355
  }
356
 
357
+ use_prompt = env_flag("WHISPER_ENABLE_PROMPT", default=False)
358
+ prompt = ""
359
+ if use_prompt:
360
+ max_items = int(os.getenv("WHISPER_PROMPT_MAX_ITEMS", "12"))
361
+ prompt = build_whisper_prompt(suppliers or [], users or [], max_items=max_items)
362
+
363
+ if use_prompt and prompt and _WHISPER_PROCESSOR is not None:
364
  try:
365
  generate_kwargs["prompt_ids"] = _WHISPER_PROCESSOR.get_prompt_ids(prompt, return_tensors="pt")
366
  print(f"[TIMINGS] whisper_prompt_enabled: suppliers={len(suppliers or [])}, users={len(users or [])}")
367
  except Exception as prompt_error:
368
  print(f"[WARN] Whisper prompt disabled: {prompt_error}")
369
+ elif not use_prompt:
370
+ print("[TIMINGS] whisper_prompt_disabled")
371
+
372
+ infer_t0 = time.time()
373
+ with torch.inference_mode():
374
+ result = pipe(audio_path, generate_kwargs=generate_kwargs)
375
+ print(f"[TIMINGS] whisper_infer_only: {round(time.time() - infer_t0, 3)}s")
376
 
 
377
  text = result.get("text", "").strip()
378
  elapsed = round(time.time() - t0, 3)
379
  print(f"[TIMINGS] whisper_transcribe: {elapsed}s")
 
453
  return {}
454
 
455
 
456
def preload_whisper_if_enabled() -> None:
    """Preload Whisper at process start so the first request avoids a cold start.

    Controlled by the WHISPER_PRELOAD_ON_START flag (defaults to enabled);
    a failed preload is logged and swallowed so the app still boots.
    """
    if not env_flag("WHISPER_PRELOAD_ON_START", default=True):
        return

    t0 = time.time()
    try:
        get_whisper_pipeline()
        print(f"[TIMINGS] whisper_preload: {round(time.time() - t0, 3)}s")
    except Exception as exc:
        print(f"[WARN] Whisper preload failed: {exc}")
467
+
468
+
469
+ preload_whisper_if_enabled()
470
+
471
+
472
  # ============================================================================
473
  # ENDPOINTS
474
  # ============================================================================