palli23 committed
Commit cc6ae2a · 1 Parent(s): 3785c6a

fix transcribe bug

Files changed (2):
  1. app.py +41 -68
  2. requirements.txt +6 -5
app.py CHANGED
@@ -1,80 +1,53 @@
 
  import os
  import gradio as gr
  import spaces
- import torch
  from transformers import pipeline

  MODEL_NAME = "palli23/whisper-small-sam_spjall"

- print("Loading optimized Whisper Small for T4...")
-
- # Load once + T4-specific optimizations
- pipe = pipeline(
-     "automatic-speech-recognition",
-     model=MODEL_NAME,
-     torch_dtype=torch.float16,  # FP16 = 2x faster, <4GB VRAM on T4
-     device="cuda",
-     model_kwargs={
-         "attn_implementation": "flash_attention_2",  # 20–30% faster attention
-         "use_cache": True,
-     },
-     token=os.getenv("HF_TOKEN")
- )
-
- # Pre-set Icelandic to skip language detection overhead
- pipe.model.generation_config.language = "is"
- pipe.model.generation_config.task = "transcribe"
-
- print(f"Model ready! VRAM used: {torch.cuda.memory_allocated() / 1e9:.1f}GB")
-
- @spaces.GPU  # No duration cap: let the T4 run free
- def transcribe(audio_path):
      if not audio_path:
-         return "Upload audio first"

-     try:
-         # Clear cache to prevent OOM aborts
-         torch.cuda.empty_cache()
-
-         result = pipe(
-             audio_path,
-             chunk_length_s=15,       # Shorter = faster on T4 (less recompute)
-             batch_size=32,           # Max for the T4's 16GB VRAM
-             stride_length_s=(3, 1),  # Minimal overlap = speed win
-             return_timestamps=False,
-             generate_kwargs={
-                 "do_sample": False,  # Deterministic, faster
-                 "num_beams": 1,      # No beam search = 2x faster
-             }
-         )
-         text = result["text"].strip()
-
-         # Post-clear to free VRAM
-         torch.cuda.empty_cache()
-
-         return f"✅ Done in {torch.cuda.max_memory_allocated() / 1e9:.1f}GB VRAM\n\n{text}"
-
-     except RuntimeError as e:
-         if "out of memory" in str(e):
-             return "❌ OOM error: try shorter audio (<3min). VRAM spiked too high."
-         raise gr.Error(f"GPU task failed: {str(e)}")  # Catch & re-raise as a Gradio error
-
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# Icelandic Whisper Small – T4 Optimized (No Aborts)")
-     gr.Markdown("Upload <5min audio → Expect **10–20s** (monitors VRAM to prevent kills)")
-
-     audio = gr.Audio(type="filepath", label="Audio (mp3/wav, <5min for best speed)")
-     btn = gr.Button("Transcribe", variant="primary")

-     # Add VRAM status for debugging
-     status = gr.Markdown("VRAM: Ready")
-
-     out = gr.Textbox(label="Transcription", lines=25, show_copy_button=True)
-
-     def update_status():
-         vram = torch.cuda.memory_allocated() / 1e9
-         return f"VRAM: {vram:.1f}GB used"

-     btn.click(transcribe, audio, out).then(update_status, outputs=status)

- demo.launch(auth=("beta", "beta2025"), max_threads=4)  # Queue for concurrency
 
+ # app.py – ZeroGPU SAFE – 3 min of audio without "GPU task aborted"
  import os
  import gradio as gr
  import spaces
  from transformers import pipeline
+ import numpy as np
+ import librosa

  MODEL_NAME = "palli23/whisper-small-sam_spjall"

+ @spaces.GPU(duration=60)  # At most 60 s of GPU time: within the ZeroGPU allowance
+ def transcribe_safe(audio_path):
      if not audio_path:
+         return "Upload an audio file first"

+     # Load the audio and split it into 20 s chunks (very safe)
+     audio, sr = librosa.load(audio_path, sr=16000)
+     chunk_len = 16000 * 20  # 20 s at 16 kHz
+     stride = 16000 * 2      # 2 s overlap
+     chunks = []
+     for i in range(0, len(audio), chunk_len - stride):
+         chunk = audio[i:i + chunk_len]
+         if len(chunk) < 16000:  # stop below 1 s
+             break
+         chunks.append(chunk)

+     # Load the ASR pipeline onto the GPU (weights come from the local HF cache)
+     pipe = pipeline(
+         "automatic-speech-recognition",
+         model=MODEL_NAME,
+         device=0,
+         token=os.getenv("HF_TOKEN")
+     )
+
+     full_text = ""
+     for idx, chunk in enumerate(chunks):
+         result = pipe(chunk, batch_size=8)
+         full_text += result["text"] + " "
+
+     return full_text.strip() or "Nothing heard"

+ # Gradio UI, polished and ready for 3-minute clips
+ with gr.Blocks(title="Icelandic ASR – 3 min ZeroGPU") as demo:
+     gr.Markdown("# Icelandic ASR – 3 min of audio")
+     gr.Markdown("**~4% WER · 25–45 s · ZeroGPU (PRO)**")
+
+     audio = gr.Audio(type="filepath", label="Upload .mp3 / .wav (up to 3 min)")
+     btn = gr.Button("Transcribe (25–45 s)", variant="primary", size="lg")
+     out = gr.Textbox(lines=30, label="Transcription")
+
+     btn.click(transcribe_safe, inputs=audio, outputs=out)

+ demo.launch(auth=("beta", "beta2025"))
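Two details in the new transcribe_safe are worth flagging. The raw NumPy chunks are passed to the pipeline without an attached sampling rate, which only works because librosa.load(..., sr=16000) already matches Whisper's expected 16 kHz, and the pipeline object is rebuilt on every request (only the downloaded weights are cached on disk). A minimal sketch of a more defensive variant, assuming it is called from inside the @spaces.GPU function; the get_pipe and transcribe_chunk helpers are illustrative, not part of this commit:

import os
from functools import lru_cache

from transformers import pipeline

MODEL_NAME = "palli23/whisper-small-sam_spjall"

@lru_cache(maxsize=1)
def get_pipe():
    # Build the pipeline once per process; later calls reuse the same object.
    return pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=os.getenv("HF_TOKEN"),
    )

def transcribe_chunk(chunk, sr=16000):
    # Passing the rate explicitly guards against a loader that stops forcing 16 kHz.
    return get_pipe()({"raw": chunk, "sampling_rate": sr})["text"]

Note also that stitching together transcripts of chunks that overlap by 2 s tends to repeat words at the seams; the pipeline's own chunk_length_s and stride_length_s arguments (used in the removed version) perform that merging internally.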
requirements.txt CHANGED
@@ -1,5 +1,6 @@
- gradio>=4.44
- transformers>=4.45
- torch>=2.4
- accelerate
- spaces
+ gradio
+ transformers
+ torch
+ spaces
+ librosa
+ soundfile
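The dependency list drops the version floors and accelerate while adding librosa and soundfile (librosa decodes audio through soundfile). If rebuilds should stay reproducible, a pinned variant reusing the floors from the removed file could look like this; the new code itself mandates none of them:

gradio>=4.44
transformers>=4.45
torch>=2.4
spaces
librosa
soundfile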