CleanSong committed on
Commit
61386ba
·
verified ·
1 Parent(s): 2d2ba0c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -18
app.py CHANGED
@@ -1,41 +1,53 @@
1
  import gradio as gr
2
- import whisper
3
  import torch
4
  import torchaudio
5
  import os, json
 
6
 
7
- # preload model once when the Space wakes up
8
  device = "cuda" if torch.cuda.is_available() else "cpu"
9
  MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
10
- model = whisper.load_model(MODEL_NAME, device=device)
 
 
 
 
 
 
11
 
12
  def transcribe(file_path):
13
- # --- fix sample rate and channels ---
14
  wav, sr = torchaudio.load(file_path)
15
  if sr != 16000:
16
  wav = torchaudio.functional.resample(wav, sr, 16000)
17
  if wav.shape[0] > 1:
18
- wav = wav.mean(dim=0, keepdim=True)
19
  fixed_path = "input_fixed.wav"
20
  torchaudio.save(fixed_path, wav, 16000)
21
 
22
- # --- run Whisper ---
23
- result = model.transcribe(fixed_path, word_timestamps=True, fp16=False)
 
 
 
 
 
 
24
 
25
- # --- build simple list of transcript entries ---
26
  transcript = []
27
- for seg in result["segments"]:
28
- for w in seg.get("words", []):
29
  transcript.append({
30
- "word": w["word"].strip(),
31
- "start": w["start"],
32
- "end": w["end"]
33
  })
 
34
  if not transcript:
35
- # fallback to segment-level text
36
- transcript = [{"text": seg["text"], "start": seg["start"], "end": seg["end"]}
37
- for seg in result["segments"]]
38
 
 
39
  return transcript
40
 
41
 
@@ -43,8 +55,8 @@ iface = gr.Interface(
43
  fn=transcribe,
44
  inputs=gr.Audio(type="filepath", label="Upload Vocals"),
45
  outputs=gr.JSON(label="Transcript"),
46
- title="CleanSong AI — Whisper Transcriber",
47
- description="Transcribes vocals with per-word timestamps (16 kHz mono, Whisper Base)."
48
  )
49
 
50
  if __name__ == "__main__":
 
1
  import gradio as gr
 
2
  import torch
3
  import torchaudio
4
  import os, json
5
+ from faster_whisper import WhisperModel
6
 
7
# === Load model once ===
# Preload at import time so the Space answers its first request without a
# cold model download/initialization.
_cuda_available = torch.cuda.is_available()

device = "cuda" if _cuda_available else "cpu"
MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
# float16 on GPU; int8 quantization keeps CPU inference memory-friendly.
COMPUTE_TYPE = "float16" if _cuda_available else "int8"

model = WhisperModel(
    MODEL_NAME,
    device=device,
    compute_type=COMPUTE_TYPE,
)
17
 
18
def transcribe(file_path):
    """Transcribe an uploaded audio file to word-level timestamp entries.

    Parameters
    ----------
    file_path : str
        Path to the uploaded audio file (any format torchaudio can decode).

    Returns
    -------
    list[dict]
        One dict per word: ``{"word", "start", "end"}`` (seconds).  Falls
        back to segment-level entries ``{"text", "start", "end"}`` when no
        word timestamps were produced.
    """
    # --- Ensure proper audio format: 16 kHz mono WAV ---
    wav, sr = torchaudio.load(file_path)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)  # downmix to mono
    fixed_path = "input_fixed.wav"
    torchaudio.save(fixed_path, wav, 16000)

    # --- Transcribe ---
    # NOTE: faster-whisper's transcribe() has no `suppress_silence`
    # parameter (passing it raised TypeError on every call); the VAD
    # filter already handles silent stretches.
    segments, info = model.transcribe(
        fixed_path,
        beam_size=5,
        word_timestamps=True,
        vad_filter=True,  # helps prevent timestamp drift across pauses
    )
    # `segments` is a lazy generator: materialize it once, otherwise the
    # word loop below exhausts it and the segment-level fallback always
    # sees an empty iterator.
    segments = list(segments)

    # --- Build transcript list ---
    transcript = []
    for seg in segments:
        for w in seg.words or []:  # words can be None for a segment
            transcript.append({
                "word": w.word.strip(),
                "start": w.start,
                "end": w.end,
            })

    if not transcript:
        # Fallback: segment-level text when no word timestamps exist.
        transcript = [{"text": seg.text, "start": seg.start, "end": seg.end}
                      for seg in segments]

    print(f"✅ Transcribed {len(transcript)} words")
    return transcript
52
 
53
 
 
55
  fn=transcribe,
56
  inputs=gr.Audio(type="filepath", label="Upload Vocals"),
57
  outputs=gr.JSON(label="Transcript"),
58
+ title="CleanSong AI — Whisper Transcriber (Faster-Whisper Large-V3)",
59
+ description="High-accuracy transcription with precise per-word timestamps at 16 kHz mono (float16)."
60
  )
61
 
62
  if __name__ == "__main__":