Issamohammed committed
Commit db55266 · verified · 1 Parent(s): 7705134

Update app.py

Files changed (1)
  1. app.py +49 -11
app.py CHANGED
@@ -1,21 +1,59 @@
-from pydub import AudioSegment
+import os
+import torch
+import gradio as gr
 import mimetypes
+from pydub import AudioSegment
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+# Set device and precision
+device = "cpu"
+torch_dtype = torch.float32
+
+# Load KB-Whisper model
+model_id = "KBLab/kb-whisper-large"
+
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype
+).to(device)
+
+processor = AutoProcessor.from_pretrained(model_id)
+
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    device=device,
+    torch_dtype=torch_dtype,
+)
 
 def transcribe(audio_path):
     try:
-        # Detect file type using MIME or extension
-        mime_type, _ = mimetypes.guess_type(audio_path)
+        # Get file extension
         ext = os.path.splitext(audio_path)[1].lower()
 
-        if mime_type == "audio/mp4" or ext == ".m4a":
-            print("Converting .m4a to .wav...")
-            sound = AudioSegment.from_file(audio_path, format="m4a")
-            converted_path = audio_path.replace(".m4a", ".converted.wav")
-            sound.export(converted_path, format="wav")
-            audio_path = converted_path
+        # Convert to WAV if not already
+        if ext != ".wav":
+            try:
+                sound = AudioSegment.from_file(audio_path)
+                converted_path = audio_path.replace(ext, ".converted.wav")
+                sound.export(converted_path, format="wav")
+                audio_path = converted_path
+            except Exception as e:
+                return f"Error converting audio to WAV: {str(e)}"
 
+        # Transcribe
         result = pipe(audio_path, chunk_length_s=30, generate_kwargs={"task": "transcribe", "language": "sv"})
         return result["text"]
-
+
     except Exception as e:
-        return f"Error during transcription: {str(e)}"
+        return f"Transcription failed: {str(e)}"
+
+# Gradio UI
+gr.Interface(
+    fn=transcribe,
+    inputs=gr.Audio(type="filepath", label="Upload Audio (.m4a, .mp3, .wav)"),
+    outputs=gr.Textbox(label="Swedish Transcript"),
+    title="Swedish Speech Transcriber with KB-Whisper",
+    description="Supports .m4a, .mp3, .wav files. Transcribes spoken Swedish using KBLab's Whisper Large model. May take time on CPU.",
+).launch()
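
A note on the new conversion step: str.replace substitutes every occurrence of the extension string in the path, so audio_path.replace(ext, ".converted.wav") can also rewrite a directory name that happens to contain the extension. Below is a minimal sketch of a suffix-only variant, assuming the same pydub setup as in app.py (ffmpeg must be available on PATH for .m4a/.mp3 decoding); the helper name to_wav is hypothetical and not part of this commit.

import os
from pydub import AudioSegment  # decoding .m4a/.mp3 relies on ffmpeg being on PATH

def to_wav(audio_path):
    """Return a WAV copy of audio_path, converting only when needed (hypothetical helper)."""
    base, ext = os.path.splitext(audio_path)
    if ext.lower() == ".wav":
        return audio_path  # already WAV, nothing to do
    converted_path = base + ".converted.wav"  # change only the suffix, never other path segments
    AudioSegment.from_file(audio_path).export(converted_path, format="wav")
    return converted_path

With such a helper, transcribe() would only need audio_path = to_wav(audio_path) before calling pipe.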