mkfallah committed on
Commit
5afd83b
·
verified ·
1 Parent(s): f609e9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -5,14 +5,14 @@ import tempfile
5
  import soundfile as sf
6
  import numpy as np
7
 
8
- # --- Initialize ASR pipeline ---
9
  asr = pipeline(
10
  task="automatic-speech-recognition",
11
  model="vhdm/whisper-large-fa-v1",
12
- device=-1 # CPU; for GPU device=0
13
  )
14
 
15
- # --- Custom vocabulary with multiple forms for accuracy ---
16
  custom_vocab_map = {
17
  "نرد": ["نرد", "نِرد", "نَرد"],
18
  "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
@@ -33,31 +33,31 @@ def replace_fuzzy(text, vocab_map, threshold=85):
33
 
34
  def transcribe(audio):
35
  """
36
- Handle audio input from Gradio: tuple (numpy array, sample_rate) or file path
37
  """
38
  if audio is None:
39
  return "No audio input detected."
40
 
41
- # If tuple (numpy array + sample_rate)
42
- if isinstance(audio, tuple):
43
  data, sr = audio
44
  data = np.asarray(data)
45
- # Convert mono to 2D array for soundfile
46
  if data.ndim == 1:
47
  data = np.expand_dims(data, axis=1)
48
  with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
49
  sf.write(tmp.name, data, samplerate=sr)
50
- # Run ASR with chunking for long audio
51
  result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
52
- else:
53
- # If file path
54
  result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
 
 
55
 
56
  text = result.get("text", "")
57
  final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
58
  return final_text
59
 
60
- # --- Gradio interface ---
61
  iface = gr.Interface(
62
  fn=transcribe,
63
  inputs=gr.Audio(type="numpy", label="Record or upload audio"),
 
5
  import soundfile as sf
6
  import numpy as np
7
 
8
+ # Initialize ASR pipeline
9
  asr = pipeline(
10
  task="automatic-speech-recognition",
11
  model="vhdm/whisper-large-fa-v1",
12
+ device=-1 # CPU; set device=0 for GPU
13
  )
14
 
15
+ # Custom vocabulary with multiple forms for accuracy
16
  custom_vocab_map = {
17
  "نرد": ["نرد", "نِرد", "نَرد"],
18
  "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
 
33
 
34
  def transcribe(audio):
35
  """
36
+ Handle audio input from Gradio: tuple (numpy array, sample_rate) or file path.
37
  """
38
  if audio is None:
39
  return "No audio input detected."
40
 
41
+ # If audio is a tuple (numpy array, sample_rate)
42
+ if isinstance(audio, tuple) and len(audio) == 2:
43
  data, sr = audio
44
  data = np.asarray(data)
 
45
  if data.ndim == 1:
46
  data = np.expand_dims(data, axis=1)
47
  with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
48
  sf.write(tmp.name, data, samplerate=sr)
 
49
  result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
50
+ elif isinstance(audio, str):
51
+ # If audio is a file path
52
  result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
53
+ else:
54
+ return "Unsupported audio input type."
55
 
56
  text = result.get("text", "")
57
  final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
58
  return final_text
59
 
60
+ # Gradio interface
61
  iface = gr.Interface(
62
  fn=transcribe,
63
  inputs=gr.Audio(type="numpy", label="Record or upload audio"),