Spaces:

mkfallah
/

pasr

Sleeping

App Files Community

mkfallah committed on Sep 4

Commit

5afd83b

verified ·

1 Parent(s): f609e9a

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -11

app.py CHANGED Viewed

@@ -5,14 +5,14 @@ import tempfile
 import soundfile as sf
 import numpy as np
-# --- Initialize ASR pipeline ---
 asr = pipeline(
     task="automatic-speech-recognition",
     model="vhdm/whisper-large-fa-v1",
-    device=-1  # CPU; for GPU device=0
 )
-# --- Custom vocabulary with multiple forms for accuracy ---
 custom_vocab_map = {
     "نرد": ["نرد", "نِرد", "نَرد"],
     "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
@@ -33,31 +33,31 @@ def replace_fuzzy(text, vocab_map, threshold=85):
 def transcribe(audio):
     """
-    Handle audio input from Gradio: tuple (numpy array, sample_rate) or file path
     """
     if audio is None:
         return "No audio input detected."
-    # If tuple (numpy array + sample_rate)
-    if isinstance(audio, tuple):
         data, sr = audio
         data = np.asarray(data)
-        # Convert mono to 2D array for soundfile
         if data.ndim == 1:
             data = np.expand_dims(data, axis=1)
         with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
             sf.write(tmp.name, data, samplerate=sr)
-            # Run ASR with chunking for long audio
             result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
-    else:
-        # If file path
         result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
     text = result.get("text", "")
     final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
     return final_text
-# --- Gradio interface ---
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="numpy", label="Record or upload audio"),

 import soundfile as sf
 import numpy as np
+# Initialize ASR pipeline
 asr = pipeline(
     task="automatic-speech-recognition",
     model="vhdm/whisper-large-fa-v1",
+    device=-1  # CPU; set device=0 for GPU
 )
+# Custom vocabulary with multiple forms for accuracy
 custom_vocab_map = {
     "نرد": ["نرد", "نِرد", "نَرد"],
     "کامپیوتر": ["کامپیوتر", "کامپیوتره"],
 def transcribe(audio):
     """
+    Handle audio input from Gradio: tuple (numpy array, sample_rate) or file path.
     """
     if audio is None:
         return "No audio input detected."
+    # If audio is a tuple (numpy array, sample_rate)
+    if isinstance(audio, tuple) and len(audio) == 2:
         data, sr = audio
         data = np.asarray(data)
         if data.ndim == 1:
             data = np.expand_dims(data, axis=1)
         with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
             sf.write(tmp.name, data, samplerate=sr)
             result = asr(tmp.name, chunk_length_s=30, stride_length_s=[5,5])
+    elif isinstance(audio, str):
+        # If audio is a file path
         result = asr(audio, chunk_length_s=30, stride_length_s=[5,5])
+    else:
+        return "Unsupported audio input type."
     text = result.get("text", "")
     final_text = replace_fuzzy(text, custom_vocab_map, threshold=85)
     return final_text
+# Gradio interface
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="numpy", label="Record or upload audio"),