Spaces:

gunnerforlife52
/

shuka-demo

Sleeping

gunnerforlife52 Claude committed on Oct 5

Commit

e64f574

1 Parent(s): 78680b9

Fix: Improve model loading and audio processing

Major changes:
- Changed device=0 to device_map="auto" for HF Spaces compatibility
- Increased audio cap from 30 to 60 seconds (Whisper may chunk internally)
- Added comprehensive error handling and debugging
- Improved response parsing with multiple fallback methods
- Added detailed error traceback for troubleshooting

This should fix issues with the model not responding to queries.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +23 -7

app.py CHANGED Viewed

@@ -36,13 +36,14 @@ def load_model():
         pipe = transformers.pipeline(
             model=MODEL_ID,
             trust_remote_code=True,   # required for Shuka custom pipeline
-            device=0,
             torch_dtype="bfloat16",
         )
         print("✅ Pipeline loaded successfully!")
         return "✅ Model pipeline loaded successfully!"
     except Exception as e:
-        err = f"❌ Error loading model: {e}"
         print(err)
         return err
@@ -89,8 +90,9 @@ def load_audio_from_gradio(audio_input):
         audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
         sr = TARGET_SR
-    # Safety cap at 30 seconds (mel features limited to 3000)
-    max_sec = 30
     if len(audio) / float(sr) > max_sec:
         audio = audio[: int(max_sec * sr)]
@@ -143,10 +145,24 @@ def analyze_audio(audio_file, system_prompt):
             {"audio": audio, "turns": turns, "sampling_rate": sr},
             max_new_tokens=512,
         )
-        text = out[0].get("generated_text", str(out)) if isinstance(out, list) and out else str(out)
         return f"✅ Processed.\n\n{text}"
     except Exception as e:
-        return f"❌ Inference error: {e}"
 # ---------------------------
@@ -162,7 +178,7 @@ with gr.Blocks(title="Shuka v1 (8.73B) — Audio Analyzer", theme=gr.themes.Soft
     **Shuka** is a multilingual audio-language model with strong capabilities in **11 Indic languages** including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, Kannada, Malayalam, Punjabi, Odia, and Assamese.
-    ⚠️ **Note:** Audio is automatically capped at 30 seconds due to model constraints.
     """)
     with gr.Row():

         pipe = transformers.pipeline(
             model=MODEL_ID,
             trust_remote_code=True,   # required for Shuka custom pipeline
+            device_map="auto",  # Use auto device mapping for HF Spaces
             torch_dtype="bfloat16",
         )
         print("✅ Pipeline loaded successfully!")
         return "✅ Model pipeline loaded successfully!"
     except Exception as e:
+        import traceback
+        err = f"❌ Error loading model: {e}\n\n{traceback.format_exc()}"
         print(err)
         return err
         audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
         sr = TARGET_SR
+    # Safety cap at 60 seconds to allow for longer queries
+    # Note: Whisper encoder has mel features limit, but may chunk internally
+    max_sec = 60
     if len(audio) / float(sr) > max_sec:
         audio = audio[: int(max_sec * sr)]
             {"audio": audio, "turns": turns, "sampling_rate": sr},
             max_new_tokens=512,
         )
+        # Debug: print raw output
+        print(f"Raw output type: {type(out)}")
+        print(f"Raw output: {out}")
+        # Extract text from response
+        if isinstance(out, list) and len(out) > 0:
+            text = out[0].get("generated_text", str(out[0]))
+        elif isinstance(out, dict):
+            text = out.get("generated_text", str(out))
+        else:
+            text = str(out)
         return f"✅ Processed.\n\n{text}"
     except Exception as e:
+        import traceback
+        error_details = traceback.format_exc()
+        print(f"Full error: {error_details}")
+        return f"❌ Inference error: {e}\n\nDetails:\n{error_details}"
 # ---------------------------
     **Shuka** is a multilingual audio-language model with strong capabilities in **11 Indic languages** including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, Kannada, Malayalam, Punjabi, Odia, and Assamese.
+    ⚠️ **Note:** Audio is automatically capped at 60 seconds. For best results, use clear audio recordings.
     """)
     with gr.Row():