Spaces:
Sleeping
Sleeping
Commit
·
e64f574
1
Parent(s):
78680b9
Fix: Improve model loading and audio processing
Browse files
Major changes:
- Changed device=0 to device_map="auto" for HF Spaces compatibility
- Increased audio cap from 30 to 60 seconds (Whisper may chunk internally)
- Added comprehensive error handling and debugging
- Improved response parsing with multiple fallback methods
- Added detailed error traceback for troubleshooting
This should fix issues with model not responding to queries.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -36,13 +36,14 @@ def load_model():
|
|
| 36 |
pipe = transformers.pipeline(
|
| 37 |
model=MODEL_ID,
|
| 38 |
trust_remote_code=True, # required for Shuka custom pipeline
|
| 39 |
-
|
| 40 |
torch_dtype="bfloat16",
|
| 41 |
)
|
| 42 |
print("β
Pipeline loaded successfully!")
|
| 43 |
return "β
Model pipeline loaded successfully!"
|
| 44 |
except Exception as e:
|
| 45 |
-
|
|
|
|
| 46 |
print(err)
|
| 47 |
return err
|
| 48 |
|
|
@@ -89,8 +90,9 @@ def load_audio_from_gradio(audio_input):
|
|
| 89 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
|
| 90 |
sr = TARGET_SR
|
| 91 |
|
| 92 |
-
# Safety cap at
|
| 93 |
-
|
|
|
|
| 94 |
if len(audio) / float(sr) > max_sec:
|
| 95 |
audio = audio[: int(max_sec * sr)]
|
| 96 |
|
|
@@ -143,10 +145,24 @@ def analyze_audio(audio_file, system_prompt):
|
|
| 143 |
{"audio": audio, "turns": turns, "sampling_rate": sr},
|
| 144 |
max_new_tokens=512,
|
| 145 |
)
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
return f"β
Processed.\n\n{text}"
|
| 148 |
except Exception as e:
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
|
| 152 |
# ---------------------------
|
|
@@ -162,7 +178,7 @@ with gr.Blocks(title="Shuka v1 (8.73B) β Audio Analyzer", theme=gr.themes.Soft
|
|
| 162 |
|
| 163 |
**Shuka** is a multilingual audio-language model with strong capabilities in **11 Indic languages** including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, Kannada, Malayalam, Punjabi, Odia, and Assamese.
|
| 164 |
|
| 165 |
-
⚠️ **Note:** Audio is automatically capped at
|
| 166 |
""")
|
| 167 |
|
| 168 |
with gr.Row():
|
|
|
|
| 36 |
pipe = transformers.pipeline(
|
| 37 |
model=MODEL_ID,
|
| 38 |
trust_remote_code=True, # required for Shuka custom pipeline
|
| 39 |
+
device_map="auto", # Use auto device mapping for HF Spaces
|
| 40 |
torch_dtype="bfloat16",
|
| 41 |
)
|
| 42 |
print("β
Pipeline loaded successfully!")
|
| 43 |
return "β
Model pipeline loaded successfully!"
|
| 44 |
except Exception as e:
|
| 45 |
+
import traceback
|
| 46 |
+
err = f"β Error loading model: {e}\n\n{traceback.format_exc()}"
|
| 47 |
print(err)
|
| 48 |
return err
|
| 49 |
|
|
|
|
| 90 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)
|
| 91 |
sr = TARGET_SR
|
| 92 |
|
| 93 |
+
# Safety cap at 60 seconds to allow for longer queries
|
| 94 |
+
# Note: Whisper encoder has mel features limit, but may chunk internally
|
| 95 |
+
max_sec = 60
|
| 96 |
if len(audio) / float(sr) > max_sec:
|
| 97 |
audio = audio[: int(max_sec * sr)]
|
| 98 |
|
|
|
|
| 145 |
{"audio": audio, "turns": turns, "sampling_rate": sr},
|
| 146 |
max_new_tokens=512,
|
| 147 |
)
|
| 148 |
+
# Debug: print raw output
|
| 149 |
+
print(f"Raw output type: {type(out)}")
|
| 150 |
+
print(f"Raw output: {out}")
|
| 151 |
+
|
| 152 |
+
# Extract text from response
|
| 153 |
+
if isinstance(out, list) and len(out) > 0:
|
| 154 |
+
text = out[0].get("generated_text", str(out[0]))
|
| 155 |
+
elif isinstance(out, dict):
|
| 156 |
+
text = out.get("generated_text", str(out))
|
| 157 |
+
else:
|
| 158 |
+
text = str(out)
|
| 159 |
+
|
| 160 |
return f"β
Processed.\n\n{text}"
|
| 161 |
except Exception as e:
|
| 162 |
+
import traceback
|
| 163 |
+
error_details = traceback.format_exc()
|
| 164 |
+
print(f"Full error: {error_details}")
|
| 165 |
+
return f"β Inference error: {e}\n\nDetails:\n{error_details}"
|
| 166 |
|
| 167 |
|
| 168 |
# ---------------------------
|
|
|
|
| 178 |
|
| 179 |
**Shuka** is a multilingual audio-language model with strong capabilities in **11 Indic languages** including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, Kannada, Malayalam, Punjabi, Odia, and Assamese.
|
| 180 |
|
| 181 |
+
⚠️ **Note:** Audio is automatically capped at 60 seconds. For best results, use clear audio recordings.
|
| 182 |
""")
|
| 183 |
|
| 184 |
with gr.Row():
|