root
committed on
Commit
·
c95399f
1
Parent(s):
5d5eb0f
ss
Browse files
app.py
CHANGED
@@ -24,7 +24,12 @@ from utils import (
|
|
24 |
)
|
25 |
from emotionanalysis import MusicAnalyzer
|
26 |
import librosa
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
28 |
import tempfile
|
29 |
import os
|
30 |
import soundfile as sf
|
@@ -3196,8 +3201,13 @@ def detect_voice_activity(audio_file):
|
|
3196 |
print("To use voice activity detection:")
|
3197 |
print("1. Create an account at https://huggingface.co")
|
3198 |
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
3199 |
-
print("3. Accept the terms for pyannote
|
3200 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
3201 |
|
3202 |
# Create fallback segments based on audio duration
|
3203 |
# This creates segments approximately every 5 seconds
|
@@ -3224,8 +3234,36 @@ def detect_voice_activity(audio_file):
|
|
3224 |
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
3225 |
return estimated_segments
|
3226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3227 |
# Initialize the voice activity detection pipeline
|
3228 |
try:
|
|
|
3229 |
vad_pipeline = Pipeline.from_pretrained(
|
3230 |
"pyannote/voice-activity-detection",
|
3231 |
use_auth_token=hf_token
|
@@ -4339,18 +4377,27 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
4339 |
# Add voice detection info box
|
4340 |
with gr.Accordion("Voice Activity Detection", open=True):
|
4341 |
gr.Markdown("""
|
4342 |
-
|
4343 |
-
|
4344 |
-
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
4345 |
-
|
4346 |
-
**Important:** This model requires Hugging Face authentication:
|
4347 |
-
|
4348 |
-
1. Create an account at [huggingface.co](https://huggingface.co)
|
4349 |
-
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
4350 |
-
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
4351 |
-
4. Set the HF_TOKEN environment variable
|
4352 |
|
4353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4354 |
""")
|
4355 |
|
4356 |
with gr.Column(scale=2):
|
|
|
24 |
)
|
25 |
from emotionanalysis import MusicAnalyzer
|
26 |
import librosa
|
27 |
+
try:
|
28 |
+
from pyannote.audio import Pipeline
|
29 |
+
PYANNOTE_AVAILABLE = True
|
30 |
+
except ImportError:
|
31 |
+
print("WARNING: pyannote.audio is not properly installed. Voice detection will use fallback mode.")
|
32 |
+
PYANNOTE_AVAILABLE = False
|
33 |
import tempfile
|
34 |
import os
|
35 |
import soundfile as sf
|
|
|
3201 |
print("To use voice activity detection:")
|
3202 |
print("1. Create an account at https://huggingface.co")
|
3203 |
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
3204 |
+
print("3. Accept the terms for pyannote models at:")
|
3205 |
+
print(" - https://huggingface.co/pyannote/segmentation")
|
3206 |
+
print(" - https://huggingface.co/pyannote/voice-activity-detection")
|
3207 |
+
print("4. Set 'pyannote' environment variable with your token:")
|
3208 |
+
print(" - Linux/Mac: export pyannote=your_token_here")
|
3209 |
+
print(" - Windows: set pyannote=your_token_here")
|
3210 |
+
print(" - Hugging Face Spaces: Add a 'pyannote' Secret in Settings")
|
3211 |
|
3212 |
# Create fallback segments based on audio duration
|
3213 |
# This creates segments approximately every 5 seconds
|
|
|
3234 |
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
3235 |
return estimated_segments
|
3236 |
|
3237 |
+
# Check if pyannote is available
|
3238 |
+
if not PYANNOTE_AVAILABLE:
|
3239 |
+
print("pyannote.audio is not available. Using fallback voice detection.")
|
3240 |
+
# Create fallback segments based on audio duration
|
3241 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
3242 |
+
duration = extract_audio_duration(y, sr)
|
3243 |
+
|
3244 |
+
# Create segments of up to 4.5 seconds each, separated by 1-second gaps
|
3245 |
+
estimated_segments = []
|
3246 |
+
segment_duration = 4.5
|
3247 |
+
gap_duration = 1.0
|
3248 |
+
|
3249 |
+
current_pos = 0.0
|
3250 |
+
while current_pos < duration:
|
3251 |
+
segment_end = min(current_pos + segment_duration, duration)
|
3252 |
+
estimated_segments.append({
|
3253 |
+
"start": current_pos,
|
3254 |
+
"end": segment_end,
|
3255 |
+
"duration": segment_end - current_pos
|
3256 |
+
})
|
3257 |
+
current_pos = segment_end + gap_duration
|
3258 |
+
if current_pos >= duration:
|
3259 |
+
break
|
3260 |
+
|
3261 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
3262 |
+
return estimated_segments
|
3263 |
+
|
3264 |
# Initialize the voice activity detection pipeline
|
3265 |
try:
|
3266 |
+
print(f"Attempting to load pyannote/voice-activity-detection with auth token: {'[PROVIDED]' if hf_token else '[MISSING]'}")
|
3267 |
vad_pipeline = Pipeline.from_pretrained(
|
3268 |
"pyannote/voice-activity-detection",
|
3269 |
use_auth_token=hf_token
|
|
|
4377 |
# Add voice detection info box
|
4378 |
with gr.Accordion("Voice Activity Detection", open=True):
|
4379 |
gr.Markdown("""
|
4380 |
+
### Voice Detection Authentication Required
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4381 |
|
4382 |
+
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
4383 |
+
|
4384 |
+
**Important:** This model requires Hugging Face authentication:
|
4385 |
+
|
4386 |
+
1. Create an account at [huggingface.co](https://huggingface.co)
|
4387 |
+
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
4388 |
+
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
4389 |
+
4. Set the "pyannote" environment variable with your token:
|
4390 |
+
- In Linux/Mac: `export pyannote="your_token_here"`
|
4391 |
+
- In Windows: `set pyannote=your_token_here`
|
4392 |
+
- In Hugging Face Spaces: Add a "pyannote" Secret in the Settings tab
|
4393 |
+
|
4394 |
+
Without authentication, the app will use estimated segments based on audio duration.
|
4395 |
+
|
4396 |
+
**Technical Note:** If you're having trouble with authentication, make sure:
|
4397 |
+
1. The pyannote.audio package is properly installed
|
4398 |
+
2. You've accepted the model terms at [huggingface.co/pyannote/voice-activity-detection](https://huggingface.co/pyannote/voice-activity-detection)
|
4399 |
+
3. The provided token has READ access permission
|
4400 |
+
4. You've added hf.co to your allowed domains if using a scoped token
|
4401 |
""")
|
4402 |
|
4403 |
with gr.Column(scale=2):
|