Spaces:

jacob-c
/

syllables_matching_experiment

Paused

App Files Files Community

root committed 6 days ago

Commit

c95399f

1 Parent(s): 5d5eb0f

ss

Browse files

Files changed (1) hide show

app.py +61 -14

app.py CHANGED Viewed

@@ -24,7 +24,12 @@ from utils import (
 )
 from emotionanalysis import MusicAnalyzer
 import librosa
-from pyannote.audio import Pipeline
 import tempfile
 import os
 import soundfile as sf
@@ -3196,8 +3201,13 @@ def detect_voice_activity(audio_file):
             print("To use voice activity detection:")
             print("1. Create an account at https://huggingface.co")
             print("2. Generate a token at https://huggingface.co/settings/tokens")
-            print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
-            print("4. Set HF_TOKEN environment variable or provide it directly in the code")
             # Create fallback segments based on audio duration
             # This creates segments approximately every 5 seconds
@@ -3224,8 +3234,36 @@ def detect_voice_activity(audio_file):
             print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
             return estimated_segments
         # Initialize the voice activity detection pipeline
         try:
             vad_pipeline = Pipeline.from_pretrained(
                 "pyannote/voice-activity-detection",
                 use_auth_token=hf_token
@@ -4339,18 +4377,27 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
             # Add voice detection info box
             with gr.Accordion("Voice Activity Detection", open=True):
                 gr.Markdown("""
-                ### Voice Detection Authentication Required
-                This app uses pyannote/voice-activity-detection to identify vocal segments in music.
-                **Important:** This model requires Hugging Face authentication:
-                1. Create an account at [huggingface.co](https://huggingface.co)
-                2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
-                3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
-                4. Set the HF_TOKEN environment variable
-                Without authentication, the app will use estimated segments based on audio duration.
                 """)
         with gr.Column(scale=2):

 )
 from emotionanalysis import MusicAnalyzer
 import librosa
+try:
+    from pyannote.audio import Pipeline
+    PYANNOTE_AVAILABLE = True
+except ImportError:
+    print("WARNING: pyannote.audio is not properly installed. Voice detection will use fallback mode.")
+    PYANNOTE_AVAILABLE = False
 import tempfile
 import os
 import soundfile as sf
             print("To use voice activity detection:")
             print("1. Create an account at https://huggingface.co")
             print("2. Generate a token at https://huggingface.co/settings/tokens")
+            print("3. Accept the terms for pyannote models at:")
+            print("   - https://huggingface.co/pyannote/segmentation")
+            print("   - https://huggingface.co/pyannote/voice-activity-detection")
+            print("4. Set 'pyannote' environment variable with your token:")
+            print("   - Linux/Mac: export pyannote=your_token_here")
+            print("   - Windows: set pyannote=your_token_here")
+            print("   - Hugging Face Spaces: Add a 'pyannote' Secret in Settings")
             # Create fallback segments based on audio duration
             # This creates segments approximately every 5 seconds
             print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
             return estimated_segments
+        # Check if pyannote is available
+        if not PYANNOTE_AVAILABLE:
+            print("pyannote.audio is not available. Using fallback voice detection.")
+            # Create fallback segments based on audio duration
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            duration = extract_audio_duration(y, sr)
+            # Create segments of 4-5 seconds each, with small gaps between them
+            estimated_segments = []
+            segment_duration = 4.5
+            gap_duration = 1.0
+            current_pos = 0.0
+            while current_pos < duration:
+                segment_end = min(current_pos + segment_duration, duration)
+                estimated_segments.append({
+                    "start": current_pos,
+                    "end": segment_end,
+                    "duration": segment_end - current_pos
+                })
+                current_pos = segment_end + gap_duration
+                if current_pos >= duration:
+                    break
+            print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
+            return estimated_segments
         # Initialize the voice activity detection pipeline
         try:
+            print(f"Attempting to load pyannote/voice-activity-detection with auth token: {'[PROVIDED]' if hf_token else '[MISSING]'}")
             vad_pipeline = Pipeline.from_pretrained(
                 "pyannote/voice-activity-detection",
                 use_auth_token=hf_token
             # Add voice detection info box
             with gr.Accordion("Voice Activity Detection", open=True):
                 gr.Markdown("""
+                                 ### Voice Detection Authentication Required
+                 This app uses pyannote/voice-activity-detection to identify vocal segments in music.
+                 **Important:** This model requires Hugging Face authentication:
+                 1. Create an account at [huggingface.co](https://huggingface.co)
+                 2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+                 3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
+                 4. Set the "pyannote" environment variable with your token:
+                    - In Linux/Mac: `export pyannote="your_token_here"`
+                    - In Windows: `set pyannote=your_token_here`
+                    - In Hugging Face Spaces: Add a "pyannote" Secret in the Settings tab
+                 Without authentication, the app will use estimated segments based on audio duration.
+                 **Technical Note:** If you're having trouble with authentication, make sure:
+                 1. The pyannote.audio package is properly installed
+                 2. You've accepted the model terms at [huggingface.co/pyannote/voice-activity-detection](https://huggingface.co/pyannote/voice-activity-detection)
+                 3. The provided token has READ access permission
+                 4. You've added hf.co to your allowed domains if using a scoped token
                 """)
         with gr.Column(scale=2):