root
commited on
Commit
·
bddf9c4
1
Parent(s):
670bed3
qwen30b
Browse files
app.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
appp.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
lastapp.py
CHANGED
|
@@ -19,10 +19,8 @@ from utils import (
|
|
| 19 |
load_audio,
|
| 20 |
extract_audio_duration,
|
| 21 |
extract_mfcc_features,
|
| 22 |
-
calculate_lyrics_length,
|
| 23 |
format_genre_results,
|
| 24 |
-
ensure_cuda_availability
|
| 25 |
-
preprocess_audio_for_model
|
| 26 |
)
|
| 27 |
from emotionanalysis import MusicAnalyzer
|
| 28 |
import librosa
|
|
@@ -106,6 +104,75 @@ llm_pipeline = pipeline(
|
|
| 106 |
# Initialize music emotion analyzer
|
| 107 |
music_analyzer = MusicAnalyzer()
|
| 108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
# New function: Count syllables in text
|
| 110 |
def count_syllables(text):
|
| 111 |
"""Count syllables in a given text using the pronouncing library."""
|
|
@@ -113,31 +180,7 @@ def count_syllables(text):
|
|
| 113 |
syllable_count = 0
|
| 114 |
|
| 115 |
for word in words:
|
| 116 |
-
|
| 117 |
-
pronunciations = pronouncing.phones_for_word(word)
|
| 118 |
-
if pronunciations:
|
| 119 |
-
# Count syllables in the first pronunciation
|
| 120 |
-
syllable_count += pronouncing.syllable_count(pronunciations[0])
|
| 121 |
-
else:
|
| 122 |
-
# Fallback: estimate syllables based on vowel groups
|
| 123 |
-
vowels = "aeiouy"
|
| 124 |
-
count = 0
|
| 125 |
-
prev_is_vowel = False
|
| 126 |
-
|
| 127 |
-
for char in word:
|
| 128 |
-
is_vowel = char.lower() in vowels
|
| 129 |
-
if is_vowel and not prev_is_vowel:
|
| 130 |
-
count += 1
|
| 131 |
-
prev_is_vowel = is_vowel
|
| 132 |
-
|
| 133 |
-
if word.endswith('e'):
|
| 134 |
-
count -= 1
|
| 135 |
-
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
|
| 136 |
-
count += 1
|
| 137 |
-
if count == 0:
|
| 138 |
-
count = 1
|
| 139 |
-
|
| 140 |
-
syllable_count += count
|
| 141 |
|
| 142 |
return syllable_count
|
| 143 |
|
|
@@ -304,8 +347,7 @@ def detect_beats(y, sr):
|
|
| 304 |
onset_envelope=combined_onset,
|
| 305 |
sr=sr,
|
| 306 |
tightness=100,
|
| 307 |
-
start_bpm=60
|
| 308 |
-
std_bpm=20 # Allow wider variations
|
| 309 |
)
|
| 310 |
tempo_candidates.append(tempo2)
|
| 311 |
beat_candidates.append(beats2)
|
|
@@ -487,6 +529,281 @@ def detect_beats(y, sr):
|
|
| 487 |
"phrases": phrases
|
| 488 |
}
|
| 489 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
def detect_sections(y, sr):
|
| 491 |
"""
|
| 492 |
Advanced detection of musical sections with adaptive segmentation and improved classification.
|
|
@@ -768,6 +1085,24 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
|
|
| 768 |
import numpy as np
|
| 769 |
from sklearn.cluster import KMeans
|
| 770 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 771 |
# Extract basic beat information
|
| 772 |
beat_times = beats_info.get("beat_times", [])
|
| 773 |
beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
|
|
@@ -1169,10 +1504,10 @@ def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_w
|
|
| 1169 |
|
| 1170 |
return "\n".join(output)
|
| 1171 |
|
| 1172 |
-
def verify_flexible_syllable_counts(lyrics, templates):
|
| 1173 |
"""
|
| 1174 |
Enhanced verification of syllable counts and stress patterns with precise alignment analysis
|
| 1175 |
-
|
| 1176 |
"""
|
| 1177 |
import re
|
| 1178 |
import pronouncing
|
|
@@ -1180,74 +1515,6 @@ def verify_flexible_syllable_counts(lyrics, templates):
|
|
| 1180 |
import functools
|
| 1181 |
from itertools import chain
|
| 1182 |
|
| 1183 |
-
# Apply caching to improve performance for repeated word lookups
|
| 1184 |
-
@functools.lru_cache(maxsize=512)
|
| 1185 |
-
def cached_phones_for_word(word):
|
| 1186 |
-
return pronouncing.phones_for_word(word)
|
| 1187 |
-
|
| 1188 |
-
@functools.lru_cache(maxsize=512)
|
| 1189 |
-
def count_syllables_for_word(word):
|
| 1190 |
-
"""Count syllables in a single word with caching for performance."""
|
| 1191 |
-
# Try using pronouncing library first
|
| 1192 |
-
pronunciations = cached_phones_for_word(word.lower())
|
| 1193 |
-
if pronunciations:
|
| 1194 |
-
return pronouncing.syllable_count(pronunciations[0])
|
| 1195 |
-
|
| 1196 |
-
# Fallback method for words not in the pronouncing dictionary
|
| 1197 |
-
vowels = "aeiouy"
|
| 1198 |
-
word = word.lower()
|
| 1199 |
-
count = 0
|
| 1200 |
-
prev_is_vowel = False
|
| 1201 |
-
|
| 1202 |
-
for char in word:
|
| 1203 |
-
is_vowel = char in vowels
|
| 1204 |
-
if is_vowel and not prev_is_vowel:
|
| 1205 |
-
count += 1
|
| 1206 |
-
prev_is_vowel = is_vowel
|
| 1207 |
-
|
| 1208 |
-
# Handle special cases
|
| 1209 |
-
if word.endswith('e') and not word.endswith('le'):
|
| 1210 |
-
count -= 1
|
| 1211 |
-
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
|
| 1212 |
-
count += 1
|
| 1213 |
-
if count == 0:
|
| 1214 |
-
count = 1
|
| 1215 |
-
|
| 1216 |
-
return count
|
| 1217 |
-
|
| 1218 |
-
@functools.lru_cache(maxsize=512)
|
| 1219 |
-
def get_word_stress(word):
|
| 1220 |
-
"""Get the stress pattern for a word with improved fallback handling."""
|
| 1221 |
-
pronunciations = cached_phones_for_word(word.lower())
|
| 1222 |
-
if pronunciations:
|
| 1223 |
-
return pronouncing.stresses(pronunciations[0])
|
| 1224 |
-
|
| 1225 |
-
# Enhanced fallback for words not in the dictionary
|
| 1226 |
-
syllables = count_syllables_for_word(word)
|
| 1227 |
-
|
| 1228 |
-
# Common English stress patterns by word length
|
| 1229 |
-
if syllables == 1:
|
| 1230 |
-
return "1" # Single syllable words are stressed
|
| 1231 |
-
elif syllables == 2:
|
| 1232 |
-
# Most 2-syllable nouns and adjectives stress first syllable
|
| 1233 |
-
# Common endings that indicate second-syllable stress
|
| 1234 |
-
second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
|
| 1235 |
-
if any(word.endswith(ending) for ending in second_syllable_stress):
|
| 1236 |
-
return "01"
|
| 1237 |
-
else:
|
| 1238 |
-
return "10" # Default for 2-syllable words
|
| 1239 |
-
elif syllables == 3:
|
| 1240 |
-
# Common endings for specific stress patterns in 3-syllable words
|
| 1241 |
-
if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
|
| 1242 |
-
return "100" # First syllable stress
|
| 1243 |
-
elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
|
| 1244 |
-
return "010" # Middle syllable stress
|
| 1245 |
-
else:
|
| 1246 |
-
return "100" # Default for 3-syllable words
|
| 1247 |
-
else:
|
| 1248 |
-
# For longer words, use common English patterns
|
| 1249 |
-
return "1" + "0" * (syllables - 1)
|
| 1250 |
-
|
| 1251 |
# Split lyrics into lines
|
| 1252 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
| 1253 |
|
|
@@ -1463,6 +1730,97 @@ def verify_flexible_syllable_counts(lyrics, templates):
|
|
| 1463 |
# If no matching template was found
|
| 1464 |
verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")
|
| 1465 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1466 |
# Only add detailed analysis if we have rhythm mismatches
|
| 1467 |
if verification_notes:
|
| 1468 |
lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
|
|
@@ -1660,6 +2018,28 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
|
| 1660 |
Returns:
|
| 1661 |
Generated lyrics aligned with the rhythm patterns of the music
|
| 1662 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1663 |
# Extract emotion and theme data from analysis results
|
| 1664 |
primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
|
| 1665 |
primary_theme = emotion_results["theme_analysis"]["primary_theme"]
|
|
@@ -1682,7 +2062,35 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
|
| 1682 |
structure_visualization += f"Song Duration: {duration:.1f} seconds\n"
|
| 1683 |
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n"
|
| 1684 |
|
| 1685 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1686 |
# Try to use flexible structure if available
|
| 1687 |
if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
|
| 1688 |
flexible = song_structure["flexible_structure"]
|
|
@@ -1982,8 +2390,16 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
|
| 1982 |
# Store the syllable guidance for later use
|
| 1983 |
syllable_guidance_text = syllable_guidance
|
| 1984 |
|
| 1985 |
-
# Determine if we should use traditional sections or
|
| 1986 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1987 |
# If we have more than 4 segments, it's likely not a traditional song structure
|
| 1988 |
if "segments" in song_structure["flexible_structure"]:
|
| 1989 |
segments = song_structure["flexible_structure"]["segments"]
|
|
@@ -1991,12 +2407,57 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
|
| 1991 |
use_sections = False
|
| 1992 |
|
| 1993 |
# Create enhanced prompt with better rhythm alignment instructions
|
| 1994 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1995 |
# Traditional approach with sections
|
| 1996 |
content = f"""
|
| 1997 |
You are a talented songwriter who specializes in {genre} music.
|
| 1998 |
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
|
| 1999 |
|
|
|
|
|
|
|
| 2000 |
Music analysis has detected the following qualities in the music:
|
| 2001 |
- Tempo: {tempo:.1f} BPM
|
| 2002 |
- Key: {key} {mode}
|
|
@@ -2014,14 +2475,6 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
|
|
| 2014 |
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
|
| 2015 |
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
|
| 2016 |
|
| 2017 |
-
Think step by step about how to match words to the rhythm pattern:
|
| 2018 |
-
1. First, identify the strong beats in each line pattern
|
| 2019 |
-
2. Choose words where stressed syllables naturally fall on strong beats
|
| 2020 |
-
3. Count syllables carefully to ensure they match the pattern precisely
|
| 2021 |
-
4. Test your line against the pattern by mapping each syllable
|
| 2022 |
-
|
| 2023 |
-
IMPORTANT: Each line of lyrics must match exactly to ONE musical phrase/segment.
|
| 2024 |
-
|
| 2025 |
The lyrics should:
|
| 2026 |
- Perfectly capture the essence and style of {genre} music
|
| 2027 |
- Express the {primary_emotion} emotion and {primary_theme} theme
|
|
@@ -2029,6 +2482,8 @@ The lyrics should:
|
|
| 2029 |
- Be completely original
|
| 2030 |
- Match the song duration of {duration:.1f} seconds
|
| 2031 |
|
|
|
|
|
|
|
| 2032 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
| 2033 |
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
| 2034 |
even if there are no rhythm issues. Include the following in your analysis:
|
|
@@ -2044,6 +2499,8 @@ Your lyrics:
|
|
| 2044 |
You are a talented songwriter who specializes in {genre} music.
|
| 2045 |
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
|
| 2046 |
|
|
|
|
|
|
|
| 2047 |
Music analysis has detected the following qualities:
|
| 2048 |
- Tempo: {tempo:.1f} BPM
|
| 2049 |
- Key: {key} {mode}
|
|
@@ -2061,19 +2518,6 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
|
|
| 2061 |
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
|
| 2062 |
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
|
| 2063 |
|
| 2064 |
-
Think step by step about how to match words to the rhythm pattern:
|
| 2065 |
-
1. First, identify the strong beats in each line pattern
|
| 2066 |
-
2. Choose words where stressed syllables naturally fall on strong beats
|
| 2067 |
-
3. Count syllables carefully to ensure they match the pattern precisely
|
| 2068 |
-
4. Test your line against the pattern by mapping each syllable
|
| 2069 |
-
|
| 2070 |
-
CRITICAL: Each line of lyrics must match exactly to ONE musical phrase/segment.
|
| 2071 |
-
|
| 2072 |
-
For perfect alignment examples:
|
| 2073 |
-
- "FEEL the RHY-thm in your SOUL" – stressed syllables on strong beats
|
| 2074 |
-
- "to-DAY we DANCE a-LONG" – natural speech stress matches musical stress
|
| 2075 |
-
- "WAIT-ing FOR the SUN to RISE" – syllable emphasis aligns with beat emphasis
|
| 2076 |
-
|
| 2077 |
The lyrics should:
|
| 2078 |
- Perfectly capture the essence and style of {genre} music
|
| 2079 |
- Express the {primary_emotion} emotion and {primary_theme} theme
|
|
@@ -2084,6 +2528,8 @@ The lyrics should:
|
|
| 2084 |
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
|
| 2085 |
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
|
| 2086 |
|
|
|
|
|
|
|
| 2087 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
| 2088 |
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
| 2089 |
even if there are no rhythm issues. Include the following in your analysis:
|
|
@@ -2096,6 +2542,7 @@ Your lyrics:
|
|
| 2096 |
|
| 2097 |
# Format as a chat message for the LLM
|
| 2098 |
messages = [
|
|
|
|
| 2099 |
{"role": "user", "content": content}
|
| 2100 |
]
|
| 2101 |
|
|
@@ -2112,13 +2559,21 @@ Your lyrics:
|
|
| 2112 |
# Configure generation parameters based on model capability
|
| 2113 |
generation_params = {
|
| 2114 |
"do_sample": True,
|
| 2115 |
-
"temperature": 0.
|
| 2116 |
-
"top_p": 0.
|
| 2117 |
-
"top_k": 50,
|
| 2118 |
"repetition_penalty": 1.2,
|
| 2119 |
-
"max_new_tokens": 2048
|
|
|
|
| 2120 |
}
|
| 2121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2122 |
# Generate output
|
| 2123 |
generated_ids = llm_model.generate(
|
| 2124 |
**model_inputs,
|
|
@@ -2128,24 +2583,123 @@ Your lyrics:
|
|
| 2128 |
# Extract output tokens
|
| 2129 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
|
| 2130 |
|
| 2131 |
-
#
|
| 2132 |
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
|
| 2133 |
|
| 2134 |
-
#
|
|
|
|
| 2135 |
if "<thinking>" in lyrics and "</thinking>" in lyrics:
|
| 2136 |
lyrics = lyrics.split("</thinking>")[1].strip()
|
| 2137 |
|
| 2138 |
-
#
|
| 2139 |
-
thinking_markers = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2140 |
for marker in thinking_markers:
|
| 2141 |
if marker in lyrics:
|
| 2142 |
parts = lyrics.split(marker)
|
| 2143 |
if len(parts) > 1:
|
| 2144 |
lyrics = parts[-1].strip() # Take the last part after any thinking marker
|
| 2145 |
|
| 2146 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2147 |
if templates_for_verification:
|
| 2148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2149 |
|
| 2150 |
# Check if significant issues were detected
|
| 2151 |
if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics:
|
|
@@ -2206,7 +2760,7 @@ Improved lyrics with fixed rhythm:
|
|
| 2206 |
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip()
|
| 2207 |
|
| 2208 |
# Verify the refined lyrics
|
| 2209 |
-
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics,
|
| 2210 |
|
| 2211 |
# Only use refined lyrics if they're better (fewer notes)
|
| 2212 |
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics:
|
|
@@ -2274,6 +2828,16 @@ Improved lyrics with fixed rhythm:
|
|
| 2274 |
|
| 2275 |
if len(templates_for_verification) > 30:
|
| 2276 |
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2277 |
|
| 2278 |
# Add structure visualization to syllable analysis
|
| 2279 |
syllable_analysis += "\n" + structure_visualization
|
|
@@ -2329,24 +2893,28 @@ def process_audio(audio_file):
|
|
| 2329 |
print(f"Error in genre classification: {str(e)}")
|
| 2330 |
return f"Error in genre classification: {str(e)}", None, ast_results
|
| 2331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2332 |
print("Step 4/5: Analyzing music emotions, themes, and structure...")
|
| 2333 |
# Analyze music emotions and themes
|
| 2334 |
try:
|
| 2335 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 2336 |
except Exception as e:
|
| 2337 |
print(f"Error in emotion analysis: {str(e)}")
|
| 2338 |
-
# Continue
|
| 2339 |
-
emotion_results = {
|
| 2340 |
-
"emotion_analysis": {"primary_emotion": "Unknown"},
|
| 2341 |
-
"theme_analysis": {"primary_theme": "Unknown"},
|
| 2342 |
-
"rhythm_analysis": {"tempo": 0},
|
| 2343 |
-
"tonal_analysis": {"key": "Unknown", "mode": ""},
|
| 2344 |
-
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
|
| 2345 |
-
}
|
| 2346 |
|
| 2347 |
# Calculate detailed song structure for better lyrics alignment
|
| 2348 |
try:
|
| 2349 |
-
#
|
| 2350 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 2351 |
|
| 2352 |
# Analyze beats and phrases for music-aligned lyrics
|
|
@@ -2427,21 +2995,21 @@ def process_audio(audio_file):
|
|
| 2427 |
"end": segment_end
|
| 2428 |
})
|
| 2429 |
|
| 2430 |
-
# Create
|
| 2431 |
flexible_structure = {
|
| 2432 |
"beats": beats_info,
|
| 2433 |
"segments": segments
|
| 2434 |
}
|
| 2435 |
|
| 2436 |
-
#
|
| 2437 |
song_structure = {
|
| 2438 |
"beats": beats_info,
|
| 2439 |
"sections": sections_info,
|
| 2440 |
-
"flexible_structure": flexible_structure
|
|
|
|
| 2441 |
}
|
| 2442 |
|
| 2443 |
# Add syllable counts to each section
|
| 2444 |
-
song_structure["syllables"] = []
|
| 2445 |
for section in sections_info:
|
| 2446 |
# Create syllable templates for sections
|
| 2447 |
section_beats_info = {
|
|
@@ -2477,12 +3045,37 @@ def process_audio(audio_file):
|
|
| 2477 |
|
| 2478 |
song_structure["syllables"].append(section_info)
|
| 2479 |
|
| 2480 |
-
|
| 2481 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2482 |
except Exception as e:
|
| 2483 |
print(f"Error analyzing song structure: {str(e)}")
|
| 2484 |
-
# Continue
|
| 2485 |
-
song_structure = None
|
| 2486 |
|
| 2487 |
print("Step 5/5: Generating rhythmically aligned lyrics...")
|
| 2488 |
# Generate lyrics based on top genre, emotion analysis, and song structure
|
|
@@ -2526,6 +3119,476 @@ def process_audio(audio_file):
|
|
| 2526 |
print(error_msg)
|
| 2527 |
return error_msg, None, []
|
| 2528 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2529 |
# Create enhanced Gradio interface with tabs for better organization
|
| 2530 |
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
| 2531 |
gr.Markdown("# Music Genre Classifier & Lyrics Generator")
|
|
@@ -2566,126 +3629,14 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
| 2566 |
with gr.TabItem("Generated Lyrics"):
|
| 2567 |
lyrics_output = gr.Textbox(label="Lyrics", lines=18)
|
| 2568 |
|
| 2569 |
-
with gr.TabItem("
|
| 2570 |
-
|
| 2571 |
-
|
| 2572 |
-
with gr.TabItem("Syllable Analysis"):
|
| 2573 |
-
syllable_analysis_output = gr.Textbox(label="Detailed Syllable Analysis", lines=16)
|
| 2574 |
-
prompt_template_output = gr.Textbox(label="Prompt Template", lines=16)
|
| 2575 |
-
|
| 2576 |
-
# Processing function with better handling of results
|
| 2577 |
-
def display_results(audio_file):
|
| 2578 |
-
if audio_file is None:
|
| 2579 |
-
return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", "No lyrics generated.", "No rhythm analysis available.", "No syllable analysis available.", "No prompt template available."
|
| 2580 |
-
|
| 2581 |
-
try:
|
| 2582 |
-
# Process audio and get results
|
| 2583 |
-
results = process_audio(audio_file)
|
| 2584 |
-
|
| 2585 |
-
# Check if we got an error message instead of results
|
| 2586 |
-
if isinstance(results, str) and "Error" in results:
|
| 2587 |
-
return results, "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
|
| 2588 |
-
elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
|
| 2589 |
-
return results[0], "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
|
| 2590 |
-
|
| 2591 |
-
# For backwards compatibility, handle both dictionary and tuple returns
|
| 2592 |
-
if isinstance(results, dict):
|
| 2593 |
-
genre_results = results.get("genre_results", "Genre classification failed")
|
| 2594 |
-
lyrics = results.get("lyrics", "Lyrics generation failed")
|
| 2595 |
-
ast_results = results.get("ast_results", [])
|
| 2596 |
-
|
| 2597 |
-
# Use clean lyrics if available
|
| 2598 |
-
clean_lyrics = results.get("clean_lyrics", lyrics)
|
| 2599 |
-
rhythm_analysis = results.get("rhythm_analysis", "No detailed rhythm analysis available")
|
| 2600 |
-
|
| 2601 |
-
# Extract syllable analysis and prompt template
|
| 2602 |
-
syllable_analysis = results.get("syllable_analysis", "No syllable analysis available")
|
| 2603 |
-
prompt_template = results.get("prompt_template", "No prompt template available")
|
| 2604 |
-
else:
|
| 2605 |
-
# Handle the old tuple return format
|
| 2606 |
-
genre_results, lyrics, ast_results = results
|
| 2607 |
-
clean_lyrics = lyrics
|
| 2608 |
-
|
| 2609 |
-
# Extract rhythm analysis if present
|
| 2610 |
-
rhythm_analysis = "No detailed rhythm analysis available"
|
| 2611 |
-
if isinstance(lyrics, str):
|
| 2612 |
-
# First check for new format
|
| 2613 |
-
if "[Note: Rhythm Analysis]" in lyrics:
|
| 2614 |
-
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
|
| 2615 |
-
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1]
|
| 2616 |
-
# Check for old format
|
| 2617 |
-
elif "[Note: Potential rhythm mismatches" in lyrics:
|
| 2618 |
-
clean_lyrics = lyrics.split("[Note:")[0].strip()
|
| 2619 |
-
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1]
|
| 2620 |
-
|
| 2621 |
-
# Default values for new fields
|
| 2622 |
-
syllable_analysis = "No syllable analysis available"
|
| 2623 |
-
prompt_template = "No prompt template available"
|
| 2624 |
-
|
| 2625 |
-
# Format emotion analysis results
|
| 2626 |
-
try:
|
| 2627 |
-
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 2628 |
-
emotion_text = f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
|
| 2629 |
-
emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
|
| 2630 |
-
emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
|
| 2631 |
-
emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}"
|
| 2632 |
-
|
| 2633 |
-
# Add detailed song structure information if available
|
| 2634 |
-
try:
|
| 2635 |
-
audio_data = extract_audio_features(audio_file)
|
| 2636 |
-
song_structure = calculate_detailed_song_structure(audio_data)
|
| 2637 |
-
|
| 2638 |
-
emotion_text += "\n\nSong Structure:\n"
|
| 2639 |
-
for section in song_structure["syllables"]:
|
| 2640 |
-
emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
|
| 2641 |
-
emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, "
|
| 2642 |
-
|
| 2643 |
-
if "syllable_template" in section:
|
| 2644 |
-
emotion_text += f"template: {section['syllable_template']})\n"
|
| 2645 |
-
else:
|
| 2646 |
-
emotion_text += f"~{section['syllable_count']} syllables)\n"
|
| 2647 |
-
|
| 2648 |
-
# Add flexible structure info if available
|
| 2649 |
-
if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
|
| 2650 |
-
flexible = song_structure["flexible_structure"]
|
| 2651 |
-
if "segments" in flexible and flexible["segments"]:
|
| 2652 |
-
emotion_text += "\nDetailed Rhythm Analysis:\n"
|
| 2653 |
-
for i, segment in enumerate(flexible["segments"][:5]): # Show first 5 segments
|
| 2654 |
-
emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, "
|
| 2655 |
-
emotion_text += f"pattern: {segment.get('syllable_template', 'N/A')}\n"
|
| 2656 |
-
|
| 2657 |
-
if len(flexible["segments"]) > 5:
|
| 2658 |
-
emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n"
|
| 2659 |
-
|
| 2660 |
-
except Exception as e:
|
| 2661 |
-
print(f"Error displaying song structure: {str(e)}")
|
| 2662 |
-
# Continue without showing structure details
|
| 2663 |
-
|
| 2664 |
-
except Exception as e:
|
| 2665 |
-
print(f"Error in emotion analysis: {str(e)}")
|
| 2666 |
-
emotion_text = f"Error in emotion analysis: {str(e)}"
|
| 2667 |
-
|
| 2668 |
-
# Format AST classification results
|
| 2669 |
-
if ast_results and isinstance(ast_results, list):
|
| 2670 |
-
ast_text = "Audio Classification Results:\n"
|
| 2671 |
-
for result in ast_results[:5]: # Show top 5 results
|
| 2672 |
-
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
|
| 2673 |
-
else:
|
| 2674 |
-
ast_text = "No valid audio classification results available."
|
| 2675 |
-
|
| 2676 |
-
# Return all results including new fields
|
| 2677 |
-
return genre_results, emotion_text, ast_text, clean_lyrics, rhythm_analysis, syllable_analysis, prompt_template
|
| 2678 |
-
|
| 2679 |
-
except Exception as e:
|
| 2680 |
-
error_msg = f"Error: {str(e)}"
|
| 2681 |
-
print(error_msg)
|
| 2682 |
-
return error_msg, "Error in emotion analysis", "Error in audio classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
|
| 2683 |
|
| 2684 |
# Connect the button to the display function with updated outputs
|
| 2685 |
submit_btn.click(
|
| 2686 |
fn=display_results,
|
| 2687 |
inputs=[audio_input],
|
| 2688 |
-
outputs=[genre_output, emotion_output, ast_output, lyrics_output,
|
| 2689 |
)
|
| 2690 |
|
| 2691 |
# Enhanced explanation of how the system works
|
|
@@ -2703,24 +3654,29 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
| 2703 |
- Strong and weak beats
|
| 2704 |
- Natural phrase boundaries
|
| 2705 |
- Time signature and tempo variations
|
|
|
|
|
|
|
|
|
|
| 2706 |
|
| 2707 |
-
|
| 2708 |
- Beat stress patterns (strong, medium, weak)
|
| 2709 |
- Appropriate syllable counts based on tempo
|
| 2710 |
- Genre-specific rhythmic qualities
|
|
|
|
| 2711 |
|
| 2712 |
-
|
| 2713 |
- Match the emotional quality of the music
|
| 2714 |
-
- Follow the precise syllable templates
|
| 2715 |
- Align stressed syllables with strong beats
|
| 2716 |
- Maintain genre-appropriate style and themes
|
| 2717 |
|
| 2718 |
-
|
| 2719 |
- Syllable count accuracy
|
| 2720 |
- Stress alignment with strong beats
|
| 2721 |
- Word stress patterns
|
|
|
|
| 2722 |
|
| 2723 |
-
|
| 2724 |
|
| 2725 |
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
|
| 2726 |
""")
|
|
|
|
| 19 |
load_audio,
|
| 20 |
extract_audio_duration,
|
| 21 |
extract_mfcc_features,
|
|
|
|
| 22 |
format_genre_results,
|
| 23 |
+
ensure_cuda_availability
|
|
|
|
| 24 |
)
|
| 25 |
from emotionanalysis import MusicAnalyzer
|
| 26 |
import librosa
|
|
|
|
| 104 |
# Initialize music emotion analyzer
|
| 105 |
music_analyzer = MusicAnalyzer()
|
| 106 |
|
| 107 |
+
# New global function moved outside of verify_flexible_syllable_counts
|
| 108 |
+
@functools.lru_cache(maxsize=512)
|
| 109 |
+
def cached_phones_for_word(word):
|
| 110 |
+
"""Get word pronunciations with caching for better performance."""
|
| 111 |
+
return pronouncing.phones_for_word(word)
|
| 112 |
+
|
| 113 |
+
@functools.lru_cache(maxsize=512)
|
| 114 |
+
def count_syllables_for_word(word):
|
| 115 |
+
"""Count syllables in a single word with caching for performance."""
|
| 116 |
+
# Try using pronouncing library first
|
| 117 |
+
pronunciations = cached_phones_for_word(word.lower())
|
| 118 |
+
if pronunciations:
|
| 119 |
+
return pronouncing.syllable_count(pronunciations[0])
|
| 120 |
+
|
| 121 |
+
# Fallback method for words not in the pronouncing dictionary
|
| 122 |
+
vowels = "aeiouy"
|
| 123 |
+
word = word.lower()
|
| 124 |
+
count = 0
|
| 125 |
+
prev_is_vowel = False
|
| 126 |
+
|
| 127 |
+
for char in word:
|
| 128 |
+
is_vowel = char in vowels
|
| 129 |
+
if is_vowel and not prev_is_vowel:
|
| 130 |
+
count += 1
|
| 131 |
+
prev_is_vowel = is_vowel
|
| 132 |
+
|
| 133 |
+
# Handle special cases
|
| 134 |
+
if word.endswith('e') and not word.endswith('le'):
|
| 135 |
+
count -= 1
|
| 136 |
+
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
|
| 137 |
+
count += 1
|
| 138 |
+
if count == 0:
|
| 139 |
+
count = 1
|
| 140 |
+
|
| 141 |
+
return count
|
| 142 |
+
|
| 143 |
+
@functools.lru_cache(maxsize=512)
|
| 144 |
+
def get_word_stress(word):
|
| 145 |
+
"""Get the stress pattern for a word with improved fallback handling."""
|
| 146 |
+
pronunciations = cached_phones_for_word(word.lower())
|
| 147 |
+
if pronunciations:
|
| 148 |
+
return pronouncing.stresses(pronunciations[0])
|
| 149 |
+
|
| 150 |
+
# Enhanced fallback for words not in the dictionary
|
| 151 |
+
syllables = count_syllables_for_word(word)
|
| 152 |
+
|
| 153 |
+
# Common English stress patterns by word length
|
| 154 |
+
if syllables == 1:
|
| 155 |
+
return "1" # Single syllable words are stressed
|
| 156 |
+
elif syllables == 2:
|
| 157 |
+
# Most 2-syllable nouns and adjectives stress first syllable
|
| 158 |
+
# Common endings that indicate second-syllable stress
|
| 159 |
+
second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
|
| 160 |
+
if any(word.endswith(ending) for ending in second_syllable_stress):
|
| 161 |
+
return "01"
|
| 162 |
+
else:
|
| 163 |
+
return "10" # Default for 2-syllable words
|
| 164 |
+
elif syllables == 3:
|
| 165 |
+
# Common endings for specific stress patterns in 3-syllable words
|
| 166 |
+
if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
|
| 167 |
+
return "100" # First syllable stress
|
| 168 |
+
elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
|
| 169 |
+
return "010" # Middle syllable stress
|
| 170 |
+
else:
|
| 171 |
+
return "100" # Default for 3-syllable words
|
| 172 |
+
else:
|
| 173 |
+
# For longer words, use common English patterns
|
| 174 |
+
return "1" + "0" * (syllables - 1)
|
| 175 |
+
|
| 176 |
# New function: Count syllables in text
|
| 177 |
def count_syllables(text):
|
| 178 |
"""Count syllables in a given text using the pronouncing library."""
|
|
|
|
| 180 |
syllable_count = 0
|
| 181 |
|
| 182 |
for word in words:
|
| 183 |
+
syllable_count += count_syllables_for_word(word)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
return syllable_count
|
| 186 |
|
|
|
|
| 347 |
onset_envelope=combined_onset,
|
| 348 |
sr=sr,
|
| 349 |
tightness=100,
|
| 350 |
+
start_bpm=60 # Lower starting BPM helps find different time signatures
|
|
|
|
| 351 |
)
|
| 352 |
tempo_candidates.append(tempo2)
|
| 353 |
beat_candidates.append(beats2)
|
|
|
|
| 529 |
"phrases": phrases
|
| 530 |
}
|
| 531 |
|
| 532 |
+
def detect_beats_and_subbeats(y, sr, subdivision=4):
|
| 533 |
+
"""
|
| 534 |
+
Detect main beats and interpolate subbeats between consecutive beats.
|
| 535 |
+
|
| 536 |
+
Parameters:
|
| 537 |
+
y: Audio time series
|
| 538 |
+
sr: Sample rate
|
| 539 |
+
subdivision: Number of subdivisions between beats (default: 4 for quarter beats)
|
| 540 |
+
|
| 541 |
+
Returns:
|
| 542 |
+
Dictionary containing beat times, subbeat times, and tempo information
|
| 543 |
+
"""
|
| 544 |
+
# Detect main beats using librosa
|
| 545 |
+
try:
|
| 546 |
+
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
|
| 547 |
+
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
|
| 548 |
+
|
| 549 |
+
# Convert numpy values to native Python types
|
| 550 |
+
if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number):
|
| 551 |
+
tempo = float(tempo)
|
| 552 |
+
|
| 553 |
+
# Convert beat_times to a list of floats
|
| 554 |
+
if isinstance(beat_times, np.ndarray):
|
| 555 |
+
beat_times = [float(t) for t in beat_times]
|
| 556 |
+
except Exception as e:
|
| 557 |
+
print(f"Error in beat detection: {e}")
|
| 558 |
+
# Default fallbacks
|
| 559 |
+
tempo = 120.0
|
| 560 |
+
beat_times = []
|
| 561 |
+
|
| 562 |
+
# Create subbeats by interpolating between main beats
|
| 563 |
+
subbeat_times = []
|
| 564 |
+
|
| 565 |
+
# Early return if no beats detected
|
| 566 |
+
if not beat_times or len(beat_times) < 2:
|
| 567 |
+
return {
|
| 568 |
+
"tempo": float(tempo) if tempo is not None else 120.0,
|
| 569 |
+
"beat_times": beat_times,
|
| 570 |
+
"subbeat_times": []
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
for i in range(len(beat_times) - 1):
|
| 574 |
+
# Get current and next beat time
|
| 575 |
+
try:
|
| 576 |
+
current_beat = float(beat_times[i])
|
| 577 |
+
next_beat = float(beat_times[i + 1])
|
| 578 |
+
except (IndexError, ValueError, TypeError):
|
| 579 |
+
continue
|
| 580 |
+
|
| 581 |
+
# Calculate time interval between beats
|
| 582 |
+
interval = (next_beat - current_beat) / subdivision
|
| 583 |
+
|
| 584 |
+
# Add the main beat
|
| 585 |
+
subbeat_times.append({
|
| 586 |
+
"time": float(current_beat),
|
| 587 |
+
"type": "main",
|
| 588 |
+
"strength": 1.0,
|
| 589 |
+
"beat_index": i
|
| 590 |
+
})
|
| 591 |
+
|
| 592 |
+
# Add subbeats
|
| 593 |
+
for j in range(1, subdivision):
|
| 594 |
+
subbeat_time = current_beat + j * interval
|
| 595 |
+
# Calculate strength based on position
|
| 596 |
+
# For 4/4 time, beat 3 is stronger than beats 2 and 4
|
| 597 |
+
if j == subdivision // 2 and subdivision == 4:
|
| 598 |
+
strength = 0.8 # Stronger subbeat (e.g., beat 3 in 4/4)
|
| 599 |
+
else:
|
| 600 |
+
strength = 0.5 # Weaker subbeat
|
| 601 |
+
|
| 602 |
+
subbeat_times.append({
|
| 603 |
+
"time": float(subbeat_time),
|
| 604 |
+
"type": "sub",
|
| 605 |
+
"strength": float(strength),
|
| 606 |
+
"beat_index": i,
|
| 607 |
+
"subbeat_index": j
|
| 608 |
+
})
|
| 609 |
+
|
| 610 |
+
# Add the last main beat
|
| 611 |
+
if beat_times:
|
| 612 |
+
try:
|
| 613 |
+
subbeat_times.append({
|
| 614 |
+
"time": float(beat_times[-1]),
|
| 615 |
+
"type": "main",
|
| 616 |
+
"strength": 1.0,
|
| 617 |
+
"beat_index": len(beat_times) - 1
|
| 618 |
+
})
|
| 619 |
+
except (ValueError, TypeError):
|
| 620 |
+
# Skip if conversion fails
|
| 621 |
+
pass
|
| 622 |
+
|
| 623 |
+
return {
|
| 624 |
+
"tempo": float(tempo) if tempo is not None else 120.0,
|
| 625 |
+
"beat_times": beat_times,
|
| 626 |
+
"subbeat_times": subbeat_times
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
def map_beats_to_seconds(subbeat_times, duration, fps=1.0):
|
| 630 |
+
"""
|
| 631 |
+
Map beats and subbeats to second-level intervals.
|
| 632 |
+
|
| 633 |
+
Parameters:
|
| 634 |
+
subbeat_times: List of dictionaries containing beat and subbeat information
|
| 635 |
+
duration: Total duration of the audio in seconds
|
| 636 |
+
fps: Frames per second (default: 1.0 for one-second intervals)
|
| 637 |
+
|
| 638 |
+
Returns:
|
| 639 |
+
List of dictionaries, each containing beats within a time window
|
| 640 |
+
"""
|
| 641 |
+
# Safety check for input parameters
|
| 642 |
+
if not isinstance(subbeat_times, list):
|
| 643 |
+
print("Warning: subbeat_times is not a list")
|
| 644 |
+
subbeat_times = []
|
| 645 |
+
|
| 646 |
+
try:
|
| 647 |
+
duration = float(duration)
|
| 648 |
+
except (ValueError, TypeError):
|
| 649 |
+
print("Warning: duration is not convertible to float, defaulting to 30")
|
| 650 |
+
duration = 30.0
|
| 651 |
+
|
| 652 |
+
# Calculate number of time windows
|
| 653 |
+
num_windows = int(duration * fps) + 1
|
| 654 |
+
|
| 655 |
+
# Initialize time windows
|
| 656 |
+
time_windows = []
|
| 657 |
+
|
| 658 |
+
for i in range(num_windows):
|
| 659 |
+
# Calculate window boundaries
|
| 660 |
+
start_time = i / fps
|
| 661 |
+
end_time = (i + 1) / fps
|
| 662 |
+
|
| 663 |
+
# Find beats and subbeats within this window
|
| 664 |
+
window_beats = []
|
| 665 |
+
|
| 666 |
+
for beat in subbeat_times:
|
| 667 |
+
# Safety check for beat object
|
| 668 |
+
if not isinstance(beat, dict):
|
| 669 |
+
continue
|
| 670 |
+
|
| 671 |
+
# Safely access beat time
|
| 672 |
+
try:
|
| 673 |
+
beat_time = float(beat.get("time", 0))
|
| 674 |
+
except (ValueError, TypeError):
|
| 675 |
+
continue
|
| 676 |
+
|
| 677 |
+
if start_time <= beat_time < end_time:
|
| 678 |
+
# Safely extract beat properties with defaults
|
| 679 |
+
beat_type = beat.get("type", "sub")
|
| 680 |
+
if not isinstance(beat_type, str):
|
| 681 |
+
beat_type = "sub"
|
| 682 |
+
|
| 683 |
+
# Safely handle strength
|
| 684 |
+
try:
|
| 685 |
+
strength = float(beat.get("strength", 0.5))
|
| 686 |
+
except (ValueError, TypeError):
|
| 687 |
+
strength = 0.5
|
| 688 |
+
|
| 689 |
+
# Add beat to this window
|
| 690 |
+
window_beats.append({
|
| 691 |
+
"time": beat_time,
|
| 692 |
+
"type": beat_type,
|
| 693 |
+
"strength": strength,
|
| 694 |
+
"relative_pos": (beat_time - start_time) / (1/fps) # Position within window (0-1)
|
| 695 |
+
})
|
| 696 |
+
|
| 697 |
+
# Add window to list
|
| 698 |
+
time_windows.append({
|
| 699 |
+
"second": i,
|
| 700 |
+
"start": start_time,
|
| 701 |
+
"end": end_time,
|
| 702 |
+
"beats": window_beats
|
| 703 |
+
})
|
| 704 |
+
|
| 705 |
+
return time_windows
|
| 706 |
+
|
| 707 |
+
def create_second_level_templates(sec_map, tempo, genre=None):
|
| 708 |
+
"""
|
| 709 |
+
Create syllable templates for each second-level window.
|
| 710 |
+
|
| 711 |
+
Parameters:
|
| 712 |
+
sec_map: List of second-level time windows with beat information
|
| 713 |
+
tempo: Tempo in BPM
|
| 714 |
+
genre: Optional genre for genre-specific adjustments
|
| 715 |
+
|
| 716 |
+
Returns:
|
| 717 |
+
List of template strings, one for each second
|
| 718 |
+
"""
|
| 719 |
+
# Helper function to map tempo to base syllable count
|
| 720 |
+
def tempo_to_syllable_base(tempo):
|
| 721 |
+
"""Continuous function mapping tempo to syllable base count"""
|
| 722 |
+
# Sigmoid-like function that smoothly transitions between syllable counts
|
| 723 |
+
if tempo > 180:
|
| 724 |
+
return 1.0
|
| 725 |
+
elif tempo > 140:
|
| 726 |
+
return 1.0 + (180 - tempo) * 0.02 # Gradual increase 1.0 → 1.8
|
| 727 |
+
elif tempo > 100:
|
| 728 |
+
return 1.8 + (140 - tempo) * 0.01 # Gradual increase 1.8 → 2.2
|
| 729 |
+
elif tempo > 70:
|
| 730 |
+
return 2.2 + (100 - tempo) * 0.02 # Gradual increase 2.2 → 2.8
|
| 731 |
+
else:
|
| 732 |
+
return 2.8 + max(0, (70 - tempo) * 0.04) # Continue increasing for very slow tempos
|
| 733 |
+
|
| 734 |
+
# Calculate base syllable count from tempo
|
| 735 |
+
base_syllables = tempo_to_syllable_base(tempo)
|
| 736 |
+
|
| 737 |
+
# Apply genre-specific adjustments
|
| 738 |
+
genre_factor = 1.0
|
| 739 |
+
if genre:
|
| 740 |
+
genre_lower = genre.lower()
|
| 741 |
+
if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]):
|
| 742 |
+
genre_factor = 1.4 # Much higher syllable density for rap
|
| 743 |
+
elif any(term in genre_lower for term in ["folk", "country", "ballad"]):
|
| 744 |
+
genre_factor = 0.8 # Lower density for folk styles
|
| 745 |
+
|
| 746 |
+
# Create templates for each second
|
| 747 |
+
templates = []
|
| 748 |
+
|
| 749 |
+
for window in sec_map:
|
| 750 |
+
beats = window["beats"]
|
| 751 |
+
|
| 752 |
+
# If no beats in this second, create a default template
|
| 753 |
+
if not beats:
|
| 754 |
+
templates.append("w(0.5):1")
|
| 755 |
+
continue
|
| 756 |
+
|
| 757 |
+
# Create beat patterns for this second
|
| 758 |
+
beat_patterns = []
|
| 759 |
+
|
| 760 |
+
for beat in beats:
|
| 761 |
+
# Ensure we're dealing with a dictionary and that it has a "strength" key
|
| 762 |
+
if not isinstance(beat, dict):
|
| 763 |
+
continue # Skip this beat if it's not a dictionary
|
| 764 |
+
|
| 765 |
+
# Safely get beat type and strength
|
| 766 |
+
if "type" not in beat or not isinstance(beat["type"], str):
|
| 767 |
+
beat_type = "w" # Default to weak if type is missing or not a string
|
| 768 |
+
else:
|
| 769 |
+
beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w"
|
| 770 |
+
|
| 771 |
+
# Safely get strength value with fallback
|
| 772 |
+
try:
|
| 773 |
+
strength = float(beat.get("strength", 0.5))
|
| 774 |
+
except (ValueError, TypeError):
|
| 775 |
+
strength = 0.5 # Default if conversion fails
|
| 776 |
+
|
| 777 |
+
# Adjust syllable count based on beat type and strength
|
| 778 |
+
if beat_type == "S":
|
| 779 |
+
syllable_factor = 1.2 # More syllables for strong beats
|
| 780 |
+
elif beat_type == "m":
|
| 781 |
+
syllable_factor = 1.0 # Normal for medium beats
|
| 782 |
+
else:
|
| 783 |
+
syllable_factor = 0.8 # Fewer for weak beats
|
| 784 |
+
|
| 785 |
+
# Calculate final syllable count
|
| 786 |
+
syllable_count = base_syllables * syllable_factor * genre_factor
|
| 787 |
+
|
| 788 |
+
# Round to half-syllable precision
|
| 789 |
+
syllable_count = round(syllable_count * 2) / 2
|
| 790 |
+
|
| 791 |
+
# Ensure reasonable limits
|
| 792 |
+
syllable_count = max(0.5, min(4, syllable_count))
|
| 793 |
+
|
| 794 |
+
# Format with embedded strength value
|
| 795 |
+
strength_pct = round(strength * 100) / 100
|
| 796 |
+
beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}")
|
| 797 |
+
|
| 798 |
+
# Join patterns with dashes - ensure we have at least one pattern
|
| 799 |
+
if not beat_patterns:
|
| 800 |
+
templates.append("w(0.5):1") # Default if no valid patterns were created
|
| 801 |
+
else:
|
| 802 |
+
second_template = "-".join(beat_patterns)
|
| 803 |
+
templates.append(second_template)
|
| 804 |
+
|
| 805 |
+
return templates
|
| 806 |
+
|
| 807 |
def detect_sections(y, sr):
|
| 808 |
"""
|
| 809 |
Advanced detection of musical sections with adaptive segmentation and improved classification.
|
|
|
|
| 1085 |
import numpy as np
|
| 1086 |
from sklearn.cluster import KMeans
|
| 1087 |
|
| 1088 |
+
# Convert any numpy values to native Python types for safety - directly handle conversions
|
| 1089 |
+
# Process the dictionary to convert numpy values to Python native types
|
| 1090 |
+
if isinstance(beats_info, dict):
|
| 1091 |
+
processed_beats_info = {}
|
| 1092 |
+
for k, v in beats_info.items():
|
| 1093 |
+
if isinstance(v, np.ndarray):
|
| 1094 |
+
if v.size == 1:
|
| 1095 |
+
processed_beats_info[k] = float(v.item())
|
| 1096 |
+
else:
|
| 1097 |
+
processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
|
| 1098 |
+
elif isinstance(v, np.number):
|
| 1099 |
+
processed_beats_info[k] = float(v)
|
| 1100 |
+
elif isinstance(v, list):
|
| 1101 |
+
processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
|
| 1102 |
+
else:
|
| 1103 |
+
processed_beats_info[k] = v
|
| 1104 |
+
beats_info = processed_beats_info
|
| 1105 |
+
|
| 1106 |
# Extract basic beat information
|
| 1107 |
beat_times = beats_info.get("beat_times", [])
|
| 1108 |
beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
|
|
|
|
| 1504 |
|
| 1505 |
return "\n".join(output)
|
| 1506 |
|
| 1507 |
+
def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None):
|
| 1508 |
"""
|
| 1509 |
Enhanced verification of syllable counts and stress patterns with precise alignment analysis
|
| 1510 |
+
for both phrase-level and second-level templates.
|
| 1511 |
"""
|
| 1512 |
import re
|
| 1513 |
import pronouncing
|
|
|
|
| 1515 |
import functools
|
| 1516 |
from itertools import chain
|
| 1517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1518 |
# Split lyrics into lines
|
| 1519 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
| 1520 |
|
|
|
|
| 1730 |
# If no matching template was found
|
| 1731 |
verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")
|
| 1732 |
|
| 1733 |
+
# Add second-level verification if templates are provided
|
| 1734 |
+
if second_level_templates:
|
| 1735 |
+
verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n")
|
| 1736 |
+
|
| 1737 |
+
# Check each second against corresponding line
|
| 1738 |
+
for i, template in enumerate(second_level_templates):
|
| 1739 |
+
if i >= len(lines):
|
| 1740 |
+
break
|
| 1741 |
+
|
| 1742 |
+
line = lines[i]
|
| 1743 |
+
|
| 1744 |
+
# Skip section headers
|
| 1745 |
+
if line.startswith('[') and ']' in line:
|
| 1746 |
+
continue
|
| 1747 |
+
|
| 1748 |
+
actual_count = count_syllables(line)
|
| 1749 |
+
|
| 1750 |
+
# Parse template to get expected syllable count
|
| 1751 |
+
total_expected = 0
|
| 1752 |
+
beat_patterns = []
|
| 1753 |
+
|
| 1754 |
+
# Handle templates with beat patterns like "S(0.95):2-w(0.4):1"
|
| 1755 |
+
if isinstance(template, str) and "-" in template:
|
| 1756 |
+
for beat in template.split("-"):
|
| 1757 |
+
if ":" in beat:
|
| 1758 |
+
try:
|
| 1759 |
+
count_part = beat.split(":")[1]
|
| 1760 |
+
count = float(count_part)
|
| 1761 |
+
total_expected += count
|
| 1762 |
+
|
| 1763 |
+
# Extract beat type for alignment check
|
| 1764 |
+
beat_type = beat.split("(")[0] if "(" in beat else beat[0]
|
| 1765 |
+
beat_patterns.append((beat_type, count))
|
| 1766 |
+
except (IndexError, ValueError):
|
| 1767 |
+
pass
|
| 1768 |
+
|
| 1769 |
+
# Compare actual vs expected count
|
| 1770 |
+
if total_expected > 0:
|
| 1771 |
+
# Calculate adaptive threshold based on expected syllables
|
| 1772 |
+
expected_ratio = 0.2 # More strict at second level
|
| 1773 |
+
threshold = max(0.5, round(total_expected * expected_ratio))
|
| 1774 |
+
|
| 1775 |
+
difference = abs(actual_count - total_expected)
|
| 1776 |
+
|
| 1777 |
+
if difference > threshold:
|
| 1778 |
+
verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}")
|
| 1779 |
+
total_mismatch_count += 1
|
| 1780 |
+
|
| 1781 |
+
# Check for stress misalignment in this second
|
| 1782 |
+
words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
|
| 1783 |
+
word_analysis = []
|
| 1784 |
+
cumulative_syllables = 0
|
| 1785 |
+
|
| 1786 |
+
for word in words:
|
| 1787 |
+
syllable_count = count_syllables_for_word(word)
|
| 1788 |
+
stress_pattern = get_word_stress(word)
|
| 1789 |
+
|
| 1790 |
+
word_analysis.append({
|
| 1791 |
+
"word": word,
|
| 1792 |
+
"syllables": syllable_count,
|
| 1793 |
+
"stress_pattern": stress_pattern,
|
| 1794 |
+
"position": cumulative_syllables
|
| 1795 |
+
})
|
| 1796 |
+
|
| 1797 |
+
cumulative_syllables += syllable_count
|
| 1798 |
+
|
| 1799 |
+
# Check if stressed syllables align with strong beats
|
| 1800 |
+
if beat_patterns:
|
| 1801 |
+
strong_positions = []
|
| 1802 |
+
current_pos = 0
|
| 1803 |
+
|
| 1804 |
+
for beat_type, count in beat_patterns:
|
| 1805 |
+
if beat_type == "S":
|
| 1806 |
+
strong_positions.append(current_pos)
|
| 1807 |
+
current_pos += count
|
| 1808 |
+
|
| 1809 |
+
# Look for misalignments
|
| 1810 |
+
for pos in strong_positions:
|
| 1811 |
+
for word_info in word_analysis:
|
| 1812 |
+
word_start = word_info["position"]
|
| 1813 |
+
word_end = word_start + word_info["syllables"]
|
| 1814 |
+
|
| 1815 |
+
if word_start <= pos < word_end:
|
| 1816 |
+
# Check if a stressed syllable falls on this position
|
| 1817 |
+
syllable_in_word = int(pos - word_start)
|
| 1818 |
+
stress = word_info["stress_pattern"]
|
| 1819 |
+
|
| 1820 |
+
if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
|
| 1821 |
+
verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat")
|
| 1822 |
+
break
|
| 1823 |
+
|
| 1824 |
# Only add detailed analysis if we have rhythm mismatches
|
| 1825 |
if verification_notes:
|
| 1826 |
lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
|
|
|
|
| 2018 |
Returns:
|
| 2019 |
Generated lyrics aligned with the rhythm patterns of the music
|
| 2020 |
"""
|
| 2021 |
+
# Ensure emotion_results is a dictionary with the expected structure
|
| 2022 |
+
if not isinstance(emotion_results, dict):
|
| 2023 |
+
emotion_results = {
|
| 2024 |
+
"emotion_analysis": {"primary_emotion": "Unknown"},
|
| 2025 |
+
"theme_analysis": {"primary_theme": "Unknown"},
|
| 2026 |
+
"rhythm_analysis": {"tempo": 0},
|
| 2027 |
+
"tonal_analysis": {"key": "Unknown", "mode": ""},
|
| 2028 |
+
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
|
| 2029 |
+
}
|
| 2030 |
+
|
| 2031 |
+
# Extract emotion and theme data with safe defaults
|
| 2032 |
+
primary_emotion = emotion_results.get("emotion_analysis", {}).get("primary_emotion", "Unknown")
|
| 2033 |
+
primary_theme = emotion_results.get("theme_analysis", {}).get("primary_theme", "Unknown")
|
| 2034 |
+
|
| 2035 |
+
# Extract numeric values safely with fallbacks
|
| 2036 |
+
try:
|
| 2037 |
+
tempo = float(emotion_results.get("rhythm_analysis", {}).get("tempo", 0.0))
|
| 2038 |
+
except (ValueError, TypeError):
|
| 2039 |
+
tempo = 0.0
|
| 2040 |
+
|
| 2041 |
+
key = emotion_results.get("tonal_analysis", {}).get("key", "Unknown")
|
| 2042 |
+
mode = emotion_results.get("tonal_analysis", {}).get("mode", "")
|
| 2043 |
# Extract emotion and theme data from analysis results
|
| 2044 |
primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
|
| 2045 |
primary_theme = emotion_results["theme_analysis"]["primary_theme"]
|
|
|
|
| 2062 |
structure_visualization += f"Song Duration: {duration:.1f} seconds\n"
|
| 2063 |
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n"
|
| 2064 |
|
| 2065 |
+
# Add second-level template guidance if available
|
| 2066 |
+
if song_structure and "second_level" in song_structure and song_structure["second_level"]:
|
| 2067 |
+
second_level_templates = song_structure["second_level"]["templates"]
|
| 2068 |
+
|
| 2069 |
+
# Create second-level guidance
|
| 2070 |
+
second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n"
|
| 2071 |
+
second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n"
|
| 2072 |
+
|
| 2073 |
+
# Format each second's template
|
| 2074 |
+
formatted_second_templates = []
|
| 2075 |
+
for i, template in enumerate(second_level_templates):
|
| 2076 |
+
if i < min(60, len(second_level_templates)): # Limit to 60 seconds to avoid overwhelming the LLM
|
| 2077 |
+
formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0)
|
| 2078 |
+
formatted_second_templates.append(f"Second {i+1}: {formatted_template}")
|
| 2079 |
+
|
| 2080 |
+
second_level_guidance += "\n".join(formatted_second_templates)
|
| 2081 |
+
|
| 2082 |
+
# Add critical instructions for second-level alignment
|
| 2083 |
+
second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern."
|
| 2084 |
+
second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics."
|
| 2085 |
+
second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on."
|
| 2086 |
+
|
| 2087 |
+
# Add to syllable guidance
|
| 2088 |
+
syllable_guidance = second_level_guidance
|
| 2089 |
+
|
| 2090 |
+
# Store templates for verification
|
| 2091 |
+
templates_for_verification = second_level_templates
|
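+
# Hypothetical illustration (the exact tokens come from
# format_syllable_templates_for_prompt): a formatted line might read
# "Second 3: S1→w1.5→m2", i.e. a strong beat on count 1, a weak half-beat
# at 1.5, and a medium beat on count 2 within that second.
|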
| 2092 |
+
|
| 2093 |
+
elif song_structure:
|
| 2094 |
# Try to use flexible structure if available
|
| 2095 |
if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
|
| 2096 |
flexible = song_structure["flexible_structure"]
|
|
|
|
| 2390 |
# Store the syllable guidance for later use
|
| 2391 |
syllable_guidance_text = syllable_guidance
|
| 2392 |
|
| 2393 |
+
# Determine if we should use traditional sections or second-level alignment
|
| 2394 |
+
use_sections = True
|
| 2395 |
+
use_second_level = False
|
| 2396 |
+
|
| 2397 |
+
if song_structure and "second_level" in song_structure and song_structure["second_level"]:
|
| 2398 |
+
use_second_level = True
|
| 2399 |
+
# If we have second-level templates, prioritize those over traditional sections
|
| 2400 |
+
if len(song_structure["second_level"]["templates"]) > 0:
|
| 2401 |
+
use_sections = False
|
| 2402 |
+
elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
|
| 2403 |
# If we have more than 4 segments, it's likely not a traditional song structure
|
| 2404 |
if "segments" in song_structure["flexible_structure"]:
|
| 2405 |
segments = song_structure["flexible_structure"]["segments"]
|
|
|
|
| 2407 |
use_sections = False
|
| 2408 |
|
| 2409 |
# Create enhanced prompt with better rhythm alignment instructions
|
| 2410 |
+
if use_second_level:
|
| 2411 |
+
# Second-level approach with per-second alignment
|
| 2412 |
+
content = f"""
|
| 2413 |
+
You are a talented songwriter who specializes in {genre} music.
|
| 2414 |
+
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
|
| 2415 |
+
|
| 2416 |
+
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
|
| 2417 |
+
|
| 2418 |
+
Music analysis has detected the following qualities in the music:
|
| 2419 |
+
- Tempo: {tempo:.1f} BPM
|
| 2420 |
+
- Key: {key} {mode}
|
| 2421 |
+
- Primary emotion: {primary_emotion}
|
| 2422 |
+
- Primary theme: {primary_theme}
|
| 2423 |
+
|
| 2424 |
+
{syllable_guidance}
|
| 2425 |
+
|
| 2426 |
+
CRITICAL INSTRUCTIONS FOR SECOND-LEVEL RHYTHM ALIGNMENT:
|
| 2427 |
+
1. Each line of lyrics MUST correspond to ONE SECOND of audio.
|
| 2428 |
+
2. The first line of your lyrics MUST match Second 1, the second line matches Second 2, etc.
|
| 2429 |
+
3. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)
|
| 2430 |
+
4. Natural word stress patterns must match the beat strength (strong words on strong beats)
|
| 2431 |
+
5. For seconds with no beats, insert a pause or breath, or continue a phrase from the previous line
|
| 2432 |
+
6. Pay attention to strength values in the pattern (higher values need stronger emphasis)
|
| 2433 |
+
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables
|
| 2434 |
+
|
| 2435 |
+
The lyrics should:
|
| 2436 |
+
- Perfectly capture the essence and style of {genre} music
|
| 2437 |
+
- Express the {primary_emotion} emotion and {primary_theme} theme
|
| 2438 |
+
- Match EXACTLY with the second-by-second rhythm patterns provided above
|
| 2439 |
+
- Be completely original
|
| 2440 |
+
- Create a coherent song that flows naturally despite the precise timing requirements
|
| 2441 |
+
|
| 2442 |
+
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
| 2443 |
+
|
| 2444 |
+
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
| 2445 |
+
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
| 2446 |
+
even if there are no rhythm issues. Include the following in your analysis:
|
| 2447 |
+
1. How well each line matches its corresponding second's rhythm pattern
|
| 2448 |
+
2. Where stressed syllables align with strong beats
|
| 2449 |
+
3. Any potential misalignments or improvements
|
| 2450 |
+
|
| 2451 |
+
Your lyrics:
|
| 2452 |
+
"""
|
| 2453 |
+
elif use_sections:
|
| 2454 |
# Traditional approach with sections
|
| 2455 |
content = f"""
|
| 2456 |
You are a talented songwriter who specializes in {genre} music.
|
| 2457 |
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
|
| 2458 |
|
| 2459 |
+
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
|
| 2460 |
+
|
| 2461 |
Music analysis has detected the following qualities in the music:
|
| 2462 |
- Tempo: {tempo:.1f} BPM
|
| 2463 |
- Key: {key} {mode}
|
|
|
|
| 2475 |
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
|
| 2476 |
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
|
| 2477 |
|
| 2478 |
The lyrics should:
|
| 2479 |
- Perfectly capture the essence and style of {genre} music
|
| 2480 |
- Express the {primary_emotion} emotion and {primary_theme} theme
|
|
|
|
| 2482 |
- Be completely original
|
| 2483 |
- Match the song duration of {duration:.1f} seconds
|
| 2484 |
|
| 2485 |
+
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
| 2486 |
+
|
| 2487 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
| 2488 |
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
| 2489 |
even if there are no rhythm issues. Include the following in your analysis:
|
|
|
|
| 2499 |
You are a talented songwriter who specializes in {genre} music.
|
| 2500 |
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
|
| 2501 |
|
| 2502 |
+
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
|
| 2503 |
+
|
| 2504 |
Music analysis has detected the following qualities:
|
| 2505 |
- Tempo: {tempo:.1f} BPM
|
| 2506 |
- Key: {key} {mode}
|
|
|
|
| 2518 |
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
|
| 2519 |
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
|
| 2520 |
|
| 2521 |
The lyrics should:
|
| 2522 |
- Perfectly capture the essence and style of {genre} music
|
| 2523 |
- Express the {primary_emotion} emotion and {primary_theme} theme
|
|
|
|
| 2528 |
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
|
| 2529 |
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
|
| 2530 |
|
| 2531 |
+
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
| 2532 |
+
|
| 2533 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
| 2534 |
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
| 2535 |
even if there are no rhythm issues. Include the following in your analysis:
|
|
|
|
| 2542 |
|
| 2543 |
# Format as a chat message for the LLM
|
| 2544 |
messages = [
|
| 2545 |
+
{"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
|
| 2546 |
{"role": "user", "content": content}
|
| 2547 |
]
|
| 2548 |
|
|
|
|
| 2559 |
# Configure generation parameters based on model capability
|
| 2560 |
generation_params = {
|
| 2561 |
"do_sample": True,
|
| 2562 |
+
"temperature": 0.5, # Lower for more consistent and direct output
|
| 2563 |
+
"top_p": 0.85, # Slightly lower for more predictable responses
|
| 2564 |
+
"top_k": 50,
|
| 2565 |
"repetition_penalty": 1.2,
|
| 2566 |
+
"max_new_tokens": 2048,
|
| 2567 |
+
"num_return_sequences": 1
|
| 2568 |
}
|
| 2569 |
|
| 2570 |
+
# Add specific stop sequences to prevent excessive explanation
|
| 2571 |
+
if hasattr(llm_model.generation_config, "stopping_criteria"):
|
| 2572 |
+
thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"]
|
| 2573 |
+
for stop in thinking_stops:
|
| 2574 |
+
if stop not in llm_model.generation_config.stopping_criteria:
|
| 2575 |
+
llm_model.generation_config.stopping_criteria.append(stop)
|
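+
# A stock transformers GenerationConfig exposes no "stopping_criteria"
# attribute, so the hasattr guard above is expected to make this a no-op
# unless a custom config defines one; plain-string stops would otherwise
# need a StoppingCriteriaList passed to generate().
|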
| 2576 |
+
|
| 2577 |
# Generate output
|
| 2578 |
generated_ids = llm_model.generate(
|
| 2579 |
**model_inputs,
|
|
|
|
| 2583 |
# Extract output tokens
|
| 2584 |
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
|
| 2585 |
|
| 2586 |
+
# Get the raw output and strip any thinking process
|
| 2587 |
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
|
| 2588 |
|
| 2589 |
+
# Enhanced thinking process removal - handle multiple formats
|
| 2590 |
+
# First check for standard thinking tags
|
| 2591 |
if "<thinking>" in lyrics and "</thinking>" in lyrics:
|
| 2592 |
lyrics = lyrics.split("</thinking>")[1].strip()
|
| 2593 |
|
| 2594 |
+
# Check for alternative thinking indicators with improved detection
|
| 2595 |
+
thinking_markers = [
|
| 2596 |
+
"<think>", "</think>",
|
| 2597 |
+
"[thinking]", "[/thinking]",
|
| 2598 |
+
"I'll think step by step:",
|
| 2599 |
+
"First, I need to understand",
|
| 2600 |
+
"Let me think about",
|
| 2601 |
+
"Let's tackle this query",
|
| 2602 |
+
"Okay, let's tackle this query",
|
| 2603 |
+
"First, I need to understand the requirements",
|
| 2604 |
+
"Looking at the rhythm patterns"
|
| 2605 |
+
]
|
| 2606 |
+
|
| 2607 |
+
# First try to find clear section breaks
|
| 2608 |
for marker in thinking_markers:
|
| 2609 |
if marker in lyrics:
|
| 2610 |
parts = lyrics.split(marker)
|
| 2611 |
if len(parts) > 1:
|
| 2612 |
lyrics = parts[-1].strip() # Take the last part after any thinking marker
|
| 2613 |
|
| 2614 |
+
# Look for long analytical sections followed by clear lyrics
|
| 2615 |
+
analytical_patterns = [
|
| 2616 |
+
"Let me analyze",
|
| 2617 |
+
"I need to understand",
|
| 2618 |
+
"The tempo is",
|
| 2619 |
+
"First, let's look at",
|
| 2620 |
+
"Wait, maybe",
|
| 2621 |
+
"Considering the emotional tone",
|
| 2622 |
+
"Starting with the first line",
|
| 2623 |
+
"Let me check the examples"
|
| 2624 |
+
]
|
| 2625 |
+
|
| 2626 |
+
# Check if lyrics begin with any analytical patterns
|
| 2627 |
+
for pattern in analytical_patterns:
|
| 2628 |
+
if lyrics.startswith(pattern):
|
| 2629 |
+
# Try to find where the actual lyrics start - look for common lyrics markers
|
| 2630 |
+
lyrics_markers = [
|
| 2631 |
+
"\n\n[Verse",
|
| 2632 |
+
"\n\n[Chorus",
|
| 2633 |
+
"\n\nVerse",
|
| 2634 |
+
"\n\nChorus",
|
| 2635 |
+
"\n\n[Verse 1]",
|
| 2636 |
+
"\n\n[Intro]"
|
| 2637 |
+
]
|
| 2638 |
+
|
| 2639 |
+
for marker in lyrics_markers:
|
| 2640 |
+
if marker in lyrics:
|
| 2641 |
+
lyrics = lyrics[lyrics.index(marker):].strip()
|
| 2642 |
+
break
|
| 2643 |
+
|
| 2644 |
+
# One last effort to clean up - if the text is very long and contains obvious thinking
|
| 2645 |
+
# before getting to actual lyrics, try to find a clear starting point
|
| 2646 |
+
if len(lyrics.split()) > 100 and "\n\n" in lyrics:
|
| 2647 |
+
paragraphs = lyrics.split("\n\n")
|
| 2648 |
+
for i, paragraph in enumerate(paragraphs):
|
| 2649 |
+
# Look for typical song structure indicators in a paragraph
|
| 2650 |
+
if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]):
|
| 2651 |
+
lyrics = "\n\n".join(paragraphs[i:])
|
| 2652 |
+
break
|
| 2653 |
+
|
| 2654 |
+
# Clean up any remaining thinking artifacts at the beginning
|
| 2655 |
+
lines = lyrics.split('\n')
|
| 2656 |
+
clean_lines = []
|
| 2657 |
+
lyrics_started = False
|
| 2658 |
+
|
| 2659 |
+
for line in lines:
|
| 2660 |
+
# Skip initial commentary/thinking lines until we hit what looks like lyrics
|
| 2661 |
+
if not lyrics_started:
|
| 2662 |
+
if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]):
|
| 2663 |
+
lyrics_started = True
|
| 2664 |
+
|
| 2665 |
+
if lyrics_started:
|
| 2666 |
+
clean_lines.append(line)
|
| 2667 |
+
|
| 2668 |
+
# Only use the cleaning logic if we found some actual lyrics
|
| 2669 |
+
if clean_lines:
|
| 2670 |
+
lyrics = '\n'.join(clean_lines)
|
| 2671 |
+
|
| 2672 |
+
# Special handling for second-level templates
|
| 2673 |
+
second_level_verification = None
|
| 2674 |
+
if song_structure and "second_level" in song_structure and song_structure["second_level"]:
|
| 2675 |
+
second_level_verification = song_structure["second_level"]["templates"]
|
| 2676 |
+
|
| 2677 |
+
# Verify syllable counts with enhanced verification - pass second-level templates if available
|
| 2678 |
if templates_for_verification:
|
| 2679 |
+
# Convert any NumPy values to native types before verification - directly handle conversions
|
| 2680 |
+
# Simple conversion for basic templates (non-recursive)
|
| 2681 |
+
if isinstance(templates_for_verification, list):
|
| 2682 |
+
safe_templates = []
|
| 2683 |
+
for template in templates_for_verification:
|
| 2684 |
+
if isinstance(template, dict):
|
| 2685 |
+
processed_template = {}
|
| 2686 |
+
for k, v in template.items():
|
| 2687 |
+
if isinstance(v, np.ndarray):
|
| 2688 |
+
if v.size == 1:
|
| 2689 |
+
processed_template[k] = float(v.item())
|
| 2690 |
+
else:
|
| 2691 |
+
processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v]
|
| 2692 |
+
elif isinstance(v, np.number):
|
| 2693 |
+
processed_template[k] = float(v)
|
| 2694 |
+
else:
|
| 2695 |
+
processed_template[k] = v
|
| 2696 |
+
safe_templates.append(processed_template)
|
| 2697 |
+
else:
|
| 2698 |
+
safe_templates.append(template)
|
| 2699 |
+
else:
|
| 2700 |
+
safe_templates = templates_for_verification
|
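+
# For list-of-dict templates, safe_templates now holds only native Python
# types, so downstream formatting in the verifier is free of NumPy scalars.
|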
| 2701 |
+
|
| 2702 |
+
verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification)
|
| 2703 |
|
| 2704 |
# Check if significant issues were detected
|
| 2705 |
if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics:
|
|
|
|
| 2760 |
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip()
|
| 2761 |
|
| 2762 |
# Verify the refined lyrics
|
| 2763 |
+
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification)
|
| 2764 |
|
| 2765 |
# Only use refined lyrics if they're better (fewer notes)
|
| 2766 |
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics:
|
|
|
|
| 2828 |
|
| 2829 |
if len(templates_for_verification) > 30:
|
| 2830 |
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n"
|
| 2831 |
+
|
| 2832 |
+
# Add second-level analysis if available
|
| 2833 |
+
if second_level_verification:
|
| 2834 |
+
syllable_analysis += "\nSecond-Level Template Analysis:\n"
|
| 2835 |
+
for i, template in enumerate(second_level_verification):
|
| 2836 |
+
if i < min(len(second_level_verification), 30): # Limit to 30 seconds
|
| 2837 |
+
syllable_analysis += f"Second {i+1}: {template}\n"
|
| 2838 |
+
|
| 2839 |
+
if len(second_level_verification) > 30:
|
| 2840 |
+
syllable_analysis += f"... and {len(second_level_verification) - 30} more seconds\n"
|
| 2841 |
|
| 2842 |
# Add structure visualization to syllable analysis
|
| 2843 |
syllable_analysis += "\n" + structure_visualization
|
|
|
|
| 2893 |
print(f"Error in genre classification: {str(e)}")
|
| 2894 |
return f"Error in genre classification: {str(e)}", None, ast_results
|
| 2895 |
|
| 2896 |
+
# Initialize default values
|
| 2897 |
+
ast_results = ast_results if ast_results else []
|
| 2898 |
+
song_structure = None
|
| 2899 |
+
emotion_results = {
|
| 2900 |
+
"emotion_analysis": {"primary_emotion": "Unknown"},
|
| 2901 |
+
"theme_analysis": {"primary_theme": "Unknown"},
|
| 2902 |
+
"rhythm_analysis": {"tempo": 0},
|
| 2903 |
+
"tonal_analysis": {"key": "Unknown", "mode": ""},
|
| 2904 |
+
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
|
| 2905 |
+
}
|
| 2906 |
+
|
| 2907 |
print("Step 4/5: Analyzing music emotions, themes, and structure...")
|
| 2908 |
# Analyze music emotions and themes
|
| 2909 |
try:
|
| 2910 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 2911 |
except Exception as e:
|
| 2912 |
print(f"Error in emotion analysis: {str(e)}")
|
| 2913 |
+
# Continue with default emotion_results
|
| 2914 |
|
| 2915 |
# Calculate detailed song structure for better lyrics alignment
|
| 2916 |
try:
|
| 2917 |
+
# Load audio data
|
| 2918 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 2919 |
|
| 2920 |
# Analyze beats and phrases for music-aligned lyrics
|
|
|
|
| 2995 |
"end": segment_end
|
| 2996 |
})
|
| 2997 |
|
| 2998 |
+
# Create flexible structure with the segments
|
| 2999 |
flexible_structure = {
|
| 3000 |
"beats": beats_info,
|
| 3001 |
"segments": segments
|
| 3002 |
}
|
| 3003 |
|
| 3004 |
+
# Create song structure object
|
| 3005 |
song_structure = {
|
| 3006 |
"beats": beats_info,
|
| 3007 |
"sections": sections_info,
|
| 3008 |
+
"flexible_structure": flexible_structure,
|
| 3009 |
+
"syllables": []
|
| 3010 |
}
|
| 3011 |
|
| 3012 |
# Add syllable counts to each section
|
|
|
|
| 3013 |
for section in sections_info:
|
| 3014 |
# Create syllable templates for sections
|
| 3015 |
section_beats_info = {
|
|
|
|
| 3045 |
|
| 3046 |
song_structure["syllables"].append(section_info)
|
| 3047 |
|
| 3048 |
+
# Add second-level beat analysis
|
| 3049 |
+
try:
|
| 3050 |
+
# Get enhanced beat information with subbeats
|
| 3051 |
+
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
|
| 3052 |
+
|
| 3053 |
+
# Map beats to second-level windows
|
| 3054 |
+
sec_map = map_beats_to_seconds(
|
| 3055 |
+
subbeat_info["subbeat_times"],
|
| 3056 |
+
audio_data["duration"]
|
| 3057 |
+
)
|
| 3058 |
+
|
| 3059 |
+
# Create second-level templates
|
| 3060 |
+
second_level_templates = create_second_level_templates(
|
| 3061 |
+
sec_map,
|
| 3062 |
+
subbeat_info["tempo"],
|
| 3063 |
+
top_genres[0][0] # Use top genre
|
| 3064 |
+
)
|
| 3065 |
+
|
| 3066 |
+
# Add to song structure
|
| 3067 |
+
song_structure["second_level"] = {
|
| 3068 |
+
"sec_map": sec_map,
|
| 3069 |
+
"templates": second_level_templates
|
| 3070 |
+
}
|
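+
# Assumed shape, based on how sec_map is consumed in the timeline code below:
# each entry is a dict with a "beats" list, and each beat carries
# "relative_pos" (0-1 within its second), "type" ("main" for primary beats)
# and "strength" (0-1 onset strength).
|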
| 3071 |
+
|
| 3072 |
+
except Exception as e:
|
| 3073 |
+
print(f"Error in second-level beat analysis: {str(e)}")
|
| 3074 |
+
# Continue without second-level data
|
| 3075 |
+
|
| 3076 |
except Exception as e:
|
| 3077 |
print(f"Error analyzing song structure: {str(e)}")
|
| 3078 |
+
# Continue without song structure
|
|
|
|
| 3079 |
|
| 3080 |
print("Step 5/5: Generating rhythmically aligned lyrics...")
|
| 3081 |
# Generate lyrics based on top genre, emotion analysis, and song structure
|
|
|
|
| 3119 |
print(error_msg)
|
| 3120 |
return error_msg, None, []
|
| 3121 |
|
| 3122 |
+
def format_complete_beat_timeline(audio_file, lyrics=None):
|
| 3123 |
+
"""Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation"""
|
| 3124 |
+
if audio_file is None:
|
| 3125 |
+
return "Please upload an audio file to see beat timeline."
|
| 3126 |
+
|
| 3127 |
+
try:
|
| 3128 |
+
# Extract audio data
|
| 3129 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 3130 |
+
|
| 3131 |
+
# Get beat information
|
| 3132 |
+
beats_info = detect_beats(y, sr)
|
| 3133 |
+
|
| 3134 |
+
def ensure_float(value):
|
| 3135 |
+
if isinstance(value, (np.ndarray, np.number)):  # coerce NumPy arrays/scalars to plain float
|
| 3136 |
+
return float(value)
|
| 3137 |
+
return value
|
| 3138 |
+
|
| 3139 |
+
# Format the timeline
|
| 3140 |
+
timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n"
|
| 3141 |
+
# Convert tempo to float before formatting if it's a numpy array
|
| 3142 |
+
tempo = ensure_float(beats_info['tempo'])
|
| 3143 |
+
timeline += f"Tempo: {tempo:.1f} BPM\n"
|
| 3144 |
+
timeline += f"Time Signature: {beats_info['time_signature']}/4\n"
|
| 3145 |
+
timeline += f"Total Beats: {beats_info['beat_count']}\n\n"
|
| 3146 |
+
|
| 3147 |
+
# Create a table header
|
| 3148 |
+
timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n"
|
| 3149 |
+
timeline += "|--------|----------|--------------|------------------|\n"
|
| 3150 |
+
|
| 3151 |
+
# Add beat-by-beat information - show ALL beats
|
| 3152 |
+
for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])):
|
| 3153 |
+
# Convert numpy values to Python float if needed
|
| 3154 |
+
time = ensure_float(time)
|
| 3155 |
+
strength = ensure_float(strength)
|
| 3156 |
+
|
| 3157 |
+
# Determine beat type based on strength
|
| 3158 |
+
if strength >= 0.8:
|
| 3159 |
+
beat_type = "STRONG"
|
| 3160 |
+
elif strength >= 0.5:
|
| 3161 |
+
beat_type = "medium"
|
| 3162 |
+
else:
|
| 3163 |
+
beat_type = "weak"
|
| 3164 |
+
|
| 3165 |
+
# Create beat pattern indicator
|
| 3166 |
+
if i % beats_info['time_signature'] == 0:
|
| 3167 |
+
pattern = "S" # Strong beat at start of measure
|
| 3168 |
+
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3:
|
| 3169 |
+
pattern = "m" # Medium beat (3rd beat in 4/4)
|
| 3170 |
+
else:
|
| 3171 |
+
pattern = "w" # Weak beat
|
| 3172 |
+
|
| 3173 |
+
# Add row to table
|
| 3174 |
+
timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{1.5 if pattern=='S' else 1.0} |\n"
|
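+
# The pattern column is a simplified marker (position-in-measure letter plus a
# nominal syllable weight: 1.5 for measure-initial strong beats, 1.0 otherwise),
# not the full per-phrase syllable template generated further below.
|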
| 3175 |
+
|
| 3176 |
+
# No truncation - show all beats
|
| 3177 |
+
|
| 3178 |
+
# Add a visual timeline of beats
|
| 3179 |
+
timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n"
|
| 3180 |
+
timeline += "Each character represents 0.5 seconds. Beats are marked as:\n"
|
| 3181 |
+
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
|
| 3182 |
+
|
| 3183 |
+
# Calculate total duration and create time markers
|
| 3184 |
+
if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
|
| 3185 |
+
# Get the max value safely
|
| 3186 |
+
max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']])
|
| 3187 |
+
total_duration = max_beat_time + 2 # Add 2 seconds of padding
|
| 3188 |
+
else:
|
| 3189 |
+
total_duration = 30 # Default duration if no beats found
|
| 3190 |
+
|
| 3191 |
+
time_markers = ""
|
| 3192 |
+
for i in range(0, int(total_duration) + 1, 5):
|
| 3193 |
+
time_markers += f"{i:<5}"
|
| 3194 |
+
timeline += time_markers + " (seconds)\n"
|
| 3195 |
+
|
| 3196 |
+
# Create a ruler for easier time tracking
|
| 3197 |
+
ruler = ""
|
| 3198 |
+
for i in range(0, int(total_duration) + 1):
|
| 3199 |
+
if i % 5 == 0:
|
| 3200 |
+
ruler += "+"
|
| 3201 |
+
else:
|
| 3202 |
+
ruler += "-"
|
| 3203 |
+
ruler += "-" * 9 # Each second is 10 characters wide
|
| 3204 |
+
timeline += ruler + "\n"
|
| 3205 |
+
|
| 3206 |
+
# Create a visualization of beats with symbols
|
| 3207 |
+
beat_line = ["·"] * int(total_duration * 2) # 2 characters per second
|
| 3208 |
+
|
| 3209 |
+
for i, time in enumerate(beats_info['beat_times']):
|
| 3210 |
+
if i >= len(beats_info['beat_strengths']):
|
| 3211 |
+
break
|
| 3212 |
+
|
| 3213 |
+
# Convert to float if it's a numpy array
|
| 3214 |
+
time_val = ensure_float(time)
|
| 3215 |
+
|
| 3216 |
+
# Determine position in the timeline
|
| 3217 |
+
pos = int(time_val * 2) # Convert to position in the beat_line
|
| 3218 |
+
if pos >= len(beat_line):
|
| 3219 |
+
continue
|
| 3220 |
+
|
| 3221 |
+
# Determine beat type based on strength and position
|
| 3222 |
+
strength = beats_info['beat_strengths'][i]
|
| 3223 |
+
# Convert to float if it's a numpy array
|
| 3224 |
+
strength = ensure_float(strength)
|
| 3225 |
+
|
| 3226 |
+
if i % beats_info['time_signature'] == 0:
|
| 3227 |
+
beat_line[pos] = "S" # Strong beat at start of measure
|
| 3228 |
+
elif strength >= 0.8:
|
| 3229 |
+
beat_line[pos] = "S" # Strong beat
|
| 3230 |
+
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3:
|
| 3231 |
+
beat_line[pos] = "m" # Medium beat (3rd beat in 4/4)
|
| 3232 |
+
elif strength >= 0.5:
|
| 3233 |
+
beat_line[pos] = "m" # Medium beat
|
| 3234 |
+
else:
|
| 3235 |
+
beat_line[pos] = "w" # Weak beat
|
| 3236 |
+
|
| 3237 |
+
# Format and add to timeline
|
| 3238 |
+
beat_visualization = ""
|
| 3239 |
+
for i in range(0, len(beat_line), 10):
|
| 3240 |
+
beat_visualization += "".join(beat_line[i:i+10])
|
| 3241 |
+
if i + 10 < len(beat_line):
|
| 3242 |
+
beat_visualization += " " # Add space every 5 seconds
|
| 3243 |
+
timeline += beat_visualization + "\n\n"
|
| 3244 |
+
|
| 3245 |
+
# Add measure markers
|
| 3246 |
+
timeline += "=== MEASURE MARKERS ===\n\n"
|
| 3247 |
+
|
| 3248 |
+
# Create a list to track measure start times
|
| 3249 |
+
measure_starts = []
|
| 3250 |
+
for i, time in enumerate(beats_info['beat_times']):
|
| 3251 |
+
if i % beats_info['time_signature'] == 0: # Start of measure
|
| 3252 |
+
# Convert to float if it's a numpy array
|
| 3253 |
+
time_val = ensure_float(time)
|
| 3254 |
+
measure_starts.append((i // beats_info['time_signature'] + 1, time_val))
|
| 3255 |
+
|
| 3256 |
+
# Format measure information
|
| 3257 |
+
if measure_starts:
|
| 3258 |
+
timeline += "| Measure # | Start Time | Duration |\n"
|
| 3259 |
+
timeline += "|-----------|------------|----------|\n"
|
| 3260 |
+
|
| 3261 |
+
for i in range(len(measure_starts)):
|
| 3262 |
+
measure_num, start_time = measure_starts[i]
|
| 3263 |
+
|
| 3264 |
+
# Calculate end time (start of next measure or end of song)
|
| 3265 |
+
if i < len(measure_starts) - 1:
|
| 3266 |
+
end_time = measure_starts[i+1][1]
|
| 3267 |
+
elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
|
| 3268 |
+
# Get the last beat time and convert to float if needed
|
| 3269 |
+
last_beat = beats_info['beat_times'][-1]
|
| 3270 |
+
end_time = ensure_float(last_beat)
|
| 3271 |
+
else:
|
| 3272 |
+
end_time = start_time + 2.0 # Default 2 seconds if no next measure
|
| 3273 |
+
|
| 3274 |
+
duration = end_time - start_time
|
| 3275 |
+
|
| 3276 |
+
timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n"
|
| 3277 |
+
|
| 3278 |
+
# No truncation - show all measures
|
| 3279 |
+
|
| 3280 |
+
# Add phrase information
|
| 3281 |
+
if 'phrases' in beats_info and beats_info['phrases']:
|
| 3282 |
+
timeline += "\n=== MUSICAL PHRASES ===\n\n"
|
| 3283 |
+
for i, phrase in enumerate(beats_info['phrases']):
|
| 3284 |
+
# Show all phrases, not just the first 10
|
| 3285 |
+
if not phrase:
|
| 3286 |
+
continue
|
| 3287 |
+
|
| 3288 |
+
# Safely check phrase indices
|
| 3289 |
+
if not (len(phrase) > 0 and len(beats_info['beat_times']) > 0):
|
| 3290 |
+
continue
|
| 3291 |
+
|
| 3292 |
+
start_beat = min(phrase[0], len(beats_info['beat_times'])-1)
|
| 3293 |
+
end_beat = min(phrase[-1], len(beats_info['beat_times'])-1)
|
| 3294 |
+
|
| 3295 |
+
# Convert to float if needed
|
| 3296 |
+
phrase_start = ensure_float(beats_info['beat_times'][start_beat])
|
| 3297 |
+
phrase_end = ensure_float(beats_info['beat_times'][end_beat])
|
| 3298 |
+
|
| 3299 |
+
timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n"
|
| 3300 |
+
|
| 3301 |
+
# Create syllable template for this phrase with simplified numpy handling
|
| 3302 |
+
phrase_beats = {
|
| 3303 |
+
"beat_times": [ensure_float(beats_info['beat_times'][j])
|
| 3304 |
+
for j in phrase if j < len(beats_info['beat_times'])],
|
| 3305 |
+
"beat_strengths": [ensure_float(beats_info['beat_strengths'][j])
|
| 3306 |
+
for j in phrase if j < len(beats_info['beat_strengths'])],
|
| 3307 |
+
"tempo": ensure_float(beats_info['tempo']),
|
| 3308 |
+
"time_signature": beats_info['time_signature'],
|
| 3309 |
+
"phrases": [list(range(len(phrase)))]
|
| 3310 |
+
}
|
| 3311 |
+
|
| 3312 |
+
template = create_flexible_syllable_templates(phrase_beats)
|
| 3313 |
+
timeline += f" Syllable Template: {template}\n"
|
| 3314 |
+
|
| 3315 |
+
# Create a visual representation of this phrase
|
| 3316 |
+
if phrase_start < total_duration and phrase_end < total_duration:
|
| 3317 |
+
# Create a timeline for this phrase
|
| 3318 |
+
phrase_visualization = ["·"] * int(total_duration * 2)
|
| 3319 |
+
|
| 3320 |
+
# Mark the phrase boundaries
|
| 3321 |
+
start_pos = int(phrase_start * 2)
|
| 3322 |
+
end_pos = int(phrase_end * 2)
|
| 3323 |
+
|
| 3324 |
+
if start_pos < len(phrase_visualization):
|
| 3325 |
+
phrase_visualization[start_pos] = "["
|
| 3326 |
+
|
| 3327 |
+
if end_pos < len(phrase_visualization):
|
| 3328 |
+
phrase_visualization[end_pos] = "]"
|
| 3329 |
+
|
| 3330 |
+
# Mark the beats in this phrase
|
| 3331 |
+
for j in phrase:
|
| 3332 |
+
if j < len(beats_info['beat_times']):
|
| 3333 |
+
beat_time = ensure_float(beats_info['beat_times'][j])
|
| 3334 |
+
beat_pos = int(beat_time * 2)
|
| 3335 |
+
|
| 3336 |
+
if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos:
|
| 3337 |
+
# Determine beat type
|
| 3338 |
+
if j % beats_info['time_signature'] == 0:
|
| 3339 |
+
phrase_visualization[beat_pos] = "S"
|
| 3340 |
+
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2:
|
| 3341 |
+
phrase_visualization[beat_pos] = "m"
|
| 3342 |
+
else:
|
| 3343 |
+
phrase_visualization[beat_pos] = "w"
|
| 3344 |
+
|
| 3345 |
+
# Format and add visualization
|
| 3346 |
+
phrase_visual = ""
|
| 3347 |
+
for k in range(0, len(phrase_visualization), 10):
|
| 3348 |
+
phrase_visual += "".join(phrase_visualization[k:k+10])
|
| 3349 |
+
if k + 10 < len(phrase_visualization):
|
| 3350 |
+
phrase_visual += " "
|
| 3351 |
+
|
| 3352 |
+
timeline += f" Timeline: {phrase_visual}\n\n"
|
| 3353 |
+
|
| 3354 |
+
# Add second-level script display
|
| 3355 |
+
try:
|
| 3356 |
+
# Get second-level beat information
|
| 3357 |
+
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
|
| 3358 |
+
duration = librosa.get_duration(y=y, sr=sr)
|
| 3359 |
+
|
| 3360 |
+
# Map to seconds
|
| 3361 |
+
sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration)
|
| 3362 |
+
|
| 3363 |
+
# Create templates
|
| 3364 |
+
templates = create_second_level_templates(sec_map, subbeat_info["tempo"])
|
| 3365 |
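+
# Unlike the call in process_audio, no genre argument is passed here;
# create_second_level_templates is assumed to treat genre as optional
# with a generic default.
|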
+
|
| 3366 |
+
# Add to timeline
|
| 3367 |
+
timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n"
|
| 3368 |
+
timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n"
|
| 3369 |
+
timeline += "| Second | Beat Pattern | Lyric Content |\n"
|
| 3370 |
+
timeline += "|--------|-------------|---------------|\n"
|
| 3371 |
+
|
| 3372 |
+
# Get clean lyrics (without analysis notes)
|
| 3373 |
+
clean_lyrics = lyrics
|
| 3374 |
+
if isinstance(lyrics, str):
|
| 3375 |
+
if "[Note: Rhythm Analysis]" in lyrics:
|
| 3376 |
+
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
|
| 3377 |
+
elif "[Note: Potential rhythm mismatches" in lyrics:
|
| 3378 |
+
clean_lyrics = lyrics.split("[Note:")[0].strip()
|
| 3379 |
+
|
| 3380 |
+
# Get lyric lines
|
| 3381 |
+
lines = clean_lyrics.strip().split('\n') if clean_lyrics else []
|
| 3382 |
+
|
| 3383 |
+
for i, template in enumerate(templates):
|
| 3384 |
+
# Get corresponding lyric line if available
|
| 3385 |
+
lyric = lines[i] if i < len(lines) else ""
|
| 3386 |
+
if lyric.startswith('[') and ']' in lyric:
|
| 3387 |
+
lyric = "" # Skip section headers
|
| 3388 |
+
|
| 3389 |
+
# Format nicely for display
|
| 3390 |
+
timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n"
|
| 3391 |
+
|
| 3392 |
+
# Add ASCII visualization of second-level beats
|
| 3393 |
+
timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n"
|
| 3394 |
+
timeline += "Each row represents ONE SECOND. Beat types:\n"
|
| 3395 |
+
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
|
| 3396 |
+
|
| 3397 |
+
for i, window in enumerate(sec_map):
|
| 3398 |
+
beats = window["beats"]
|
| 3399 |
+
|
| 3400 |
+
# Create ASCII visualization
|
| 3401 |
+
beat_viz = ["·"] * 20 # 20 columns for visualization
|
| 3402 |
+
|
| 3403 |
+
for beat in beats:
|
| 3404 |
+
# Calculate position in visualization
|
| 3405 |
+
pos = int(beat["relative_pos"] * 19) # Map 0-1 to 0-19
|
| 3406 |
+
if 0 <= pos < len(beat_viz):
|
| 3407 |
+
# Set marker based on beat type
|
| 3408 |
+
if beat["type"] == "main":
|
| 3409 |
+
beat_viz[pos] = "S"
|
| 3410 |
+
elif beat["strength"] >= 0.7:
|
| 3411 |
+
beat_viz[pos] = "m"
|
| 3412 |
+
else:
|
| 3413 |
+
beat_viz[pos] = "w"
|
| 3414 |
+
|
| 3415 |
+
# Get corresponding lyric
|
| 3416 |
+
lyric = lines[i] if i < len(lines) else ""
|
| 3417 |
+
if lyric.startswith('[') and ']' in lyric:
|
| 3418 |
+
lyric = ""
|
| 3419 |
+
|
| 3420 |
+
# Format visualization line
|
| 3421 |
+
viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]"
|
| 3422 |
+
if lyric:
|
| 3423 |
+
viz_line += f" → {lyric[:40]}"
|
| 3424 |
+
|
| 3425 |
+
timeline += viz_line + "\n"
|
| 3426 |
+
|
| 3427 |
+
except Exception as e:
|
| 3428 |
+
timeline += f"\n[Error generating second-level analysis: {str(e)}]"
|
| 3429 |
+
|
| 3430 |
+
# Add a section showing alignment if lyrics were generated
|
| 3431 |
+
if lyrics and isinstance(lyrics, str):
|
| 3432 |
+
timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n"
|
| 3433 |
+
# Remove rhythm analysis notes from lyrics if present
|
| 3434 |
+
if "[Note:" in lyrics:
|
| 3435 |
+
clean_lyrics = lyrics.split("[Note:")[0].strip()
|
| 3436 |
+
else:
|
| 3437 |
+
clean_lyrics = lyrics
|
| 3438 |
+
|
| 3439 |
+
lines = clean_lyrics.strip().split('\n')
|
| 3440 |
+
|
| 3441 |
+
# Show alignment for ALL lines, not just the first 10
|
| 3442 |
+
for i, line in enumerate(lines):
|
| 3443 |
+
if not line.strip() or line.startswith('['):
|
| 3444 |
+
continue
|
| 3445 |
+
|
| 3446 |
+
timeline += f"Line: \"{line}\"\n"
|
| 3447 |
+
|
| 3448 |
+
# Count syllables
|
| 3449 |
+
syllable_count = count_syllables(line)
|
| 3450 |
+
timeline += f" Syllables: {syllable_count}\n"
|
| 3451 |
+
|
| 3452 |
+
# Show ideal timing (if we have enough phrases)
|
| 3453 |
+
if 'phrases' in beats_info and beats_info['phrases'] and i < len(beats_info['phrases']):
|
| 3454 |
+
phrase = beats_info['phrases'][i]
|
| 3455 |
+
# Safely check if phrase has elements and indices are valid
|
| 3456 |
+
if phrase and len(phrase) > 0 and len(beats_info['beat_times']) > 0:
|
| 3457 |
+
start_beat = min(phrase[0], len(beats_info['beat_times'])-1)
|
| 3458 |
+
end_beat = min(phrase[-1], len(beats_info['beat_times'])-1)
|
| 3459 |
+
|
| 3460 |
+
start_time = ensure_float(beats_info['beat_times'][start_beat])
|
| 3461 |
+
end_time = ensure_float(beats_info['beat_times'][end_beat])
|
| 3462 |
+
|
| 3463 |
+
timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n"
|
| 3464 |
+
|
| 3465 |
+
# Create a visualization of syllable alignment
|
| 3466 |
+
timeline += " Alignment: "
|
| 3467 |
+
|
| 3468 |
+
# Create a timeline focused on just this phrase
|
| 3469 |
+
phrase_duration = end_time - start_time
|
| 3470 |
+
syllable_viz = []
|
| 3471 |
+
|
| 3472 |
+
# Initialize with beat markers for this phrase
|
| 3473 |
+
for j in phrase:
|
| 3474 |
+
if j < len(beats_info['beat_times']):
|
| 3475 |
+
beat_time = ensure_float(beats_info['beat_times'][j])
|
| 3476 |
+
# Handle edge case where phrase_duration is very small
|
| 3477 |
+
if phrase_duration > 0.001: # Avoid division by very small numbers
|
| 3478 |
+
relative_pos = int((beat_time - start_time) / phrase_duration * syllable_count)
|
| 3479 |
+
else:
|
| 3480 |
+
relative_pos = 0
|
| 3481 |
+
|
| 3482 |
+
while len(syllable_viz) <= relative_pos:
|
| 3483 |
+
syllable_viz.append("·")
|
| 3484 |
+
|
| 3485 |
+
if j % beats_info['time_signature'] == 0:
|
| 3486 |
+
syllable_viz[relative_pos] = "S"
|
| 3487 |
+
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2:
|
| 3488 |
+
syllable_viz[relative_pos] = "m"
|
| 3489 |
+
else:
|
| 3490 |
+
syllable_viz[relative_pos] = "w"
|
| 3491 |
+
|
| 3492 |
+
# Fill in any gaps
|
| 3493 |
+
while len(syllable_viz) < syllable_count:
|
| 3494 |
+
syllable_viz.append("·")
|
| 3495 |
+
|
| 3496 |
+
# Trim if too long
|
| 3497 |
+
syllable_viz = syllable_viz[:syllable_count]
|
| 3498 |
+
|
| 3499 |
+
# Now map to the line
|
| 3500 |
+
timeline += "".join(syllable_viz) + "\n"
|
| 3501 |
+
|
| 3502 |
+
timeline += "\n"
|
| 3503 |
+
|
| 3504 |
+
# No truncation message for lines
|
| 3505 |
+
|
| 3506 |
+
return timeline
|
| 3507 |
+
|
| 3508 |
+
except Exception as e:
|
| 3509 |
+
print(f"Error generating complete beat timeline: {str(e)}")
|
| 3510 |
+
return f"Error generating complete beat timeline: {str(e)}"
|
| 3511 |
+
|
| 3512 |
+
def display_results(audio_file):
|
| 3513 |
+
"""Process audio file and return formatted results for display in the UI."""
|
| 3514 |
+
# Default error response
|
| 3515 |
+
error_response = ("Please upload an audio file.",
|
| 3516 |
+
"No emotion analysis available.",
|
| 3517 |
+
"No audio classification available.",
|
| 3518 |
+
"No lyrics generated.",
|
| 3519 |
+
"No beat timeline available.")
|
| 3520 |
+
|
| 3521 |
+
if audio_file is None:
|
| 3522 |
+
return error_response
|
| 3523 |
+
|
| 3524 |
+
try:
|
| 3525 |
+
# Process audio and get results
|
| 3526 |
+
results = process_audio(audio_file)
|
| 3527 |
+
|
| 3528 |
+
# Check if we got an error message
|
| 3529 |
+
if isinstance(results, str) and "Error" in results:
|
| 3530 |
+
return results, *error_response[1:]
|
| 3531 |
+
elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
|
| 3532 |
+
return results[0], *error_response[1:]
|
| 3533 |
+
|
| 3534 |
+
# Extract results
|
| 3535 |
+
if isinstance(results, dict):
|
| 3536 |
+
# New format
|
| 3537 |
+
genre_results = results.get("genre_results", "Genre classification failed")
|
| 3538 |
+
lyrics = results.get("lyrics", "Lyrics generation failed")
|
| 3539 |
+
ast_results = results.get("ast_results", [])
|
| 3540 |
+
else:
|
| 3541 |
+
# Old tuple format
|
| 3542 |
+
genre_results, lyrics, ast_results = results
|
| 3543 |
+
|
| 3544 |
+
# Get clean lyrics (without analysis notes)
|
| 3545 |
+
clean_lyrics = lyrics
|
| 3546 |
+
if isinstance(lyrics, str):
|
| 3547 |
+
if "[Note: Rhythm Analysis]" in lyrics:
|
| 3548 |
+
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
|
| 3549 |
+
elif "[Note: Potential rhythm mismatches" in lyrics:
|
| 3550 |
+
clean_lyrics = lyrics.split("[Note:")[0].strip()
|
| 3551 |
+
|
| 3552 |
+
# Generate beat timeline - use the complete timeline function that shows all beats
|
| 3553 |
+
beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics)
|
| 3554 |
+
|
| 3555 |
+
# Format emotion analysis results
|
| 3556 |
+
emotion_text = "No emotion analysis available."
|
| 3557 |
+
try:
|
| 3558 |
+
emotion_results = music_analyzer.analyze_music(audio_file)
|
| 3559 |
+
emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
|
| 3560 |
+
f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
|
| 3561 |
+
f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
|
| 3562 |
+
f"Primary Theme: {emotion_results['summary']['primary_theme']}")
|
| 3563 |
+
|
| 3564 |
+
# Add song structure if available (without nested try/except)
|
| 3565 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
| 3566 |
+
beats_info = detect_beats(y, sr)
|
| 3567 |
+
sections_info = detect_sections(y, sr)
|
| 3568 |
+
|
| 3569 |
+
if sections_info:
|
| 3570 |
+
emotion_text += "\n\nSong Structure:\n"
|
| 3571 |
+
for section in sections_info:
|
| 3572 |
+
emotion_text += (f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
|
| 3573 |
+
f"({section['duration']:.1f}s)\n")
|
| 3574 |
+
except Exception as e:
|
| 3575 |
+
print(f"Error in emotion analysis: {str(e)}")
|
| 3576 |
+
|
| 3577 |
+
# Format audio classification results
|
| 3578 |
+
ast_text = "No valid audio classification results available."
|
| 3579 |
+
if ast_results and isinstance(ast_results, list):
|
| 3580 |
+
ast_text = "Audio Classification Results:\n"
|
| 3581 |
+
for result in ast_results[:5]: # Show top 5 results
|
| 3582 |
+
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
|
| 3583 |
+
|
| 3584 |
+
# Return all results
|
| 3585 |
+
return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline
|
| 3586 |
+
|
| 3587 |
+
except Exception as e:
|
| 3588 |
+
error_msg = f"Error: {str(e)}"
|
| 3589 |
+
print(error_msg)
|
| 3590 |
+
return error_msg, *error_response[1:]
|
| 3591 |
+
|
| 3592 |
# Create enhanced Gradio interface with tabs for better organization
|
| 3593 |
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
| 3594 |
gr.Markdown("# Music Genre Classifier & Lyrics Generator")
|
|
|
|
| 3629 |
with gr.TabItem("Generated Lyrics"):
|
| 3630 |
lyrics_output = gr.Textbox(label="Lyrics", lines=18)
|
| 3631 |
|
| 3632 |
+
with gr.TabItem("Beat & Syllable Timeline"):
|
| 3633 |
+
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
|
| 3634 |
|
| 3635 |
# Connect the button to the display function with updated outputs
|
| 3636 |
submit_btn.click(
|
| 3637 |
fn=display_results,
|
| 3638 |
inputs=[audio_input],
|
| 3639 |
+
outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
|
| 3640 |
)
|
| 3641 |
|
| 3642 |
# Enhanced explanation of how the system works
|
|
|
|
| 3654 |
- Strong and weak beats
|
| 3655 |
- Natural phrase boundaries
|
| 3656 |
- Time signature and tempo variations
|
| 3657 |
+
- Beat subdivisions (half and quarter beats)
|
| 3658 |
+
|
| 3659 |
+
5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating a precise per-second template for lyric alignment.
|
| 3660 |
|
| 3661 |
+
6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect:
|
| 3662 |
- Beat stress patterns (strong, medium, weak)
|
| 3663 |
- Appropriate syllable counts based on tempo
|
| 3664 |
- Genre-specific rhythmic qualities
|
| 3665 |
+
- Half-beat and quarter-beat subdivisions
|
| 3666 |
|
| 3667 |
+
7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
|
| 3668 |
- Match the emotional quality of the music
|
| 3669 |
+
- Follow the precise syllable templates for each second
|
| 3670 |
- Align stressed syllables with strong beats
|
| 3671 |
- Maintain genre-appropriate style and themes
|
| 3672 |
|
| 3673 |
+
8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
|
| 3674 |
- Syllable count accuracy
|
| 3675 |
- Stress alignment with strong beats
|
| 3676 |
- Word stress patterns
|
| 3677 |
+
- Second-by-second alignment precision
|
| 3678 |
|
| 3679 |
+
9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment.
|
| 3680 |
|
| 3681 |
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
|
| 3682 |
""")
|
utils.py
CHANGED
|
@@ -37,54 +37,6 @@ def extract_mfcc_features(y, sr, n_mfcc=20):
|
|
| 37 |
# Return a fallback feature vector if extraction fails
|
| 38 |
return np.zeros(n_mfcc)
|
| 39 |
|
| 40 |
-
def calculate_lyrics_length(duration, tempo=100, time_signature=4):
|
| 41 |
-
"""Calculate appropriate lyrics structure based on musical principles."""
|
| 42 |
-
# Legacy behavior - simple calculation based on duration
|
| 43 |
-
lines_count = max(4, int(duration / 10))
|
| 44 |
-
|
| 45 |
-
# If only duration was provided (original usage), return just the integer
|
| 46 |
-
if not isinstance(tempo, (int, float)) or not isinstance(time_signature, (int, float)):
|
| 47 |
-
return lines_count
|
| 48 |
-
|
| 49 |
-
# Enhanced calculation
|
| 50 |
-
beats_per_minute = tempo
|
| 51 |
-
beats_per_second = beats_per_minute / 60
|
| 52 |
-
total_beats = duration * beats_per_second
|
| 53 |
-
total_measures = total_beats / time_signature
|
| 54 |
-
|
| 55 |
-
# Determine section distributions
|
| 56 |
-
verse_lines = 0
|
| 57 |
-
chorus_lines = 0
|
| 58 |
-
bridge_lines = 0
|
| 59 |
-
|
| 60 |
-
if lines_count <= 6:
|
| 61 |
-
verse_lines = 2
|
| 62 |
-
chorus_lines = 2
|
| 63 |
-
elif lines_count <= 10:
|
| 64 |
-
verse_lines = 3
|
| 65 |
-
chorus_lines = 2
|
| 66 |
-
else:
|
| 67 |
-
verse_lines = 3
|
| 68 |
-
chorus_lines = 2
|
| 69 |
-
bridge_lines = 2
|
| 70 |
-
|
| 71 |
-
# Create structured output
|
| 72 |
-
song_structure = {
|
| 73 |
-
"total_measures": int(total_measures),
|
| 74 |
-
"lines_count": lines_count, # Include the original line count
|
| 75 |
-
"sections": [
|
| 76 |
-
{"type": "verse", "lines": verse_lines, "measures": int(total_measures * 0.4)},
|
| 77 |
-
{"type": "chorus", "lines": chorus_lines, "measures": int(total_measures * 0.3)}
|
| 78 |
-
]
|
| 79 |
-
}
|
| 80 |
-
|
| 81 |
-
if bridge_lines > 0:
|
| 82 |
-
song_structure["sections"].append(
|
| 83 |
-
{"type": "bridge", "lines": bridge_lines, "measures": int(total_measures * 0.2)}
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
return song_structure
|
| 87 |
-
|
| 88 |
def format_genre_results(top_genres):
|
| 89 |
"""Format genre classification results for display."""
|
| 90 |
result = "Top Detected Genres:\n"
|
|
@@ -103,17 +55,3 @@ def ensure_cuda_availability():
|
|
| 103 |
print("CUDA is not available. Using CPU for inference.")
|
| 104 |
return cuda_available
|
| 105 |
|
| 106 |
-
def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
|
| 107 |
-
"""Preprocess audio for model input (resample, pad/trim)."""
|
| 108 |
-
# Resample if needed
|
| 109 |
-
if sample_rate != target_sample_rate:
|
| 110 |
-
waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
|
| 111 |
-
|
| 112 |
-
# Trim or pad to expected length
|
| 113 |
-
if len(waveform) > max_length:
|
| 114 |
-
waveform = waveform[:max_length]
|
| 115 |
-
elif len(waveform) < max_length:
|
| 116 |
-
padding = max_length - len(waveform)
|
| 117 |
-
waveform = np.pad(waveform, (0, padding), 'constant')
|
| 118 |
-
|
| 119 |
-
return waveform
|
|
|
|
| 37 |
# Return a fallback feature vector if extraction fails
|
| 38 |
return np.zeros(n_mfcc)
|
| 39 |
|
|
|
| 40 |
def format_genre_results(top_genres):
|
| 41 |
"""Format genre classification results for display."""
|
| 42 |
result = "Top Detected Genres:\n"
|
|
|
|
| 55 |
print("CUDA is not available. Using CPU for inference.")
|
| 56 |
return cuda_available
|
| 57 |
|
|
|