root commited on
Commit
bddf9c4
·
1 Parent(s): 670bed3
Files changed (4) hide show
  1. app.py +0 -0
  2. appp.py +0 -0
  3. lastapp.py +1229 -273
  4. utils.py +0 -62
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
appp.py ADDED
The diff for this file is too large to render. See raw diff
 
lastapp.py CHANGED
@@ -19,10 +19,8 @@ from utils import (
19
  load_audio,
20
  extract_audio_duration,
21
  extract_mfcc_features,
22
- calculate_lyrics_length,
23
  format_genre_results,
24
- ensure_cuda_availability,
25
- preprocess_audio_for_model
26
  )
27
  from emotionanalysis import MusicAnalyzer
28
  import librosa
@@ -106,6 +104,75 @@ llm_pipeline = pipeline(
106
  # Initialize music emotion analyzer
107
  music_analyzer = MusicAnalyzer()
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # New function: Count syllables in text
110
  def count_syllables(text):
111
  """Count syllables in a given text using the pronouncing library."""
@@ -113,31 +180,7 @@ def count_syllables(text):
113
  syllable_count = 0
114
 
115
  for word in words:
116
- # Get pronunciations for the word
117
- pronunciations = pronouncing.phones_for_word(word)
118
- if pronunciations:
119
- # Count syllables in the first pronunciation
120
- syllable_count += pronouncing.syllable_count(pronunciations[0])
121
- else:
122
- # Fallback: estimate syllables based on vowel groups
123
- vowels = "aeiouy"
124
- count = 0
125
- prev_is_vowel = False
126
-
127
- for char in word:
128
- is_vowel = char.lower() in vowels
129
- if is_vowel and not prev_is_vowel:
130
- count += 1
131
- prev_is_vowel = is_vowel
132
-
133
- if word.endswith('e'):
134
- count -= 1
135
- if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
136
- count += 1
137
- if count == 0:
138
- count = 1
139
-
140
- syllable_count += count
141
 
142
  return syllable_count
143
 
@@ -304,8 +347,7 @@ def detect_beats(y, sr):
304
  onset_envelope=combined_onset,
305
  sr=sr,
306
  tightness=100,
307
- start_bpm=60, # Lower starting BPM helps find different time signatures
308
- std_bpm=20 # Allow wider variations
309
  )
310
  tempo_candidates.append(tempo2)
311
  beat_candidates.append(beats2)
@@ -487,6 +529,281 @@ def detect_beats(y, sr):
487
  "phrases": phrases
488
  }
489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  def detect_sections(y, sr):
491
  """
492
  Advanced detection of musical sections with adaptive segmentation and improved classification.
@@ -768,6 +1085,24 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
768
  import numpy as np
769
  from sklearn.cluster import KMeans
770
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
  # Extract basic beat information
772
  beat_times = beats_info.get("beat_times", [])
773
  beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
@@ -1169,10 +1504,10 @@ def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_w
1169
 
1170
  return "\n".join(output)
1171
 
1172
- def verify_flexible_syllable_counts(lyrics, templates):
1173
  """
1174
  Enhanced verification of syllable counts and stress patterns with precise alignment analysis
1175
- and detailed feedback for all phrases in a template.
1176
  """
1177
  import re
1178
  import pronouncing
@@ -1180,74 +1515,6 @@ def verify_flexible_syllable_counts(lyrics, templates):
1180
  import functools
1181
  from itertools import chain
1182
 
1183
- # Apply caching to improve performance for repeated word lookups
1184
- @functools.lru_cache(maxsize=512)
1185
- def cached_phones_for_word(word):
1186
- return pronouncing.phones_for_word(word)
1187
-
1188
- @functools.lru_cache(maxsize=512)
1189
- def count_syllables_for_word(word):
1190
- """Count syllables in a single word with caching for performance."""
1191
- # Try using pronouncing library first
1192
- pronunciations = cached_phones_for_word(word.lower())
1193
- if pronunciations:
1194
- return pronouncing.syllable_count(pronunciations[0])
1195
-
1196
- # Fallback method for words not in the pronouncing dictionary
1197
- vowels = "aeiouy"
1198
- word = word.lower()
1199
- count = 0
1200
- prev_is_vowel = False
1201
-
1202
- for char in word:
1203
- is_vowel = char in vowels
1204
- if is_vowel and not prev_is_vowel:
1205
- count += 1
1206
- prev_is_vowel = is_vowel
1207
-
1208
- # Handle special cases
1209
- if word.endswith('e') and not word.endswith('le'):
1210
- count -= 1
1211
- if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
1212
- count += 1
1213
- if count == 0:
1214
- count = 1
1215
-
1216
- return count
1217
-
1218
- @functools.lru_cache(maxsize=512)
1219
- def get_word_stress(word):
1220
- """Get the stress pattern for a word with improved fallback handling."""
1221
- pronunciations = cached_phones_for_word(word.lower())
1222
- if pronunciations:
1223
- return pronouncing.stresses(pronunciations[0])
1224
-
1225
- # Enhanced fallback for words not in the dictionary
1226
- syllables = count_syllables_for_word(word)
1227
-
1228
- # Common English stress patterns by word length
1229
- if syllables == 1:
1230
- return "1" # Single syllable words are stressed
1231
- elif syllables == 2:
1232
- # Most 2-syllable nouns and adjectives stress first syllable
1233
- # Common endings that indicate second-syllable stress
1234
- second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
1235
- if any(word.endswith(ending) for ending in second_syllable_stress):
1236
- return "01"
1237
- else:
1238
- return "10" # Default for 2-syllable words
1239
- elif syllables == 3:
1240
- # Common endings for specific stress patterns in 3-syllable words
1241
- if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
1242
- return "100" # First syllable stress
1243
- elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
1244
- return "010" # Middle syllable stress
1245
- else:
1246
- return "100" # Default for 3-syllable words
1247
- else:
1248
- # For longer words, use common English patterns
1249
- return "1" + "0" * (syllables - 1)
1250
-
1251
  # Split lyrics into lines
1252
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1253
 
@@ -1463,6 +1730,97 @@ def verify_flexible_syllable_counts(lyrics, templates):
1463
  # If no matching template was found
1464
  verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")
1465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1466
  # Only add detailed analysis if we have rhythm mismatches
1467
  if verification_notes:
1468
  lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
@@ -1660,6 +2018,28 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
1660
  Returns:
1661
  Generated lyrics aligned with the rhythm patterns of the music
1662
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1663
  # Extract emotion and theme data from analysis results
1664
  primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
1665
  primary_theme = emotion_results["theme_analysis"]["primary_theme"]
@@ -1682,7 +2062,35 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
1682
  structure_visualization += f"Song Duration: {duration:.1f} seconds\n"
1683
  structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n"
1684
 
1685
- if song_structure:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1686
  # Try to use flexible structure if available
1687
  if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
1688
  flexible = song_structure["flexible_structure"]
@@ -1982,8 +2390,16 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
1982
  # Store the syllable guidance for later use
1983
  syllable_guidance_text = syllable_guidance
1984
 
1985
- # Determine if we should use traditional sections or not based on structure
1986
- if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
 
 
 
 
 
 
 
 
1987
  # If we have more than 4 segments, it's likely not a traditional song structure
1988
  if "segments" in song_structure["flexible_structure"]:
1989
  segments = song_structure["flexible_structure"]["segments"]
@@ -1991,12 +2407,57 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
1991
  use_sections = False
1992
 
1993
  # Create enhanced prompt with better rhythm alignment instructions
1994
- if use_sections:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1995
  # Traditional approach with sections
1996
  content = f"""
1997
  You are a talented songwriter who specializes in {genre} music.
1998
  Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
1999
 
 
 
2000
  Music analysis has detected the following qualities in the music:
2001
  - Tempo: {tempo:.1f} BPM
2002
  - Key: {key} {mode}
@@ -2014,14 +2475,6 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
2014
  6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
2015
  7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
2016
 
2017
- Think step by step about how to match words to the rhythm pattern:
2018
- 1. First, identify the strong beats in each line pattern
2019
- 2. Choose words where stressed syllables naturally fall on strong beats
2020
- 3. Count syllables carefully to ensure they match the pattern precisely
2021
- 4. Test your line against the pattern by mapping each syllable
2022
-
2023
- IMPORTANT: Each line of lyrics must match exactly to ONE musical phrase/segment.
2024
-
2025
  The lyrics should:
2026
  - Perfectly capture the essence and style of {genre} music
2027
  - Express the {primary_emotion} emotion and {primary_theme} theme
@@ -2029,6 +2482,8 @@ The lyrics should:
2029
  - Be completely original
2030
  - Match the song duration of {duration:.1f} seconds
2031
 
 
 
2032
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2033
  where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2034
  even if there are no rhythm issues. Include the following in your analysis:
@@ -2044,6 +2499,8 @@ Your lyrics:
2044
  You are a talented songwriter who specializes in {genre} music.
2045
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
2046
 
 
 
2047
  Music analysis has detected the following qualities:
2048
  - Tempo: {tempo:.1f} BPM
2049
  - Key: {key} {mode}
@@ -2061,19 +2518,6 @@ CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
2061
  6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
2062
  7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
2063
 
2064
- Think step by step about how to match words to the rhythm pattern:
2065
- 1. First, identify the strong beats in each line pattern
2066
- 2. Choose words where stressed syllables naturally fall on strong beats
2067
- 3. Count syllables carefully to ensure they match the pattern precisely
2068
- 4. Test your line against the pattern by mapping each syllable
2069
-
2070
- CRITICAL: Each line of lyrics must match exactly to ONE musical phrase/segment.
2071
-
2072
- For perfect alignment examples:
2073
- - "FEEL the RHY-thm in your SOUL" – stressed syllables on strong beats
2074
- - "to-DAY we DANCE a-LONG" – natural speech stress matches musical stress
2075
- - "WAIT-ing FOR the SUN to RISE" – syllable emphasis aligns with beat emphasis
2076
-
2077
  The lyrics should:
2078
  - Perfectly capture the essence and style of {genre} music
2079
  - Express the {primary_emotion} emotion and {primary_theme} theme
@@ -2084,6 +2528,8 @@ The lyrics should:
2084
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2085
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2086
 
 
 
2087
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2088
  where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2089
  even if there are no rhythm issues. Include the following in your analysis:
@@ -2096,6 +2542,7 @@ Your lyrics:
2096
 
2097
  # Format as a chat message for the LLM
2098
  messages = [
 
2099
  {"role": "user", "content": content}
2100
  ]
2101
 
@@ -2112,13 +2559,21 @@ Your lyrics:
2112
  # Configure generation parameters based on model capability
2113
  generation_params = {
2114
  "do_sample": True,
2115
- "temperature": 0.6, # Lower for more consistent rhythm alignment
2116
- "top_p": 0.95,
2117
- "top_k": 50, # Increased from 20 for more diversity
2118
  "repetition_penalty": 1.2,
2119
- "max_new_tokens": 2048 # Doubled from 1024 for more comprehensive lyrics
 
2120
  }
2121
 
 
 
 
 
 
 
 
2122
  # Generate output
2123
  generated_ids = llm_model.generate(
2124
  **model_inputs,
@@ -2128,24 +2583,123 @@ Your lyrics:
2128
  # Extract output tokens
2129
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
2130
 
2131
- # Skip the thinking process completely and just get the raw output
2132
  lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
2133
 
2134
- # If we find <thinking> tags, extract only the content after </thinking>
 
2135
  if "<thinking>" in lyrics and "</thinking>" in lyrics:
2136
  lyrics = lyrics.split("</thinking>")[1].strip()
2137
 
2138
- # Remove any other thinking indicators that might be present
2139
- thinking_markers = ["<think>", "</think>", "[thinking]", "[/thinking]", "I'll think step by step:"]
 
 
 
 
 
 
 
 
 
 
 
 
2140
  for marker in thinking_markers:
2141
  if marker in lyrics:
2142
  parts = lyrics.split(marker)
2143
  if len(parts) > 1:
2144
  lyrics = parts[-1].strip() # Take the last part after any thinking marker
2145
 
2146
- # Verify syllable counts with enhanced verification
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2147
  if templates_for_verification:
2148
- verified_lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2149
 
2150
  # Check if significant issues were detected
2151
  if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics:
@@ -2206,7 +2760,7 @@ Improved lyrics with fixed rhythm:
2206
  refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip()
2207
 
2208
  # Verify the refined lyrics
2209
- refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, templates_for_verification)
2210
 
2211
  # Only use refined lyrics if they're better (fewer notes)
2212
  if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics:
@@ -2274,6 +2828,16 @@ Improved lyrics with fixed rhythm:
2274
 
2275
  if len(templates_for_verification) > 30:
2276
  syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n"
 
 
 
 
 
 
 
 
 
 
2277
 
2278
  # Add structure visualization to syllable analysis
2279
  syllable_analysis += "\n" + structure_visualization
@@ -2329,24 +2893,28 @@ def process_audio(audio_file):
2329
  print(f"Error in genre classification: {str(e)}")
2330
  return f"Error in genre classification: {str(e)}", None, ast_results
2331
 
 
 
 
 
 
 
 
 
 
 
 
2332
  print("Step 4/5: Analyzing music emotions, themes, and structure...")
2333
  # Analyze music emotions and themes
2334
  try:
2335
  emotion_results = music_analyzer.analyze_music(audio_file)
2336
  except Exception as e:
2337
  print(f"Error in emotion analysis: {str(e)}")
2338
- # Continue even if emotion analysis fails
2339
- emotion_results = {
2340
- "emotion_analysis": {"primary_emotion": "Unknown"},
2341
- "theme_analysis": {"primary_theme": "Unknown"},
2342
- "rhythm_analysis": {"tempo": 0},
2343
- "tonal_analysis": {"key": "Unknown", "mode": ""},
2344
- "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
2345
- }
2346
 
2347
  # Calculate detailed song structure for better lyrics alignment
2348
  try:
2349
- # Enhanced song structure calculation for precise lyrics matching
2350
  y, sr = load_audio(audio_file, SAMPLE_RATE)
2351
 
2352
  # Analyze beats and phrases for music-aligned lyrics
@@ -2427,21 +2995,21 @@ def process_audio(audio_file):
2427
  "end": segment_end
2428
  })
2429
 
2430
- # Create a flexible structure with the segments
2431
  flexible_structure = {
2432
  "beats": beats_info,
2433
  "segments": segments
2434
  }
2435
 
2436
- # Add to song structure
2437
  song_structure = {
2438
  "beats": beats_info,
2439
  "sections": sections_info,
2440
- "flexible_structure": flexible_structure
 
2441
  }
2442
 
2443
  # Add syllable counts to each section
2444
- song_structure["syllables"] = []
2445
  for section in sections_info:
2446
  # Create syllable templates for sections
2447
  section_beats_info = {
@@ -2477,12 +3045,37 @@ def process_audio(audio_file):
2477
 
2478
  song_structure["syllables"].append(section_info)
2479
 
2480
- print(f"Successfully analyzed song structure with {len(segments)} segments")
2481
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2482
  except Exception as e:
2483
  print(f"Error analyzing song structure: {str(e)}")
2484
- # Continue with a simpler approach if this fails
2485
- song_structure = None
2486
 
2487
  print("Step 5/5: Generating rhythmically aligned lyrics...")
2488
  # Generate lyrics based on top genre, emotion analysis, and song structure
@@ -2526,6 +3119,476 @@ def process_audio(audio_file):
2526
  print(error_msg)
2527
  return error_msg, None, []
2528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2529
  # Create enhanced Gradio interface with tabs for better organization
2530
  with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
2531
  gr.Markdown("# Music Genre Classifier & Lyrics Generator")
@@ -2566,126 +3629,14 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
2566
  with gr.TabItem("Generated Lyrics"):
2567
  lyrics_output = gr.Textbox(label="Lyrics", lines=18)
2568
 
2569
- with gr.TabItem("Rhythm Analysis"):
2570
- rhythm_analysis_output = gr.Textbox(label="Syllable-Beat Alignment Analysis", lines=16)
2571
-
2572
- with gr.TabItem("Syllable Analysis"):
2573
- syllable_analysis_output = gr.Textbox(label="Detailed Syllable Analysis", lines=16)
2574
- prompt_template_output = gr.Textbox(label="Prompt Template", lines=16)
2575
-
2576
- # Processing function with better handling of results
2577
- def display_results(audio_file):
2578
- if audio_file is None:
2579
- return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", "No lyrics generated.", "No rhythm analysis available.", "No syllable analysis available.", "No prompt template available."
2580
-
2581
- try:
2582
- # Process audio and get results
2583
- results = process_audio(audio_file)
2584
-
2585
- # Check if we got an error message instead of results
2586
- if isinstance(results, str) and "Error" in results:
2587
- return results, "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
2588
- elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
2589
- return results[0], "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
2590
-
2591
- # For backwards compatibility, handle both dictionary and tuple returns
2592
- if isinstance(results, dict):
2593
- genre_results = results.get("genre_results", "Genre classification failed")
2594
- lyrics = results.get("lyrics", "Lyrics generation failed")
2595
- ast_results = results.get("ast_results", [])
2596
-
2597
- # Use clean lyrics if available
2598
- clean_lyrics = results.get("clean_lyrics", lyrics)
2599
- rhythm_analysis = results.get("rhythm_analysis", "No detailed rhythm analysis available")
2600
-
2601
- # Extract syllable analysis and prompt template
2602
- syllable_analysis = results.get("syllable_analysis", "No syllable analysis available")
2603
- prompt_template = results.get("prompt_template", "No prompt template available")
2604
- else:
2605
- # Handle the old tuple return format
2606
- genre_results, lyrics, ast_results = results
2607
- clean_lyrics = lyrics
2608
-
2609
- # Extract rhythm analysis if present
2610
- rhythm_analysis = "No detailed rhythm analysis available"
2611
- if isinstance(lyrics, str):
2612
- # First check for new format
2613
- if "[Note: Rhythm Analysis]" in lyrics:
2614
- clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
2615
- rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1]
2616
- # Check for old format
2617
- elif "[Note: Potential rhythm mismatches" in lyrics:
2618
- clean_lyrics = lyrics.split("[Note:")[0].strip()
2619
- rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1]
2620
-
2621
- # Default values for new fields
2622
- syllable_analysis = "No syllable analysis available"
2623
- prompt_template = "No prompt template available"
2624
-
2625
- # Format emotion analysis results
2626
- try:
2627
- emotion_results = music_analyzer.analyze_music(audio_file)
2628
- emotion_text = f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
2629
- emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
2630
- emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
2631
- emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}"
2632
-
2633
- # Add detailed song structure information if available
2634
- try:
2635
- audio_data = extract_audio_features(audio_file)
2636
- song_structure = calculate_detailed_song_structure(audio_data)
2637
-
2638
- emotion_text += "\n\nSong Structure:\n"
2639
- for section in song_structure["syllables"]:
2640
- emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
2641
- emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, "
2642
-
2643
- if "syllable_template" in section:
2644
- emotion_text += f"template: {section['syllable_template']})\n"
2645
- else:
2646
- emotion_text += f"~{section['syllable_count']} syllables)\n"
2647
-
2648
- # Add flexible structure info if available
2649
- if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
2650
- flexible = song_structure["flexible_structure"]
2651
- if "segments" in flexible and flexible["segments"]:
2652
- emotion_text += "\nDetailed Rhythm Analysis:\n"
2653
- for i, segment in enumerate(flexible["segments"][:5]): # Show first 5 segments
2654
- emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, "
2655
- emotion_text += f"pattern: {segment.get('syllable_template', 'N/A')}\n"
2656
-
2657
- if len(flexible["segments"]) > 5:
2658
- emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n"
2659
-
2660
- except Exception as e:
2661
- print(f"Error displaying song structure: {str(e)}")
2662
- # Continue without showing structure details
2663
-
2664
- except Exception as e:
2665
- print(f"Error in emotion analysis: {str(e)}")
2666
- emotion_text = f"Error in emotion analysis: {str(e)}"
2667
-
2668
- # Format AST classification results
2669
- if ast_results and isinstance(ast_results, list):
2670
- ast_text = "Audio Classification Results:\n"
2671
- for result in ast_results[:5]: # Show top 5 results
2672
- ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
2673
- else:
2674
- ast_text = "No valid audio classification results available."
2675
-
2676
- # Return all results including new fields
2677
- return genre_results, emotion_text, ast_text, clean_lyrics, rhythm_analysis, syllable_analysis, prompt_template
2678
-
2679
- except Exception as e:
2680
- error_msg = f"Error: {str(e)}"
2681
- print(error_msg)
2682
- return error_msg, "Error in emotion analysis", "Error in audio classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available"
2683
 
2684
  # Connect the button to the display function with updated outputs
2685
  submit_btn.click(
2686
  fn=display_results,
2687
  inputs=[audio_input],
2688
- outputs=[genre_output, emotion_output, ast_output, lyrics_output, rhythm_analysis_output, syllable_analysis_output, prompt_template_output]
2689
  )
2690
 
2691
  # Enhanced explanation of how the system works
@@ -2703,24 +3654,29 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
2703
  - Strong and weak beats
2704
  - Natural phrase boundaries
2705
  - Time signature and tempo variations
 
 
 
2706
 
2707
- 5. **Syllable Template Creation**: For each musical phrase, the system generates precise syllable templates that reflect:
2708
  - Beat stress patterns (strong, medium, weak)
2709
  - Appropriate syllable counts based on tempo
2710
  - Genre-specific rhythmic qualities
 
2711
 
2712
- 6. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
2713
  - Match the emotional quality of the music
2714
- - Follow the precise syllable templates
2715
  - Align stressed syllables with strong beats
2716
  - Maintain genre-appropriate style and themes
2717
 
2718
- 7. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
2719
  - Syllable count accuracy
2720
  - Stress alignment with strong beats
2721
  - Word stress patterns
 
2722
 
2723
- 8. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment.
2724
 
2725
  This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
2726
  """)
 
19
  load_audio,
20
  extract_audio_duration,
21
  extract_mfcc_features,
 
22
  format_genre_results,
23
+ ensure_cuda_availability
 
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
 
104
  # Initialize music emotion analyzer
105
  music_analyzer = MusicAnalyzer()
106
 
107
+ # New global function moved outside of verify_flexible_syllable_counts
108
+ @functools.lru_cache(maxsize=512)
109
+ def cached_phones_for_word(word):
110
+ """Get word pronunciations with caching for better performance."""
111
+ return pronouncing.phones_for_word(word)
112
+
113
+ @functools.lru_cache(maxsize=512)
114
+ def count_syllables_for_word(word):
115
+ """Count syllables in a single word with caching for performance."""
116
+ # Try using pronouncing library first
117
+ pronunciations = cached_phones_for_word(word.lower())
118
+ if pronunciations:
119
+ return pronouncing.syllable_count(pronunciations[0])
120
+
121
+ # Fallback method for words not in the pronouncing dictionary
122
+ vowels = "aeiouy"
123
+ word = word.lower()
124
+ count = 0
125
+ prev_is_vowel = False
126
+
127
+ for char in word:
128
+ is_vowel = char in vowels
129
+ if is_vowel and not prev_is_vowel:
130
+ count += 1
131
+ prev_is_vowel = is_vowel
132
+
133
+ # Handle special cases
134
+ if word.endswith('e') and not word.endswith('le'):
135
+ count -= 1
136
+ if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
137
+ count += 1
138
+ if count == 0:
139
+ count = 1
140
+
141
+ return count
142
+
143
+ @functools.lru_cache(maxsize=512)
144
+ def get_word_stress(word):
145
+ """Get the stress pattern for a word with improved fallback handling."""
146
+ pronunciations = cached_phones_for_word(word.lower())
147
+ if pronunciations:
148
+ return pronouncing.stresses(pronunciations[0])
149
+
150
+ # Enhanced fallback for words not in the dictionary
151
+ syllables = count_syllables_for_word(word)
152
+
153
+ # Common English stress patterns by word length
154
+ if syllables == 1:
155
+ return "1" # Single syllable words are stressed
156
+ elif syllables == 2:
157
+ # Most 2-syllable nouns and adjectives stress first syllable
158
+ # Common endings that indicate second-syllable stress
159
+ second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
160
+ if any(word.endswith(ending) for ending in second_syllable_stress):
161
+ return "01"
162
+ else:
163
+ return "10" # Default for 2-syllable words
164
+ elif syllables == 3:
165
+ # Common endings for specific stress patterns in 3-syllable words
166
+ if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
167
+ return "100" # First syllable stress
168
+ elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
169
+ return "010" # Middle syllable stress
170
+ else:
171
+ return "100" # Default for 3-syllable words
172
+ else:
173
+ # For longer words, use common English patterns
174
+ return "1" + "0" * (syllables - 1)
175
+
176
  # New function: Count syllables in text
177
  def count_syllables(text):
178
  """Count syllables in a given text using the pronouncing library."""
 
180
  syllable_count = 0
181
 
182
  for word in words:
183
+ syllable_count += count_syllables_for_word(word)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
  return syllable_count
186
 
 
347
  onset_envelope=combined_onset,
348
  sr=sr,
349
  tightness=100,
350
+ start_bpm=60 # Lower starting BPM helps find different time signatures
 
351
  )
352
  tempo_candidates.append(tempo2)
353
  beat_candidates.append(beats2)
 
529
  "phrases": phrases
530
  }
531
 
532
+ def detect_beats_and_subbeats(y, sr, subdivision=4):
533
+ """
534
+ Detect main beats and interpolate subbeats between consecutive beats.
535
+
536
+ Parameters:
537
+ y: Audio time series
538
+ sr: Sample rate
539
+ subdivision: Number of subdivisions between beats (default: 4 for quarter beats)
540
+
541
+ Returns:
542
+ Dictionary containing beat times, subbeat times, and tempo information
543
+ """
544
+ # Detect main beats using librosa
545
+ try:
546
+ tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
547
+ beat_times = librosa.frames_to_time(beat_frames, sr=sr)
548
+
549
+ # Convert numpy values to native Python types
550
+ if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number):
551
+ tempo = float(tempo)
552
+
553
+ # Convert beat_times to a list of floats
554
+ if isinstance(beat_times, np.ndarray):
555
+ beat_times = [float(t) for t in beat_times]
556
+ except Exception as e:
557
+ print(f"Error in beat detection: {e}")
558
+ # Default fallbacks
559
+ tempo = 120.0
560
+ beat_times = []
561
+
562
+ # Create subbeats by interpolating between main beats
563
+ subbeat_times = []
564
+
565
+ # Early return if no beats detected
566
+ if not beat_times or len(beat_times) < 2:
567
+ return {
568
+ "tempo": float(tempo) if tempo is not None else 120.0,
569
+ "beat_times": beat_times,
570
+ "subbeat_times": []
571
+ }
572
+
573
+ for i in range(len(beat_times) - 1):
574
+ # Get current and next beat time
575
+ try:
576
+ current_beat = float(beat_times[i])
577
+ next_beat = float(beat_times[i + 1])
578
+ except (IndexError, ValueError, TypeError):
579
+ continue
580
+
581
+ # Calculate time interval between beats
582
+ interval = (next_beat - current_beat) / subdivision
583
+
584
+ # Add the main beat
585
+ subbeat_times.append({
586
+ "time": float(current_beat),
587
+ "type": "main",
588
+ "strength": 1.0,
589
+ "beat_index": i
590
+ })
591
+
592
+ # Add subbeats
593
+ for j in range(1, subdivision):
594
+ subbeat_time = current_beat + j * interval
595
+ # Calculate strength based on position
596
+ # For 4/4 time, beat 3 is stronger than beats 2 and 4
597
+ if j == subdivision // 2 and subdivision == 4:
598
+ strength = 0.8 # Stronger subbeat (e.g., beat 3 in 4/4)
599
+ else:
600
+ strength = 0.5 # Weaker subbeat
601
+
602
+ subbeat_times.append({
603
+ "time": float(subbeat_time),
604
+ "type": "sub",
605
+ "strength": float(strength),
606
+ "beat_index": i,
607
+ "subbeat_index": j
608
+ })
609
+
610
+ # Add the last main beat
611
+ if beat_times:
612
+ try:
613
+ subbeat_times.append({
614
+ "time": float(beat_times[-1]),
615
+ "type": "main",
616
+ "strength": 1.0,
617
+ "beat_index": len(beat_times) - 1
618
+ })
619
+ except (ValueError, TypeError):
620
+ # Skip if conversion fails
621
+ pass
622
+
623
+ return {
624
+ "tempo": float(tempo) if tempo is not None else 120.0,
625
+ "beat_times": beat_times,
626
+ "subbeat_times": subbeat_times
627
+ }
628
+
629
+ def map_beats_to_seconds(subbeat_times, duration, fps=1.0):
630
+ """
631
+ Map beats and subbeats to second-level intervals.
632
+
633
+ Parameters:
634
+ subbeat_times: List of dictionaries containing beat and subbeat information
635
+ duration: Total duration of the audio in seconds
636
+ fps: Frames per second (default: 1.0 for one-second intervals)
637
+
638
+ Returns:
639
+ List of dictionaries, each containing beats within a time window
640
+ """
641
+ # Safety check for input parameters
642
+ if not isinstance(subbeat_times, list):
643
+ print("Warning: subbeat_times is not a list")
644
+ subbeat_times = []
645
+
646
+ try:
647
+ duration = float(duration)
648
+ except (ValueError, TypeError):
649
+ print("Warning: duration is not convertible to float, defaulting to 30")
650
+ duration = 30.0
651
+
652
+ # Calculate number of time windows
653
+ num_windows = int(duration * fps) + 1
654
+
655
+ # Initialize time windows
656
+ time_windows = []
657
+
658
+ for i in range(num_windows):
659
+ # Calculate window boundaries
660
+ start_time = i / fps
661
+ end_time = (i + 1) / fps
662
+
663
+ # Find beats and subbeats within this window
664
+ window_beats = []
665
+
666
+ for beat in subbeat_times:
667
+ # Safety check for beat object
668
+ if not isinstance(beat, dict):
669
+ continue
670
+
671
+ # Safely access beat time
672
+ try:
673
+ beat_time = float(beat.get("time", 0))
674
+ except (ValueError, TypeError):
675
+ continue
676
+
677
+ if start_time <= beat_time < end_time:
678
+ # Safely extract beat properties with defaults
679
+ beat_type = beat.get("type", "sub")
680
+ if not isinstance(beat_type, str):
681
+ beat_type = "sub"
682
+
683
+ # Safely handle strength
684
+ try:
685
+ strength = float(beat.get("strength", 0.5))
686
+ except (ValueError, TypeError):
687
+ strength = 0.5
688
+
689
+ # Add beat to this window
690
+ window_beats.append({
691
+ "time": beat_time,
692
+ "type": beat_type,
693
+ "strength": strength,
694
+ "relative_pos": (beat_time - start_time) / (1/fps) # Position within window (0-1)
695
+ })
696
+
697
+ # Add window to list
698
+ time_windows.append({
699
+ "second": i,
700
+ "start": start_time,
701
+ "end": end_time,
702
+ "beats": window_beats
703
+ })
704
+
705
+ return time_windows
706
+
707
+ def create_second_level_templates(sec_map, tempo, genre=None):
708
+ """
709
+ Create syllable templates for each second-level window.
710
+
711
+ Parameters:
712
+ sec_map: List of second-level time windows with beat information
713
+ tempo: Tempo in BPM
714
+ genre: Optional genre for genre-specific adjustments
715
+
716
+ Returns:
717
+ List of template strings, one for each second
718
+ """
719
+ # Helper function to map tempo to base syllable count
720
+ def tempo_to_syllable_base(tempo):
721
+ """Continuous function mapping tempo to syllable base count"""
722
+ # Sigmoid-like function that smoothly transitions between syllable counts
723
+ if tempo > 180:
724
+ return 1.0
725
+ elif tempo > 140:
726
+ return 1.0 + (180 - tempo) * 0.02 # Gradual increase 1.0 → 1.8
727
+ elif tempo > 100:
728
+ return 1.8 + (140 - tempo) * 0.01 # Gradual increase 1.8 → 2.2
729
+ elif tempo > 70:
730
+ return 2.2 + (100 - tempo) * 0.02 # Gradual increase 2.2 → 2.8
731
+ else:
732
+ return 2.8 + max(0, (70 - tempo) * 0.04) # Continue increasing for very slow tempos
733
+
734
+ # Calculate base syllable count from tempo
735
+ base_syllables = tempo_to_syllable_base(tempo)
736
+
737
+ # Apply genre-specific adjustments
738
+ genre_factor = 1.0
739
+ if genre:
740
+ genre_lower = genre.lower()
741
+ if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]):
742
+ genre_factor = 1.4 # Much higher syllable density for rap
743
+ elif any(term in genre_lower for term in ["folk", "country", "ballad"]):
744
+ genre_factor = 0.8 # Lower density for folk styles
745
+
746
+ # Create templates for each second
747
+ templates = []
748
+
749
+ for window in sec_map:
750
+ beats = window["beats"]
751
+
752
+ # If no beats in this second, create a default template
753
+ if not beats:
754
+ templates.append("w(0.5):1")
755
+ continue
756
+
757
+ # Create beat patterns for this second
758
+ beat_patterns = []
759
+
760
+ for beat in beats:
761
+ # Ensure we're dealing with a dictionary and that it has a "strength" key
762
+ if not isinstance(beat, dict):
763
+ continue # Skip this beat if it's not a dictionary
764
+
765
+ # Safely get beat type and strength
766
+ if "type" not in beat or not isinstance(beat["type"], str):
767
+ beat_type = "w" # Default to weak if type is missing or not a string
768
+ else:
769
+ beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w"
770
+
771
+ # Safely get strength value with fallback
772
+ try:
773
+ strength = float(beat.get("strength", 0.5))
774
+ except (ValueError, TypeError):
775
+ strength = 0.5 # Default if conversion fails
776
+
777
+ # Adjust syllable count based on beat type and strength
778
+ if beat_type == "S":
779
+ syllable_factor = 1.2 # More syllables for strong beats
780
+ elif beat_type == "m":
781
+ syllable_factor = 1.0 # Normal for medium beats
782
+ else:
783
+ syllable_factor = 0.8 # Fewer for weak beats
784
+
785
+ # Calculate final syllable count
786
+ syllable_count = base_syllables * syllable_factor * genre_factor
787
+
788
+ # Round to half-syllable precision
789
+ syllable_count = round(syllable_count * 2) / 2
790
+
791
+ # Ensure reasonable limits
792
+ syllable_count = max(0.5, min(4, syllable_count))
793
+
794
+ # Format with embedded strength value
795
+ strength_pct = round(strength * 100) / 100
796
+ beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}")
797
+
798
+ # Join patterns with dashes - ensure we have at least one pattern
799
+ if not beat_patterns:
800
+ templates.append("w(0.5):1") # Default if no valid patterns were created
801
+ else:
802
+ second_template = "-".join(beat_patterns)
803
+ templates.append(second_template)
804
+
805
+ return templates
806
+
807
  def detect_sections(y, sr):
808
  """
809
  Advanced detection of musical sections with adaptive segmentation and improved classification.
 
1085
  import numpy as np
1086
  from sklearn.cluster import KMeans
1087
 
1088
+ # Convert any numpy values to native Python types for safety - directly handle conversions
1089
+ # Process the dictionary to convert numpy values to Python native types
1090
+ if isinstance(beats_info, dict):
1091
+ processed_beats_info = {}
1092
+ for k, v in beats_info.items():
1093
+ if isinstance(v, np.ndarray):
1094
+ if v.size == 1:
1095
+ processed_beats_info[k] = float(v.item())
1096
+ else:
1097
+ processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
1098
+ elif isinstance(v, np.number):
1099
+ processed_beats_info[k] = float(v)
1100
+ elif isinstance(v, list):
1101
+ processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
1102
+ else:
1103
+ processed_beats_info[k] = v
1104
+ beats_info = processed_beats_info
1105
+
1106
  # Extract basic beat information
1107
  beat_times = beats_info.get("beat_times", [])
1108
  beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
 
1504
 
1505
  return "\n".join(output)
1506
 
1507
+ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None):
1508
  """
1509
  Enhanced verification of syllable counts and stress patterns with precise alignment analysis
1510
+ for both phrase-level and second-level templates.
1511
  """
1512
  import re
1513
  import pronouncing
 
1515
  import functools
1516
  from itertools import chain
1517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1518
  # Split lyrics into lines
1519
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1520
 
 
1730
  # If no matching template was found
1731
  verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")
1732
 
1733
+ # Add second-level verification if templates are provided
1734
+ if second_level_templates:
1735
+ verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n")
1736
+
1737
+ # Check each second against corresponding line
1738
+ for i, template in enumerate(second_level_templates):
1739
+ if i >= len(lines):
1740
+ break
1741
+
1742
+ line = lines[i]
1743
+
1744
+ # Skip section headers
1745
+ if line.startswith('[') and ']' in line:
1746
+ continue
1747
+
1748
+ actual_count = count_syllables(line)
1749
+
1750
+ # Parse template to get expected syllable count
1751
+ total_expected = 0
1752
+ beat_patterns = []
1753
+
1754
+ # Handle templates with beat patterns like "S(0.95):2-w(0.4):1"
1755
+ if isinstance(template, str) and "-" in template:
1756
+ for beat in template.split("-"):
1757
+ if ":" in beat:
1758
+ try:
1759
+ count_part = beat.split(":")[1]
1760
+ count = float(count_part)
1761
+ total_expected += count
1762
+
1763
+ # Extract beat type for alignment check
1764
+ beat_type = beat.split("(")[0] if "(" in beat else beat[0]
1765
+ beat_patterns.append((beat_type, count))
1766
+ except (IndexError, ValueError):
1767
+ pass
1768
+
1769
+ # Compare actual vs expected count
1770
+ if total_expected > 0:
1771
+ # Calculate adaptive threshold based on expected syllables
1772
+ expected_ratio = 0.2 # More strict at second level
1773
+ threshold = max(0.5, round(total_expected * expected_ratio))
1774
+
1775
+ difference = abs(actual_count - total_expected)
1776
+
1777
+ if difference > threshold:
1778
+ verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}")
1779
+ total_mismatch_count += 1
1780
+
1781
+ # Check for stress misalignment in this second
1782
+ words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
1783
+ word_analysis = []
1784
+ cumulative_syllables = 0
1785
+
1786
+ for word in words:
1787
+ syllable_count = count_syllables_for_word(word)
1788
+ stress_pattern = get_word_stress(word)
1789
+
1790
+ word_analysis.append({
1791
+ "word": word,
1792
+ "syllables": syllable_count,
1793
+ "stress_pattern": stress_pattern,
1794
+ "position": cumulative_syllables
1795
+ })
1796
+
1797
+ cumulative_syllables += syllable_count
1798
+
1799
+ # Check if stressed syllables align with strong beats
1800
+ if beat_patterns:
1801
+ strong_positions = []
1802
+ current_pos = 0
1803
+
1804
+ for beat_type, count in beat_patterns:
1805
+ if beat_type == "S":
1806
+ strong_positions.append(current_pos)
1807
+ current_pos += count
1808
+
1809
+ # Look for misalignments
1810
+ for pos in strong_positions:
1811
+ for word_info in word_analysis:
1812
+ word_start = word_info["position"]
1813
+ word_end = word_start + word_info["syllables"]
1814
+
1815
+ if word_start <= pos < word_end:
1816
+ # Check if a stressed syllable falls on this position
1817
+ syllable_in_word = int(pos - word_start)
1818
+ stress = word_info["stress_pattern"]
1819
+
1820
+ if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
1821
+ verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat")
1822
+ break
1823
+
1824
  # Only add detailed analysis if we have rhythm mismatches
1825
  if verification_notes:
1826
  lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
 
2018
  Returns:
2019
  Generated lyrics aligned with the rhythm patterns of the music
2020
  """
2021
+ # Ensure emotion_results is a dictionary with the expected structure
2022
+ if not isinstance(emotion_results, dict):
2023
+ emotion_results = {
2024
+ "emotion_analysis": {"primary_emotion": "Unknown"},
2025
+ "theme_analysis": {"primary_theme": "Unknown"},
2026
+ "rhythm_analysis": {"tempo": 0},
2027
+ "tonal_analysis": {"key": "Unknown", "mode": ""},
2028
+ "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
2029
+ }
2030
+
2031
+ # Extract emotion and theme data with safe defaults
2032
+ primary_emotion = emotion_results.get("emotion_analysis", {}).get("primary_emotion", "Unknown")
2033
+ primary_theme = emotion_results.get("theme_analysis", {}).get("primary_theme", "Unknown")
2034
+
2035
+ # Extract numeric values safely with fallbacks
2036
+ try:
2037
+ tempo = float(emotion_results.get("rhythm_analysis", {}).get("tempo", 0.0))
2038
+ except (ValueError, TypeError):
2039
+ tempo = 0.0
2040
+
2041
+ key = emotion_results.get("tonal_analysis", {}).get("key", "Unknown")
2042
+ mode = emotion_results.get("tonal_analysis", {}).get("mode", "")
2043
  # Extract emotion and theme data from analysis results
2044
  primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
2045
  primary_theme = emotion_results["theme_analysis"]["primary_theme"]
 
2062
  structure_visualization += f"Song Duration: {duration:.1f} seconds\n"
2063
  structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n"
2064
 
2065
+ # Add second-level template guidance if available
2066
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
2067
+ second_level_templates = song_structure["second_level"]["templates"]
2068
+
2069
+ # Create second-level guidance
2070
+ second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n"
2071
+ second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n"
2072
+
2073
+ # Format each second's template
2074
+ formatted_second_templates = []
2075
+ for i, template in enumerate(second_level_templates):
2076
+ if i < min(60, len(second_level_templates)): # Limit to 60 seconds to avoid overwhelming the LLM
2077
+ formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0)
2078
+ formatted_second_templates.append(f"Second {i+1}: {formatted_template}")
2079
+
2080
+ second_level_guidance += "\n".join(formatted_second_templates)
2081
+
2082
+ # Add critical instructions for second-level alignment
2083
+ second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern."
2084
+ second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics."
2085
+ second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on."
2086
+
2087
+ # Add to syllable guidance
2088
+ syllable_guidance = second_level_guidance
2089
+
2090
+ # Store templates for verification
2091
+ templates_for_verification = second_level_templates
2092
+
2093
+ elif song_structure:
2094
  # Try to use flexible structure if available
2095
  if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
2096
  flexible = song_structure["flexible_structure"]
 
2390
  # Store the syllable guidance for later use
2391
  syllable_guidance_text = syllable_guidance
2392
 
2393
+ # Determine if we should use traditional sections or second-level alignment
2394
+ use_sections = True
2395
+ use_second_level = False
2396
+
2397
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
2398
+ use_second_level = True
2399
+ # If we have second-level templates, prioritize those over traditional sections
2400
+ if len(song_structure["second_level"]["templates"]) > 0:
2401
+ use_sections = False
2402
+ elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
2403
  # If we have more than 4 segments, it's likely not a traditional song structure
2404
  if "segments" in song_structure["flexible_structure"]:
2405
  segments = song_structure["flexible_structure"]["segments"]
 
2407
  use_sections = False
2408
 
2409
  # Create enhanced prompt with better rhythm alignment instructions
2410
+ if use_second_level:
2411
+ # Second-level approach with per-second alignment
2412
+ content = f"""
2413
+ You are a talented songwriter who specializes in {genre} music.
2414
+ Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
2415
+
2416
+ IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
2417
+
2418
+ Music analysis has detected the following qualities in the music:
2419
+ - Tempo: {tempo:.1f} BPM
2420
+ - Key: {key} {mode}
2421
+ - Primary emotion: {primary_emotion}
2422
+ - Primary theme: {primary_theme}
2423
+
2424
+ {syllable_guidance}
2425
+
2426
+ CRITICAL INSTRUCTIONS FOR SECOND-LEVEL RHYTHM ALIGNMENT:
2427
+ 1. Each line of lyrics MUST correspond to ONE SECOND of audio.
2428
+ 2. The first line of your lyrics MUST match Second 1, the second line matches Second 2, etc.
2429
+ 3. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)
2430
+ 4. Natural word stress patterns must match the beat strength (strong words on strong beats)
2431
+ 5. For seconds with no beats, use a pause, breath, or continue a phrase from previous line
2432
+ 6. Pay attention to strength values in the pattern (higher values need stronger emphasis)
2433
+ 7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables
2434
+
2435
+ The lyrics should:
2436
+ - Perfectly capture the essence and style of {genre} music
2437
+ - Express the {primary_emotion} emotion and {primary_theme} theme
2438
+ - Match EXACTLY with the second-by-second rhythm patterns provided above
2439
+ - Be completely original
2440
+ - Create a coherent song that flows naturally despite the precise timing requirements
2441
+
2442
+ IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2443
+
2444
+ IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2445
+ where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2446
+ even if there are no rhythm issues. Include the following in your analysis:
2447
+ 1. How well each line matches its corresponding second's rhythm pattern
2448
+ 2. Where stressed syllables align with strong beats
2449
+ 3. Any potential misalignments or improvements
2450
+
2451
+ Your lyrics:
2452
+ """
2453
+ elif use_sections:
2454
  # Traditional approach with sections
2455
  content = f"""
2456
  You are a talented songwriter who specializes in {genre} music.
2457
  Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
2458
 
2459
+ IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
2460
+
2461
  Music analysis has detected the following qualities in the music:
2462
  - Tempo: {tempo:.1f} BPM
2463
  - Key: {key} {mode}
 
2475
  6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
2476
  7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
2477
 
 
 
 
 
 
 
 
 
2478
  The lyrics should:
2479
  - Perfectly capture the essence and style of {genre} music
2480
  - Express the {primary_emotion} emotion and {primary_theme} theme
 
2482
  - Be completely original
2483
  - Match the song duration of {duration:.1f} seconds
2484
 
2485
+ IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2486
+
2487
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2488
  where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2489
  even if there are no rhythm issues. Include the following in your analysis:
 
2499
  You are a talented songwriter who specializes in {genre} music.
2500
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
2501
 
2502
+ IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
2503
+
2504
  Music analysis has detected the following qualities:
2505
  - Tempo: {tempo:.1f} BPM
2506
  - Key: {key} {mode}
 
2518
  6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
2519
  7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
2520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2521
  The lyrics should:
2522
  - Perfectly capture the essence and style of {genre} music
2523
  - Express the {primary_emotion} emotion and {primary_theme} theme
 
2528
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2529
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2530
 
2531
+ IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2532
+
2533
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2534
  where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2535
  even if there are no rhythm issues. Include the following in your analysis:
 
2542
 
2543
  # Format as a chat message for the LLM
2544
  messages = [
2545
+ {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
2546
  {"role": "user", "content": content}
2547
  ]
2548
 
 
2559
  # Configure generation parameters based on model capability
2560
  generation_params = {
2561
  "do_sample": True,
2562
+ "temperature": 0.5, # Lower for more consistent and direct output
2563
+ "top_p": 0.85, # Slightly lower for more predictable responses
2564
+ "top_k": 50,
2565
  "repetition_penalty": 1.2,
2566
+ "max_new_tokens": 2048,
2567
+ "num_return_sequences": 1
2568
  }
2569
 
2570
+ # Add specific stop sequences to prevent excessive explanation
2571
+ if hasattr(llm_model.generation_config, "stopping_criteria"):
2572
+ thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"]
2573
+ for stop in thinking_stops:
2574
+ if stop not in llm_model.generation_config.stopping_criteria:
2575
+ llm_model.generation_config.stopping_criteria.append(stop)
2576
+
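+ # NOTE: generation_config.stopping_criteria is not a standard transformers
+ # attribute, so the appends above may be a silent no-op. A more reliable
+ # sketch (assuming the standard StoppingCriteria/StoppingCriteriaList API,
+ # with the result passed to generate via generation_params) would be:
+ #
+ # from transformers import StoppingCriteria, StoppingCriteriaList
+ #
+ # class StopOnSubstrings(StoppingCriteria):
+ #     """Stop generation once any of the given substrings appears in the decoded output."""
+ #     def __init__(self, stops, tokenizer):
+ #         self.stops = stops
+ #         self.tokenizer = tokenizer
+ #     def __call__(self, input_ids, scores, **kwargs):
+ #         text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+ #         return any(stop in text for stop in self.stops)
+ #
+ # generation_params["stopping_criteria"] = StoppingCriteriaList(
+ #     [StopOnSubstrings(thinking_stops, llm_tokenizer)]
+ # )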
2577
  # Generate output
2578
  generated_ids = llm_model.generate(
2579
  **model_inputs,
 
2583
  # Extract output tokens
2584
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
2585
 
2586
+ # Get the raw output and strip any thinking process
2587
  lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
2588
 
2589
+ # Enhanced thinking process removal - handle multiple formats
2590
+ # First check for standard thinking tags
2591
  if "<thinking>" in lyrics and "</thinking>" in lyrics:
2592
  lyrics = lyrics.split("</thinking>")[1].strip()
2593
 
2594
+ # Check for alternative thinking indicators with improved detection
2595
+ thinking_markers = [
2596
+ "<think>", "</think>",
2597
+ "[thinking]", "[/thinking]",
2598
+ "I'll think step by step:",
2599
+ "First, I need to understand",
2600
+ "Let me think about",
2601
+ "Let's tackle this query",
2602
+ "Okay, let's tackle this query",
2603
+ "First, I need to understand the requirements",
2604
+ "Looking at the rhythm patterns"
2605
+ ]
2606
+
2607
+ # First try to find clear section breaks
2608
  for marker in thinking_markers:
2609
  if marker in lyrics:
2610
  parts = lyrics.split(marker)
2611
  if len(parts) > 1:
2612
  lyrics = parts[-1].strip() # Take the last part after any thinking marker
2613
 
2614
+ # Look for long analytical sections followed by clear lyrics
2615
+ analytical_patterns = [
2616
+ "Let me analyze",
2617
+ "I need to understand",
2618
+ "The tempo is",
2619
+ "First, let's look at",
2620
+ "Wait, maybe",
2621
+ "Considering the emotional tone",
2622
+ "Starting with the first line",
2623
+ "Let me check the examples"
2624
+ ]
2625
+
2626
+ # Check if lyrics begin with any analytical patterns
2627
+ for pattern in analytical_patterns:
2628
+ if lyrics.startswith(pattern):
2629
+ # Try to find where the actual lyrics start - look for common lyrics markers
2630
+ lyrics_markers = [
2631
+ "\n\n[Verse",
2632
+ "\n\n[Chorus",
2633
+ "\n\nVerse",
2634
+ "\n\nChorus",
2635
+ "\n\n[Verse 1]",
2636
+ "\n\n[Intro]"
2637
+ ]
2638
+
2639
+ for marker in lyrics_markers:
2640
+ if marker in lyrics:
2641
+ lyrics = lyrics[lyrics.index(marker):].strip()
2642
+ break
2643
+
2644
+ # One last effort to clean up - if the text is very long and contains obvious thinking
2645
+ # before getting to actual lyrics, try to find a clear starting point
2646
+ if len(lyrics.split()) > 100 and "\n\n" in lyrics:
2647
+ paragraphs = lyrics.split("\n\n")
2648
+ for i, paragraph in enumerate(paragraphs):
2649
+ # Look for typical song structure indicators in a paragraph
2650
+ if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]):
2651
+ lyrics = "\n\n".join(paragraphs[i:])
2652
+ break
2653
+
2654
+ # Clean up any remaining thinking artifacts at the beginning
2655
+ lines = lyrics.split('\n')
2656
+ clean_lines = []
2657
+ lyrics_started = False
2658
+
2659
+ for line in lines:
2660
+ # Skip initial commentary/thinking lines until we hit what looks like lyrics
2661
+ if not lyrics_started:
2662
+ if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]):
2663
+ lyrics_started = True
2664
+
2665
+ if lyrics_started:
2666
+ clean_lines.append(line)
2667
+
2668
+ # Only use the cleaning logic if we found some actual lyrics
2669
+ if clean_lines:
2670
+ lyrics = '\n'.join(clean_lines)
2671
+
2672
+ # Special handling for second-level templates
2673
+ second_level_verification = None
2674
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
2675
+ second_level_verification = song_structure["second_level"]["templates"]
2676
+
2677
+ # Verify syllable counts with enhanced verification - pass second-level templates if available
2678
  if templates_for_verification:
2679
+ # Convert any NumPy values to native types before verification - directly handle conversions
2680
+ # Simple conversion for basic templates (non-recursive)
2681
+ if isinstance(templates_for_verification, list):
2682
+ safe_templates = []
2683
+ for template in templates_for_verification:
2684
+ if isinstance(template, dict):
2685
+ processed_template = {}
2686
+ for k, v in template.items():
2687
+ if isinstance(v, np.ndarray):
2688
+ if v.size == 1:
2689
+ processed_template[k] = float(v.item())
2690
+ else:
2691
+ processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v]
2692
+ elif isinstance(v, np.number):
2693
+ processed_template[k] = float(v)
2694
+ else:
2695
+ processed_template[k] = v
2696
+ safe_templates.append(processed_template)
2697
+ else:
2698
+ safe_templates.append(template)
2699
+ else:
2700
+ safe_templates = templates_for_verification
2701
+
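+ # The non-recursive conversion above could be replaced by a small recursive
+ # helper (a sketch, which would also handle nested dicts and lists):
+ #
+ # def to_native(obj):
+ #     if isinstance(obj, np.ndarray):
+ #         return float(obj.item()) if obj.size == 1 else [to_native(x) for x in obj]
+ #     if isinstance(obj, np.number):
+ #         return float(obj)
+ #     if isinstance(obj, dict):
+ #         return {k: to_native(v) for k, v in obj.items()}
+ #     if isinstance(obj, list):
+ #         return [to_native(x) for x in obj]
+ #     return obj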
2702
+ verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification)
2703
 
2704
  # Check if significant issues were detected
2705
  if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics:
 
2760
  refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip()
2761
 
2762
  # Verify the refined lyrics
2763
+ refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification)
2764
 
2765
  # Only use refined lyrics if they're better (fewer notes)
2766
  if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics:
 
2828
 
2829
  if len(templates_for_verification) > 30:
2830
  syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n"
2831
+
2832
+ # Add second-level analysis if available
2833
+ if second_level_verification:
2834
+ syllable_analysis += "\nSecond-Level Template Analysis:\n"
2835
+ for i, template in enumerate(second_level_verification):
2836
+ if i < min(len(second_level_verification), 30): # Limit to 30 seconds
2837
+ syllable_analysis += f"Second {i+1}: {template}\n"
2838
+
2839
+ if len(second_level_verification) > 30:
2840
+ syllable_analysis += f"... and {len(second_level_verification) - 30} more seconds\n"
2841
 
2842
  # Add structure visualization to syllable analysis
2843
  syllable_analysis += "\n" + structure_visualization
 
2893
  print(f"Error in genre classification: {str(e)}")
2894
  return f"Error in genre classification: {str(e)}", None, ast_results
2895
 
2896
+ # Initialize default values
2897
+ ast_results = ast_results if ast_results else []
2898
+ song_structure = None
2899
+ emotion_results = {
2900
+ "emotion_analysis": {"primary_emotion": "Unknown"},
2901
+ "theme_analysis": {"primary_theme": "Unknown"},
2902
+ "rhythm_analysis": {"tempo": 0},
2903
+ "tonal_analysis": {"key": "Unknown", "mode": ""},
2904
+ "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
2905
+ }
2906
+
2907
  print("Step 4/5: Analyzing music emotions, themes, and structure...")
2908
  # Analyze music emotions and themes
2909
  try:
2910
  emotion_results = music_analyzer.analyze_music(audio_file)
2911
  except Exception as e:
2912
  print(f"Error in emotion analysis: {str(e)}")
2913
+ # Continue with default emotion_results
2914
 
2915
  # Calculate detailed song structure for better lyrics alignment
2916
  try:
2917
+ # Load audio data
2918
  y, sr = load_audio(audio_file, SAMPLE_RATE)
2919
 
2920
  # Analyze beats and phrases for music-aligned lyrics
 
2995
  "end": segment_end
2996
  })
2997
 
2998
+ # Create flexible structure with the segments
2999
  flexible_structure = {
3000
  "beats": beats_info,
3001
  "segments": segments
3002
  }
3003
 
3004
+ # Create song structure object
3005
  song_structure = {
3006
  "beats": beats_info,
3007
  "sections": sections_info,
3008
+ "flexible_structure": flexible_structure,
3009
+ "syllables": []
3010
  }
3011
 
3012
  # Add syllable counts to each section
 
3013
  for section in sections_info:
3014
  # Create syllable templates for sections
3015
  section_beats_info = {
 
3045
 
3046
  song_structure["syllables"].append(section_info)
3047
 
3048
+ # Add second-level beat analysis
3049
+ try:
3050
+ # Get enhanced beat information with subbeats
3051
+ subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
3052
+
3053
+ # Map beats to second-level windows
3054
+ sec_map = map_beats_to_seconds(
3055
+ subbeat_info["subbeat_times"],
3056
+ audio_data["duration"]
3057
+ )
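+ # Assumed shape of each sec_map entry (map_beats_to_seconds is not shown in
+ # this diff hunk): {"beats": [{"type": "main", "strength": 0.9,
+ # "relative_pos": 0.25}, ...], ...}, where "relative_pos" is the beat's
+ # position within its one-second window (0-1), as consumed by the
+ # second-level visualization in format_complete_beat_timeline below.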
3058
+
3059
+ # Create second-level templates
3060
+ second_level_templates = create_second_level_templates(
3061
+ sec_map,
3062
+ subbeat_info["tempo"],
3063
+ top_genres[0][0] # Use top genre
3064
+ )
3065
+
3066
+ # Add to song structure
3067
+ song_structure["second_level"] = {
3068
+ "sec_map": sec_map,
3069
+ "templates": second_level_templates
3070
+ }
3071
+
3072
+ except Exception as e:
3073
+ print(f"Error in second-level beat analysis: {str(e)}")
3074
+ # Continue without second-level data
3075
+
3076
  except Exception as e:
3077
  print(f"Error analyzing song structure: {str(e)}")
3078
+ # Continue without song structure
 
3079
 
3080
  print("Step 5/5: Generating rhythmically aligned lyrics...")
3081
  # Generate lyrics based on top genre, emotion analysis, and song structure
 
3119
  print(error_msg)
3120
  return error_msg, None, []
3121
 
3122
+ def format_complete_beat_timeline(audio_file, lyrics=None):
3123
+ """Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation"""
3124
+ if audio_file is None:
3125
+ return "Please upload an audio file to see beat timeline."
3126
+
3127
+ try:
3128
+ # Extract audio data
3129
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
3130
+
3131
+ # Get beat information
3132
+ beats_info = detect_beats(y, sr)
3133
+
3134
+ def ensure_float(value):
3135
+ if isinstance(value, (np.ndarray, np.number)):
3136
+ return float(value)
3137
+ return value
3138
+
3139
+ # Format the timeline
3140
+ timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n"
3141
+ # Convert tempo to float before formatting if it's a numpy array
3142
+ tempo = ensure_float(beats_info['tempo'])
3143
+ timeline += f"Tempo: {tempo:.1f} BPM\n"
3144
+ timeline += f"Time Signature: {beats_info['time_signature']}/4\n"
3145
+ timeline += f"Total Beats: {beats_info['beat_count']}\n\n"
3146
+
3147
+ # Create a table header
3148
+ timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n"
3149
+ timeline += "|--------|----------|--------------|------------------|\n"
3150
+
3151
+ # Add beat-by-beat information - show ALL beats
3152
+ for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])):
3153
+ # Convert numpy values to Python float if needed
3154
+ time = ensure_float(time)
3155
+ strength = ensure_float(strength)
3156
+
3157
+ # Determine beat type based on strength
3158
+ if strength >= 0.8:
3159
+ beat_type = "STRONG"
3160
+ elif strength >= 0.5:
3161
+ beat_type = "medium"
3162
+ else:
3163
+ beat_type = "weak"
3164
+
3165
+ # Create beat pattern indicator
3166
+ if i % beats_info['time_signature'] == 0:
3167
+ pattern = "S" # Strong beat at start of measure
3168
+ elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3:
3169
+ pattern = "m" # Medium beat (3rd beat in 4/4)
3170
+ else:
3171
+ pattern = "w" # Weak beat
3172
+
3173
+ # Add row to table
3174
+ timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{1.5 if pattern=='S' else 1.0} |\n"
3175
+
3176
+ # No truncation - show all beats
3177
+
3178
+ # Add a visual timeline of beats
3179
+ timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n"
3180
+ timeline += "Each character represents 0.5 seconds. Beats are marked as:\n"
3181
+ timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
3182
+
3183
+ # Calculate total duration and create time markers
3184
+ if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
3185
+ # Get the max value safely
3186
+ max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']])
3187
+ total_duration = max_beat_time + 2 # Add 2 seconds of padding
3188
+ else:
3189
+ total_duration = 30 # Default duration if no beats found
3190
+
3191
+ time_markers = ""
3192
+ for i in range(0, int(total_duration) + 1, 5):
3193
+ time_markers += f"{i:<11}" # One marker per 5 seconds: 10 chars (2 per second) plus a separator space
3194
+ timeline += time_markers + " (seconds)\n"
3195
+
3196
+ # Create a ruler for easier time tracking
3197
+ ruler = ""
3198
+ for i in range(0, int(total_duration) + 1):
3199
+ if i % 5 == 0:
3200
+ ruler += "+"
3201
+ else:
3202
+ ruler += "-"
3203
+ ruler += "-" * 9 # Each second is 10 characters wide
3204
+ timeline += ruler + "\n"
3205
+
3206
+ # Create a visualization of beats with symbols
3207
+ beat_line = ["·"] * int(total_duration * 2) # 2 characters per second
3208
+
3209
+ for i, time in enumerate(beats_info['beat_times']):
3210
+ if i >= len(beats_info['beat_strengths']):
3211
+ break
3212
+
3213
+ # Convert to float if it's a numpy array
3214
+ time_val = ensure_float(time)
3215
+
3216
+ # Determine position in the timeline
3217
+ pos = int(time_val * 2) # Convert to position in the beat_line
3218
+ if pos >= len(beat_line):
3219
+ continue
3220
+
3221
+ # Determine beat type based on strength and position
3222
+ strength = beats_info['beat_strengths'][i]
3223
+ # Convert to float if it's a numpy array
3224
+ strength = ensure_float(strength)
3225
+
3226
+ if i % beats_info['time_signature'] == 0:
3227
+ beat_line[pos] = "S" # Strong beat at start of measure
3228
+ elif strength >= 0.8:
3229
+ beat_line[pos] = "S" # Strong beat
3230
+ elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3:
3231
+ beat_line[pos] = "m" # Medium beat (3rd beat in 4/4)
3232
+ elif strength >= 0.5:
3233
+ beat_line[pos] = "m" # Medium beat
3234
+ else:
3235
+ beat_line[pos] = "w" # Weak beat
3236
+
3237
+ # Format and add to timeline
3238
+ beat_visualization = ""
3239
+ for i in range(0, len(beat_line), 10):
3240
+ beat_visualization += "".join(beat_line[i:i+10])
3241
+ if i + 10 < len(beat_line):
3242
+ beat_visualization += " " # Add space every 5 seconds
3243
+ timeline += beat_visualization + "\n\n"
3244
+
3245
+ # Add measure markers
3246
+ timeline += "=== MEASURE MARKERS ===\n\n"
3247
+
3248
+ # Create a list to track measure start times
3249
+ measure_starts = []
3250
+ for i, time in enumerate(beats_info['beat_times']):
3251
+ if i % beats_info['time_signature'] == 0: # Start of measure
3252
+ # Convert to float if it's a numpy array
3253
+ time_val = ensure_float(time)
3254
+ measure_starts.append((i // beats_info['time_signature'] + 1, time_val))
3255
+
3256
+ # Format measure information
3257
+ if measure_starts:
3258
+ timeline += "| Measure # | Start Time | Duration |\n"
3259
+ timeline += "|-----------|------------|----------|\n"
3260
+
3261
+ for i in range(len(measure_starts)):
3262
+ measure_num, start_time = measure_starts[i]
3263
+
3264
+ # Calculate end time (start of next measure or end of song)
3265
+ if i < len(measure_starts) - 1:
3266
+ end_time = measure_starts[i+1][1]
3267
+ elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
3268
+ # Get the last beat time and convert to float if needed
3269
+ last_beat = beats_info['beat_times'][-1]
3270
+ end_time = ensure_float(last_beat)
3271
+ else:
3272
+ end_time = start_time + 2.0 # Default 2 seconds if no next measure
3273
+
3274
+ duration = end_time - start_time
3275
+
3276
+ timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n"
3277
+
3278
+ # No truncation - show all measures
3279
+
3280
+ # Add phrase information
3281
+ if 'phrases' in beats_info and beats_info['phrases']:
3282
+ timeline += "\n=== MUSICAL PHRASES ===\n\n"
3283
+ for i, phrase in enumerate(beats_info['phrases']):
3284
+ # Show all phrases, not just the first 10
3285
+ if not phrase:
3286
+ continue
3287
+
3288
+ # Safely check phrase indices
3289
+ if not (len(phrase) > 0 and len(beats_info['beat_times']) > 0):
3290
+ continue
3291
+
3292
+ start_beat = min(phrase[0], len(beats_info['beat_times'])-1)
3293
+ end_beat = min(phrase[-1], len(beats_info['beat_times'])-1)
3294
+
3295
+ # Convert to float if needed
3296
+ phrase_start = ensure_float(beats_info['beat_times'][start_beat])
3297
+ phrase_end = ensure_float(beats_info['beat_times'][end_beat])
3298
+
3299
+ timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n"
3300
+
3301
+ # Create syllable template for this phrase with simplified numpy handling
3302
+ phrase_beats = {
3303
+ "beat_times": [ensure_float(beats_info['beat_times'][j])
3304
+ for j in phrase if j < len(beats_info['beat_times'])],
3305
+ "beat_strengths": [ensure_float(beats_info['beat_strengths'][j])
3306
+ for j in phrase if j < len(beats_info['beat_strengths'])],
3307
+ "tempo": ensure_float(beats_info['tempo']),
3308
+ "time_signature": beats_info['time_signature'],
3309
+ "phrases": [list(range(len(phrase)))]
3310
+ }
3311
+
3312
+ template = create_flexible_syllable_templates(phrase_beats)
3313
+ timeline += f" Syllable Template: {template}\n"
3314
+
3315
+ # Create a visual representation of this phrase
3316
+ if phrase_start < total_duration and phrase_end < total_duration:
3317
+ # Create a timeline for this phrase
3318
+ phrase_visualization = ["·"] * int(total_duration * 2)
3319
+
3320
+ # Mark the phrase boundaries
3321
+ start_pos = int(phrase_start * 2)
3322
+ end_pos = int(phrase_end * 2)
3323
+
3324
+ if start_pos < len(phrase_visualization):
3325
+ phrase_visualization[start_pos] = "["
3326
+
3327
+ if end_pos < len(phrase_visualization):
3328
+ phrase_visualization[end_pos] = "]"
3329
+
3330
+ # Mark the beats in this phrase
3331
+ for j in phrase:
3332
+ if j < len(beats_info['beat_times']):
3333
+ beat_time = ensure_float(beats_info['beat_times'][j])
3334
+ beat_pos = int(beat_time * 2)
3335
+
3336
+ if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos:
3337
+ # Determine beat type
3338
+ if j % beats_info['time_signature'] == 0:
3339
+ phrase_visualization[beat_pos] = "S"
3340
+ elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2:
3341
+ phrase_visualization[beat_pos] = "m"
3342
+ else:
3343
+ phrase_visualization[beat_pos] = "w"
3344
+
3345
+ # Format and add visualization
3346
+ phrase_visual = ""
3347
+ for k in range(0, len(phrase_visualization), 10):
3348
+ phrase_visual += "".join(phrase_visualization[k:k+10])
3349
+ if k + 10 < len(phrase_visualization):
3350
+ phrase_visual += " "
3351
+
3352
+ timeline += f" Timeline: {phrase_visual}\n\n"
3353
+
3354
+ # Add second-level script display
3355
+ try:
3356
+ # Get second-level beat information
3357
+ subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
3358
+ duration = librosa.get_duration(y=y, sr=sr)
3359
+
3360
+ # Map to seconds
3361
+ sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration)
3362
+
3363
+ # Create templates
3364
+ templates = create_second_level_templates(sec_map, subbeat_info["tempo"])
3365
+
3366
+ # Add to timeline
3367
+ timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n"
3368
+ timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n"
3369
+ timeline += "| Second | Beat Pattern | Lyric Content |\n"
3370
+ timeline += "|--------|-------------|---------------|\n"
3371
+
3372
+ # Get clean lyrics (without analysis notes)
3373
+ clean_lyrics = lyrics
3374
+ if isinstance(lyrics, str):
3375
+ if "[Note: Rhythm Analysis]" in lyrics:
3376
+ clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
3377
+ elif "[Note: Potential rhythm mismatches" in lyrics:
3378
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
3379
+
3380
+ # Get lyric lines
3381
+ lines = clean_lyrics.strip().split('\n') if clean_lyrics else []
3382
+
3383
+ for i, template in enumerate(templates):
3384
+ # Get corresponding lyric line if available
3385
+ lyric = lines[i] if i < len(lines) else ""
3386
+ if lyric.startswith('[') and ']' in lyric:
3387
+ lyric = "" # Skip section headers
3388
+
3389
+ # Format nicely for display
3390
+ timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n"
3391
+
3392
+ # Add ASCII visualization of second-level beats
3393
+ timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n"
3394
+ timeline += "Each row represents ONE SECOND. Beat types:\n"
3395
+ timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
3396
+
3397
+ for i, window in enumerate(sec_map):
3398
+ beats = window["beats"]
3399
+
3400
+ # Create ASCII visualization
3401
+ beat_viz = ["·"] * 20 # 20 columns for visualization
3402
+
3403
+ for beat in beats:
3404
+ # Calculate position in visualization
3405
+ pos = int(beat["relative_pos"] * 19) # Map 0-1 to 0-19
3406
+ if 0 <= pos < len(beat_viz):
3407
+ # Set marker based on beat type
3408
+ if beat["type"] == "main":
3409
+ beat_viz[pos] = "S"
3410
+ elif beat["strength"] >= 0.7:
3411
+ beat_viz[pos] = "m"
3412
+ else:
3413
+ beat_viz[pos] = "w"
3414
+
3415
+ # Get corresponding lyric
3416
+ lyric = lines[i] if i < len(lines) else ""
3417
+ if lyric.startswith('[') and ']' in lyric:
3418
+ lyric = ""
3419
+
3420
+ # Format visualization line
3421
+ viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]"
3422
+ if lyric:
3423
+ viz_line += f" → {lyric[:40]}"
3424
+
3425
+ timeline += viz_line + "\n"
3426
+
3427
+ except Exception as e:
3428
+ timeline += f"\n[Error generating second-level analysis: {str(e)}]"
3429
+
3430
+ # Add a section showing alignment if lyrics were generated
3431
+ if lyrics and isinstance(lyrics, str):
3432
+ timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n"
3433
+ # Remove rhythm analysis notes from lyrics if present
3434
+ if "[Note:" in lyrics:
3435
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
3436
+ else:
3437
+ clean_lyrics = lyrics
3438
+
3439
+ lines = clean_lyrics.strip().split('\n')
3440
+
3441
+ # Show alignment for ALL lines, not just the first 10
3442
+ for i, line in enumerate(lines):
3443
+ if not line.strip() or line.startswith('['):
3444
+ continue
3445
+
3446
+ timeline += f"Line: \"{line}\"\n"
3447
+
3448
+ # Count syllables
3449
+ syllable_count = count_syllables(line)
3450
+ timeline += f" Syllables: {syllable_count}\n"
3451
+
3452
+ # Show ideal timing (if we have enough phrases)
3453
+ if 'phrases' in beats_info and beats_info['phrases'] and i < len(beats_info['phrases']):
3454
+ phrase = beats_info['phrases'][i]
3455
+ # Safely check if phrase has elements and indices are valid
3456
+ if phrase and len(phrase) > 0 and len(beats_info['beat_times']) > 0:
3457
+ start_beat = min(phrase[0], len(beats_info['beat_times'])-1)
3458
+ end_beat = min(phrase[-1], len(beats_info['beat_times'])-1)
3459
+
3460
+ start_time = ensure_float(beats_info['beat_times'][start_beat])
3461
+ end_time = ensure_float(beats_info['beat_times'][end_beat])
3462
+
3463
+ timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n"
3464
+
3465
+ # Create a visualization of syllable alignment
3466
+ timeline += " Alignment: "
3467
+
3468
+ # Create a timeline focused on just this phrase
3469
+ phrase_duration = end_time - start_time
3470
+ syllable_viz = []
3471
+
3472
+ # Initialize with beat markers for this phrase
3473
+ for j in phrase:
3474
+ if j < len(beats_info['beat_times']):
3475
+ beat_time = ensure_float(beats_info['beat_times'][j])
3476
+ # Handle edge case where phrase_duration is very small
3477
+ if phrase_duration > 0.001: # Avoid division by very small numbers
3478
+ relative_pos = int((beat_time - start_time) / phrase_duration * syllable_count)
3479
+ else:
3480
+ relative_pos = 0
3481
+
3482
+ while len(syllable_viz) <= relative_pos:
3483
+ syllable_viz.append("·")
3484
+
3485
+ if j % beats_info['time_signature'] == 0:
3486
+ syllable_viz[relative_pos] = "S"
3487
+ elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2:
3488
+ syllable_viz[relative_pos] = "m"
3489
+ else:
3490
+ syllable_viz[relative_pos] = "w"
3491
+
3492
+ # Fill in any gaps
3493
+ while len(syllable_viz) < syllable_count:
3494
+ syllable_viz.append("·")
3495
+
3496
+ # Trim if too long
3497
+ syllable_viz = syllable_viz[:syllable_count]
3498
+
3499
+ # Now map to the line
3500
+ timeline += "".join(syllable_viz) + "\n"
3501
+
3502
+ timeline += "\n"
3503
+
3504
+ # No truncation message for lines
3505
+
3506
+ return timeline
3507
+
3508
+ except Exception as e:
3509
+ print(f"Error generating complete beat timeline: {str(e)}")
3510
+ return f"Error generating complete beat timeline: {str(e)}"
3511
+
3512
+ def display_results(audio_file):
3513
+ """Process audio file and return formatted results for display in the UI."""
3514
+ # Default error response
3515
+ error_response = ("Please upload an audio file.",
3516
+ "No emotion analysis available.",
3517
+ "No audio classification available.",
3518
+ "No lyrics generated.",
3519
+ "No beat timeline available.")
3520
+
3521
+ if audio_file is None:
3522
+ return error_response
3523
+
3524
+ try:
3525
+ # Process audio and get results
3526
+ results = process_audio(audio_file)
3527
+
3528
+ # Check if we got an error message
3529
+ if isinstance(results, str) and "Error" in results:
3530
+ return results, *error_response[1:]
3531
+ elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
3532
+ return results[0], *error_response[1:]
3533
+
3534
+ # Extract results
3535
+ if isinstance(results, dict):
3536
+ # New format
3537
+ genre_results = results.get("genre_results", "Genre classification failed")
3538
+ lyrics = results.get("lyrics", "Lyrics generation failed")
3539
+ ast_results = results.get("ast_results", [])
3540
+ else:
3541
+ # Old tuple format
3542
+ genre_results, lyrics, ast_results = results
3543
+
3544
+ # Get clean lyrics (without analysis notes)
3545
+ clean_lyrics = lyrics
3546
+ if isinstance(lyrics, str):
3547
+ if "[Note: Rhythm Analysis]" in lyrics:
3548
+ clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
3549
+ elif "[Note: Potential rhythm mismatches" in lyrics:
3550
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
3551
+
3552
+ # Generate beat timeline - use the complete timeline function that shows all beats
3553
+ beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics)
3554
+
3555
+ # Format emotion analysis results
3556
+ emotion_text = "No emotion analysis available."
3557
+ try:
3558
+ emotion_results = music_analyzer.analyze_music(audio_file)
3559
+ emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
3560
+ f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
3561
+ f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
3562
+ f"Primary Theme: {emotion_results['summary']['primary_theme']}")
3563
+
3564
+ # Add song structure if available (without nested try/except)
3565
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
3566
+ beats_info = detect_beats(y, sr)
3567
+ sections_info = detect_sections(y, sr)
3568
+
3569
+ if sections_info:
3570
+ emotion_text += "\n\nSong Structure:\n"
3571
+ for section in sections_info:
3572
+ emotion_text += (f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s "
3573
+ f"({section['duration']:.1f}s)\n")
3574
+ except Exception as e:
3575
+ print(f"Error in emotion analysis: {str(e)}")
3576
+
3577
+ # Format audio classification results
3578
+ ast_text = "No valid audio classification results available."
3579
+ if ast_results and isinstance(ast_results, list):
3580
+ ast_text = "Audio Classification Results:\n"
3581
+ for result in ast_results[:5]: # Show top 5 results
3582
+ ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
3583
+
3584
+ # Return all results
3585
+ return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline
3586
+
3587
+ except Exception as e:
3588
+ error_msg = f"Error: {str(e)}"
3589
+ print(error_msg)
3590
+ return error_msg, *error_response[1:]
3591
+
3592
  # Create enhanced Gradio interface with tabs for better organization
3593
  with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3594
  gr.Markdown("# Music Genre Classifier & Lyrics Generator")
 
3629
  with gr.TabItem("Generated Lyrics"):
3630
  lyrics_output = gr.Textbox(label="Lyrics", lines=18)
3631
 
3632
+ with gr.TabItem("Beat & Syllable Timeline"):
3633
+ beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
3634
 
3635
  # Connect the button to the display function with updated outputs
3636
  submit_btn.click(
3637
  fn=display_results,
3638
  inputs=[audio_input],
3639
+ outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
3640
  )
3641
 
3642
  # Enhanced explanation of how the system works
 
3654
  - Strong and weak beats
3655
  - Natural phrase boundaries
3656
  - Time signature and tempo variations
3657
+ - Beat subdivisions (half and quarter beats)
3658
+
3659
+ 5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating a precise per-second template for lyric alignment.
3660
 
3661
+ 6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect:
3662
  - Beat stress patterns (strong, medium, weak)
3663
  - Appropriate syllable counts based on tempo
3664
  - Genre-specific rhythmic qualities
3665
+ - Half-beat and quarter-beat subdivisions
3666
 
3667
+ 7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
3668
  - Match the emotional quality of the music
3669
+ - Follow the precise syllable templates for each second
3670
  - Align stressed syllables with strong beats
3671
  - Maintain genre-appropriate style and themes
3672
 
3673
+ 8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
3674
  - Syllable count accuracy
3675
  - Stress alignment with strong beats
3676
  - Word stress patterns
3677
+ - Second-by-second alignment precision
3678
 
3679
+ 9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment.
3680
 
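+ For example, one line of the second-level script pairs each second of audio with its beat pattern and the lyric sung in that second, e.g. `Second 3: [S···w····m····w·····] → one second of lyric` (illustrative).
+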
3681
  This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
3682
  """)
utils.py CHANGED
@@ -37,54 +37,6 @@ def extract_mfcc_features(y, sr, n_mfcc=20):
37
  # Return a fallback feature vector if extraction fails
38
  return np.zeros(n_mfcc)
39
 
40
- def calculate_lyrics_length(duration, tempo=100, time_signature=4):
41
- """Calculate appropriate lyrics structure based on musical principles."""
42
- # Legacy behavior - simple calculation based on duration
43
- lines_count = max(4, int(duration / 10))
44
-
45
- # If only duration was provided (original usage), return just the integer
46
- if not isinstance(tempo, (int, float)) or not isinstance(time_signature, (int, float)):
47
- return lines_count
48
-
49
- # Enhanced calculation
50
- beats_per_minute = tempo
51
- beats_per_second = beats_per_minute / 60
52
- total_beats = duration * beats_per_second
53
- total_measures = total_beats / time_signature
54
-
55
- # Determine section distributions
56
- verse_lines = 0
57
- chorus_lines = 0
58
- bridge_lines = 0
59
-
60
- if lines_count <= 6:
61
- verse_lines = 2
62
- chorus_lines = 2
63
- elif lines_count <= 10:
64
- verse_lines = 3
65
- chorus_lines = 2
66
- else:
67
- verse_lines = 3
68
- chorus_lines = 2
69
- bridge_lines = 2
70
-
71
- # Create structured output
72
- song_structure = {
73
- "total_measures": int(total_measures),
74
- "lines_count": lines_count, # Include the original line count
75
- "sections": [
76
- {"type": "verse", "lines": verse_lines, "measures": int(total_measures * 0.4)},
77
- {"type": "chorus", "lines": chorus_lines, "measures": int(total_measures * 0.3)}
78
- ]
79
- }
80
-
81
- if bridge_lines > 0:
82
- song_structure["sections"].append(
83
- {"type": "bridge", "lines": bridge_lines, "measures": int(total_measures * 0.2)}
84
- )
85
-
86
- return song_structure
87
-
88
  def format_genre_results(top_genres):
89
  """Format genre classification results for display."""
90
  result = "Top Detected Genres:\n"
@@ -103,17 +55,3 @@ def ensure_cuda_availability():
103
  print("CUDA is not available. Using CPU for inference.")
104
  return cuda_available
105
 
106
- def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
107
- """Preprocess audio for model input (resample, pad/trim)."""
108
- # Resample if needed
109
- if sample_rate != target_sample_rate:
110
- waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)
111
-
112
- # Trim or pad to expected length
113
- if len(waveform) > max_length:
114
- waveform = waveform[:max_length]
115
- elif len(waveform) < max_length:
116
- padding = max_length - len(waveform)
117
- waveform = np.pad(waveform, (0, padding), 'constant')
118
-
119
- return waveform
 