diff --git "a/appp.py" "b/appp.py"
new file mode 100644
--- /dev/null
+++ "b/appp.py"
@@ -0,0 +1,3950 @@
+import os
+import io
+import gradio as gr
+import torch
+import numpy as np
+import re
+import pronouncing # Add this to requirements.txt for syllable counting
+import functools # Add this for lru_cache functionality
+from transformers import (
+ AutoModelForAudioClassification,
+ AutoFeatureExtractor,
+ AutoTokenizer,
+ pipeline,
+ AutoModelForCausalLM,
+ BitsAndBytesConfig
+)
+from huggingface_hub import login
+from utils import (
+ load_audio,
+ extract_audio_duration,
+ extract_mfcc_features,
+ format_genre_results,
+ ensure_cuda_availability
+)
+from emotionanalysis import MusicAnalyzer
+import librosa
+
+# Login to Hugging Face Hub if token is provided
+if "HF_TOKEN" in os.environ:
+ login(token=os.environ["HF_TOKEN"])
+
+# Constants
+GENRE_MODEL_NAME = "dima806/music_genres_classification"
+MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
+LLM_MODEL_NAME = "Qwen/Qwen3-14B"
+SAMPLE_RATE = 22050 # Standard sample rate for audio processing
+
+# Check CUDA availability (for informational purposes)
+CUDA_AVAILABLE = ensure_cuda_availability()
+
+# Create music detection pipeline
+print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}")
+try:
+ music_detector = pipeline(
+ "audio-classification",
+ model=MUSIC_DETECTION_MODEL,
+ device=0 if CUDA_AVAILABLE else -1
+ )
+ print("Successfully loaded music detection pipeline")
+except Exception as e:
+ print(f"Error creating music detection pipeline: {str(e)}")
+ # Fallback to manual loading
+ try:
+ music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL)
+ music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL)
+ print("Successfully loaded music detection model and feature extractor")
+ except Exception as e2:
+ print(f"Error loading music detection model components: {str(e2)}")
+ raise RuntimeError(f"Could not load music detection model: {str(e2)}")
+
+# Create genre classification pipeline
+print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
+try:
+ genre_classifier = pipeline(
+ "audio-classification",
+ model=GENRE_MODEL_NAME,
+ device=0 if CUDA_AVAILABLE else -1
+ )
+ print("Successfully loaded audio classification pipeline")
+except Exception as e:
+ print(f"Error creating pipeline: {str(e)}")
+ # Fallback to manual loading
+ try:
+ genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
+ genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
+ print("Successfully loaded audio classification model and feature extractor")
+ except Exception as e2:
+ print(f"Error loading model components: {str(e2)}")
+ raise RuntimeError(f"Could not load genre classification model: {str(e2)}")
+
+# Load LLM with appropriate quantization for T4 GPU
+bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_compute_dtype=torch.float16,
+)
+
+llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
+llm_model = AutoModelForCausalLM.from_pretrained(
+ LLM_MODEL_NAME,
+ device_map="auto",
+ quantization_config=bnb_config,
+ torch_dtype=torch.float16,
+)
+
+# Create LLM pipeline
+llm_pipeline = pipeline(
+ "text-generation",
+ model=llm_model,
+ tokenizer=llm_tokenizer,
+ max_new_tokens=512,
+)
+
+# Initialize music emotion analyzer
+music_analyzer = MusicAnalyzer()
+
+# Module-level pronunciation helpers, kept global so their lru_cache is shared
+# by verify_flexible_syllable_counts and the other syllable-analysis functions below
+@functools.lru_cache(maxsize=512)
+def cached_phones_for_word(word):
+ """Get word pronunciations with caching for better performance."""
+ return pronouncing.phones_for_word(word)
+
+@functools.lru_cache(maxsize=512)
+def count_syllables_for_word(word):
+ """Count syllables in a single word with caching for performance."""
+ # Try using pronouncing library first
+ pronunciations = cached_phones_for_word(word.lower())
+ if pronunciations:
+ return pronouncing.syllable_count(pronunciations[0])
+
+ # Fallback method for words not in the pronouncing dictionary
+ vowels = "aeiouy"
+ word = word.lower()
+ count = 0
+ prev_is_vowel = False
+
+ for char in word:
+ is_vowel = char in vowels
+ if is_vowel and not prev_is_vowel:
+ count += 1
+ prev_is_vowel = is_vowel
+
+ # Handle special cases
+ if word.endswith('e') and not word.endswith('le'):
+ count -= 1
+ if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
+ count += 1
+ if count == 0:
+ count = 1
+
+ return count
+
+@functools.lru_cache(maxsize=512)
+def get_word_stress(word):
+ """Get the stress pattern for a word with improved fallback handling."""
+ pronunciations = cached_phones_for_word(word.lower())
+ if pronunciations:
+ return pronouncing.stresses(pronunciations[0])
+
+ # Enhanced fallback for words not in the dictionary
+ syllables = count_syllables_for_word(word)
+
+ # Common English stress patterns by word length
+ if syllables == 1:
+ return "1" # Single syllable words are stressed
+ elif syllables == 2:
+ # Most 2-syllable nouns and adjectives stress first syllable
+ # Common endings that indicate second-syllable stress
+ second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
+ if any(word.endswith(ending) for ending in second_syllable_stress):
+ return "01"
+ else:
+ return "10" # Default for 2-syllable words
+ elif syllables == 3:
+ # Common endings for specific stress patterns in 3-syllable words
+ if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
+ return "100" # First syllable stress
+ elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
+ return "010" # Middle syllable stress
+ else:
+ return "100" # Default for 3-syllable words
+ else:
+ # For longer words, use common English patterns
+ return "1" + "0" * (syllables - 1)
+
+# Count syllables in a block of text
+def count_syllables(text):
+    """Count syllables in a text string using the cached per-word helper (with heuristic fallback)."""
+    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
+    return sum(count_syllables_for_word(word) for word in words)
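+
+# Illustrative sketch (not wired into the app's flow): how the syllable helpers
+# above behave on sample words. Exact counts assume the CMU pronouncing corpus is
+# installed; out-of-dictionary words fall back to the vowel-group heuristic.
+def _demo_syllable_helpers():
+    for word in ["remember", "love", "syllable"]:
+        print(word, count_syllables_for_word(word), get_word_stress(word))
+    print("total syllables:", count_syllables("Remember the love"))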
+
+def extract_audio_features(audio_file):
+ """Extract audio features from an audio file."""
+ try:
+ # Load the audio file using utility function
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
+
+ if y is None or sr is None:
+ raise ValueError("Failed to load audio data")
+
+ # Get audio duration in seconds
+ duration = extract_audio_duration(y, sr)
+
+ # Extract MFCCs for genre classification (may not be needed with the pipeline)
+ mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)
+
+ return {
+ "features": mfccs_mean,
+ "duration": duration,
+ "waveform": y,
+ "sample_rate": sr,
+ "path": audio_file # Keep path for the pipeline
+ }
+ except Exception as e:
+ print(f"Error extracting audio features: {str(e)}")
+ raise ValueError(f"Failed to extract audio features: {str(e)}")
+
+def classify_genre(audio_data):
+ """Classify the genre of the audio using the loaded model."""
+ try:
+ # First attempt: Try using the pipeline if available
+ if 'genre_classifier' in globals():
+ results = genre_classifier(audio_data["path"])
+ # Transform pipeline results to our expected format
+ top_genres = [(result["label"], result["score"]) for result in results[:3]]
+ return top_genres
+
+ # Second attempt: Use manually loaded model components
+ elif 'genre_processor' in globals() and 'genre_model' in globals():
+ # Process audio input with feature extractor
+ inputs = genre_processor(
+ audio_data["waveform"],
+ sampling_rate=audio_data["sample_rate"],
+ return_tensors="pt"
+ )
+
+ with torch.no_grad():
+ outputs = genre_model(**inputs)
+ predictions = outputs.logits.softmax(dim=-1)
+
+ # Get the top 3 genres
+ values, indices = torch.topk(predictions, 3)
+
+ # Map indices to genre labels
+ genre_labels = genre_model.config.id2label
+
+ top_genres = []
+ for i, (value, index) in enumerate(zip(values[0], indices[0])):
+ genre = genre_labels[index.item()]
+ confidence = value.item()
+ top_genres.append((genre, confidence))
+
+ return top_genres
+
+ else:
+ raise ValueError("No genre classification model available")
+
+ except Exception as e:
+ print(f"Error in genre classification: {str(e)}")
+ # Fallback: return a default genre if everything fails
+ return [("rock", 1.0)]
+
+def detect_music(audio_data):
+ """Detect if the audio is music using the MIT AST model."""
+ try:
+ # First attempt: Try using the pipeline if available
+ if 'music_detector' in globals():
+ results = music_detector(audio_data["path"])
+ # Look for music-related classes in the results
+ music_confidence = 0.0
+ for result in results:
+ label = result["label"].lower()
+ if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+ music_confidence = max(music_confidence, result["score"])
+ return music_confidence >= 0.2, results
+
+ # Second attempt: Use manually loaded model components
+ elif 'music_processor' in globals() and 'music_model' in globals():
+ # Process audio input with feature extractor
+ inputs = music_processor(
+ audio_data["waveform"],
+ sampling_rate=audio_data["sample_rate"],
+ return_tensors="pt"
+ )
+
+ with torch.no_grad():
+ outputs = music_model(**inputs)
+ predictions = outputs.logits.softmax(dim=-1)
+
+ # Get the top predictions
+ values, indices = torch.topk(predictions, 5)
+
+ # Map indices to labels
+ labels = music_model.config.id2label
+
+ # Check for music-related classes
+ music_confidence = 0.0
+ results = []
+
+ for i, (value, index) in enumerate(zip(values[0], indices[0])):
+ label = labels[index.item()].lower()
+ score = value.item()
+ results.append({"label": label, "score": score})
+
+ if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
+ music_confidence = max(music_confidence, score)
+
+ return music_confidence >= 0.2, results
+
+ else:
+ raise ValueError("No music detection model available")
+
+ except Exception as e:
+ print(f"Error in music detection: {str(e)}")
+ return False, []
+
+def detect_beats(y, sr):
+ """Enhanced beat detection with adaptive threshold analysis, improved time signature detection and scientific confidence metrics."""
+ # STEP 1: Improved pre-processing with robustness for quiet sections
+ # Apply a small floor to avoid division-by-zero issues
+ y = np.clip(y, 1e-10, None) # Prevent extreme quiet sections from causing NaN
+
+ # Separate harmonic and percussive components
+ y_harmonic, y_percussive = librosa.effects.hpss(y)
+
+ # Generate multiple onset envelopes with smoothing for stability
+ onset_env_full = librosa.onset.onset_strength(y=y, sr=sr)
+ onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr)
+
+ # Apply small smoothing to handle quiet sections
+ onset_env_full = np.maximum(onset_env_full, 1e-6) # Minimum threshold to avoid NaN
+ onset_env_perc = np.maximum(onset_env_perc, 1e-6)
+
+ # Create weighted combination
+ combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7
+
+ # STEP 2: Multi-strategy tempo and beat detection with confidence tracking
+ tempo_candidates = []
+ beat_candidates = []
+ consistency_metrics = []
+
+ # Strategy 1: Standard detection
+ tempo1, beats1 = librosa.beat.beat_track(
+ onset_envelope=combined_onset,
+ sr=sr,
+ tightness=100 # More sensitive tracking
+ )
+ tempo_candidates.append(tempo1)
+ beat_candidates.append(beats1)
+
+    # Calculate autocorrelation-based confidence for this tempo
+    ac = librosa.autocorrelate(combined_onset)
+    # Beat period in onset-envelope frames: (60 / BPM) * frames-per-second
+    frames_per_sec = len(combined_onset) / librosa.get_duration(y=y, sr=sr)
+    estimated_period = int((60.0 / tempo1) * frames_per_sec)
+ if estimated_period < len(ac) and estimated_period > 0:
+ # Measure peak height relative to surroundings
+ local_ac = ac[max(0, estimated_period-5):min(len(ac), estimated_period+6)]
+ if np.max(local_ac) > 0:
+ tempo1_confidence = ac[estimated_period] / np.max(local_ac)
+ else:
+ tempo1_confidence = 0.5
+ else:
+ tempo1_confidence = 0.5
+ consistency_metrics.append(tempo1_confidence)
+
+ # Strategy 2: Try with different tempo range for complex signatures
+ tempo2, beats2 = librosa.beat.beat_track(
+ onset_envelope=combined_onset,
+ sr=sr,
+ tightness=100,
+ start_bpm=60 # Lower starting BPM helps find different time signatures
+ )
+ tempo_candidates.append(tempo2)
+ beat_candidates.append(beats2)
+
+    # Calculate confidence for the second tempo estimate
+    estimated_period2 = int((60.0 / tempo2) * frames_per_sec)
+ if estimated_period2 < len(ac) and estimated_period2 > 0:
+ local_ac2 = ac[max(0, estimated_period2-5):min(len(ac), estimated_period2+6)]
+ if np.max(local_ac2) > 0:
+ tempo2_confidence = ac[estimated_period2] / np.max(local_ac2)
+ else:
+ tempo2_confidence = 0.5
+ else:
+ tempo2_confidence = 0.5
+ consistency_metrics.append(tempo2_confidence)
+
+ # Strategy 3: Use dynamic programming for beat tracking
+ try:
+ tempo3, beats3 = librosa.beat.beat_track(
+ onset_envelope=combined_onset,
+ sr=sr,
+ tightness=300, # Higher tightness for more structured detection
+ trim=False
+ )
+ tempo_candidates.append(tempo3)
+ beat_candidates.append(beats3)
+
+ # Calculate DP-based confidence
+ if len(beats3) > 1:
+ beat_times3 = librosa.frames_to_time(beats3, sr=sr)
+ intervals3 = np.diff(beat_times3)
+ tempo3_consistency = 1.0 / (1.0 + np.std(intervals3)/np.mean(intervals3)) if np.mean(intervals3) > 0 else 0.5
+ else:
+ tempo3_consistency = 0.5
+ consistency_metrics.append(tempo3_consistency)
+ except Exception:
+ # Skip if this approach fails
+ pass
+
+ # Select the best strategy based on improved consistency measurement
+ beat_consistency = []
+ for i, beats in enumerate(beat_candidates):
+ if len(beats) <= 1:
+ beat_consistency.append(0)
+ continue
+
+ times = librosa.frames_to_time(beats, sr=sr)
+ intervals = np.diff(times)
+
+ # Comprehensive consistency metrics with better statistical justification
+ if np.mean(intervals) > 0:
+ # Combine coefficient of variation with autocorrelation confidence
+ cv = np.std(intervals)/np.mean(intervals) # Lower is better
+
+ # Add adjustments for beat count reasonability
+ duration = librosa.get_duration(y=y, sr=sr)
+ expected_beats = duration * tempo_candidates[i] / 60
+ beats_ratio = min(len(beats) / expected_beats, expected_beats / len(beats)) if expected_beats > 0 else 0.5
+
+            # Weighted combination (weights deliberately sum to 1.2; only the
+            # relative ordering across strategies matters for selection)
+            consistency = (0.7 * (1.0 / (1.0 + cv))) + (0.3 * consistency_metrics[i]) + (0.2 * beats_ratio)
+ beat_consistency.append(consistency)
+ else:
+ beat_consistency.append(0)
+
+ # Select best model with scientific confidence calculation
+ if beat_consistency:
+ best_idx = np.argmax(beat_consistency)
+        best_confidence = min(beat_consistency[best_idx], 1.0) * 100 # Clamp to a 0-100% scale
+ else:
+ best_idx = 0
+ best_confidence = 50.0 # Default 50% confidence if no good metrics
+
+ tempo = tempo_candidates[best_idx]
+ beat_frames = beat_candidates[best_idx]
+
+ # Calculate beat entropy - scientific measure of beat pattern predictability
+ beat_entropy = 0.0
+ if len(beat_frames) > 2:
+ times = librosa.frames_to_time(beat_frames, sr=sr)
+ intervals = np.diff(times)
+
+ # Quantize intervals to detect patterns
+ if len(intervals) > 0 and np.std(intervals) > 0:
+ quantized = np.round(intervals / np.min(intervals))
+ # Count frequencies of each interval type
+ unique, counts = np.unique(quantized, return_counts=True)
+ probs = counts / np.sum(counts)
+ # Calculate Shannon entropy
+ beat_entropy = -np.sum(probs * np.log2(probs))
+
+ # STEP 3: Improved beat strength extraction
+ beat_times = librosa.frames_to_time(beat_frames, sr=sr)
+
+ # Vectorized extraction of beat strengths with improved error handling
+ beat_strengths = []
+ if len(beat_frames) > 0:
+ # Filter out beat frames that exceed the onset envelope length
+ valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)]
+ if valid_frames:
+ # Vectorized extraction with normalization for consistency
+ raw_strengths = combined_onset[valid_frames]
+
+ # Normalize strengths to [0,1] for scientific consistency
+ if np.max(raw_strengths) > 0:
+ normalized_strengths = raw_strengths / np.max(raw_strengths)
+ else:
+ normalized_strengths = np.ones_like(raw_strengths)
+
+ beat_strengths = normalized_strengths.tolist()
+
+        # Handle remaining beats with a gradual decay instead of constant values
+        if len(beat_times) > len(beat_strengths):
+            missing_count = len(beat_times) - len(beat_strengths)
+            # Extend using an exponential decay of the last known strength
+ if beat_strengths:
+ last_strength = beat_strengths[-1]
+ decay_factor = 0.9 # Gradual decay for trailing beats
+ beat_strengths.extend([last_strength * (decay_factor ** (i+1))
+ for i in range(missing_count)])
+ else:
+ beat_strengths = [1.0] * len(beat_times)
+ else:
+ beat_strengths = [1.0] * len(beat_times)
+ else:
+ beat_strengths = [1.0] * len(beat_times)
+
+ # STEP 4: Calculate intervals between beats
+ intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else []
+
+ # STEP 5: Improved time signature detection with scientific confidence
+ # Start with default assumption
+ time_signature = 4
+ time_sig_confidence = 70.0 # Default confidence
+
+ if len(beat_strengths) > 8:
+ # Use autocorrelation to find periodicity in beat strengths
+ if len(beat_strengths) > 4:
+ # Normalize beat strengths for better pattern detection
+ norm_strengths = np.array(beat_strengths)
+ if np.max(norm_strengths) > 0:
+ norm_strengths = norm_strengths / np.max(norm_strengths)
+
+ # Compute autocorrelation to find periodic patterns (N)
+ ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2)
+
+ # Find peaks in autocorrelation (indicates periodicity)
+ if len(ac) > 3: # Need enough data for peak picking
+ # Find peaks after lag 0
+ peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1)
+ peaks = peaks + 1 # Adjust for the removed lag 0
+
+ if len(peaks) > 0:
+ # Get the first significant peak position (cycle length N)
+ peak_idx = peaks[0]
+ N = peak_idx
+
+ # Calculate confidence based on peak prominence
+ if peak_idx < len(ac):
+ peak_height = ac[peak_idx]
+ local_prominence = peak_height / np.mean(ac[max(0, peak_idx-2):min(len(ac), peak_idx+3)])
+ time_sig_confidence = min(95, 60 + 35 * local_prominence) # Scale between 60-95%
+
+ # Map common cycle lengths to time signatures with improved musical theory
+ if N == 2:
+ time_signature = 2 # Clear binary meter (2/4, 2/2, etc.)
+ time_sig_confidence += 5 # Boost for simple meter
+ elif N == 3:
+ time_signature = 3 # Clear triple meter (3/4, 3/8, etc.)
+ time_sig_confidence += 5 # Boost for simple meter
+ elif 4 <= N <= 5:
+ time_signature = N # Direct mapping for common cases (4/4 or 5/4)
+ elif N == 6:
+ # Could be 6/8 (compound duple) or 3/4 with subdivisions
+ # Further analyze to distinguish
+ group_3_count = 0
+ for i in range(0, len(beat_strengths) - 6, 3):
+ if i + 2 < len(beat_strengths):
+ if beat_strengths[i] > beat_strengths[i+1] and beat_strengths[i] > beat_strengths[i+2]:
+ group_3_count += 1
+
+ group_2_count = 0
+ for i in range(0, len(beat_strengths) - 4, 2):
+ if i + 1 < len(beat_strengths):
+ if beat_strengths[i] > beat_strengths[i+1]:
+ group_2_count += 1
+
+ # Determine if it's grouped in 2s or 3s
+ time_signature = 3 if group_3_count > group_2_count else 6
+ elif N == 8:
+ time_signature = 4 # 4/4 with embellishments
+                    elif N == 7:
+                        time_signature = 7 # Odd meter such as 7/8 or 7/4 (N == 5 is already handled above)
+
+ # STEP 6: Enhanced phrase detection with adaptive thresholds and scientific justification
+ phrases = []
+ current_phrase = []
+
+ if len(beat_times) > 0:
+ # Calculate adaptive thresholds using percentiles instead of fixed ratios
+ if len(beat_strengths) > 4:
+ # Define thresholds based on distribution rather than fixed values
+ strong_threshold = np.percentile(beat_strengths, 75) # Top 25% are "strong" beats
+ # For gaps, calculate significant deviation using z-scores if we have intervals
+ if intervals:
+ mean_interval = np.mean(intervals)
+ std_interval = np.std(intervals)
+            # A significant gap is > 1.5 standard deviations above the mean
+            # (roughly the 93rd percentile under a normal assumption)
+ significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3
+ else:
+ significant_gap = 0
+ else:
+ # Fallback for limited data
+ strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0
+ significant_gap = 0
+
+ # Identify phrase boundaries with improved musical heuristics
+ for i in range(len(beat_times)):
+ current_phrase.append(i)
+
+ # Check for phrase boundary conditions
+ if i < len(beat_times) - 1:
+ # Strong beat coming up (using adaptive threshold)
+ is_stronger_next = False
+ if i < len(beat_strengths) - 1:
+ is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1
+
+ # Significant gap (using adaptive threshold)
+ is_longer_gap = False
+ if i < len(beat_times) - 1 and intervals and i < len(intervals):
+ is_longer_gap = intervals[i] > significant_gap
+
+ # Measure boundary based on time signature
+ is_measure_boundary = (i + 1) % time_signature == 0 and i > 0
+
+ # Check for significant dip in onset strength (phrase boundary often has reduced energy)
+ is_energy_dip = False
+ if i < len(beat_strengths) - 1:
+ onset_ratio = beat_strengths[i+1] / max(beat_strengths[i], 0.001)
+ is_energy_dip = onset_ratio < 0.6
+
+ # Combined decision for phrase boundary with scientific weighting
+ phrase_boundary_score = (
+ (1.5 if is_stronger_next else 0) +
+ (2.0 if is_longer_gap else 0) +
+ (1.0 if is_measure_boundary else 0) +
+ (0.5 if is_energy_dip else 0)
+ )
+
+ if (phrase_boundary_score >= 1.5 and len(current_phrase) >= 2) or \
+ (is_measure_boundary and len(current_phrase) >= time_signature):
+ phrases.append(current_phrase)
+ current_phrase = []
+
+ # Add the last phrase if not empty
+ if current_phrase and len(current_phrase) >= 2:
+ phrases.append(current_phrase)
+
+ # Ensure we have at least one phrase
+ if not phrases and len(beat_times) >= 2:
+ # Default to grouping by measures based on detected time signature
+ for i in range(0, len(beat_times), time_signature):
+ end = min(i + time_signature, len(beat_times))
+ if end - i >= 2: # Ensure at least 2 beats per phrase
+ phrases.append(list(range(i, end)))
+
+ # Calculate beat periodicity (average time between beats)
+ beat_periodicity = np.mean(intervals) if intervals else (60 / tempo)
+
+ # Return enhanced results with scientific confidence metrics
+ return {
+ "tempo": tempo,
+ "tempo_confidence": best_confidence, # New scientific confidence metric
+ "time_signature": time_signature,
+ "time_sig_confidence": time_sig_confidence, # New scientific confidence metric
+ "beat_frames": beat_frames,
+ "beat_times": beat_times,
+ "beat_count": len(beat_times),
+ "beat_strengths": beat_strengths,
+ "intervals": intervals,
+ "phrases": phrases,
+ "beat_periodicity": beat_periodicity,
+ "beat_entropy": beat_entropy # New scientific measure of rhythm complexity
+ }
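+
+# Hedged usage sketch for detect_beats: exercises the analysis on a synthetic
+# click track so no audio file is needed. Assumes librosa.clicks is available;
+# the detected tempo should land near 120 BPM for 0.5 s click spacing.
+def _demo_detect_beats():
+    sr_demo = 22050
+    clicks = librosa.clicks(times=np.arange(0, 8, 0.5), sr=sr_demo, length=sr_demo * 8)
+    info = detect_beats(clicks, sr_demo)
+    print(f"tempo={float(info['tempo']):.1f} BPM, "
+          f"time_signature={info['time_signature']}, beats={info['beat_count']}")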
+
+def detect_beats_and_subbeats(y, sr, subdivision=4):
+ """
+ Detect main beats and interpolate subbeats between consecutive beats.
+
+ Parameters:
+ y: Audio time series
+ sr: Sample rate
+ subdivision: Number of subdivisions between beats (default: 4 for quarter beats)
+
+ Returns:
+ Dictionary containing beat times, subbeat times, and tempo information
+ """
+ # Detect main beats using librosa
+ try:
+ tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
+ beat_times = librosa.frames_to_time(beat_frames, sr=sr)
+
+ # Convert numpy values to native Python types
+ if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number):
+ tempo = float(tempo)
+
+ # Convert beat_times to a list of floats
+ if isinstance(beat_times, np.ndarray):
+ beat_times = [float(t) for t in beat_times]
+ except Exception as e:
+ print(f"Error in beat detection: {e}")
+ # Default fallbacks
+ tempo = 120.0
+ beat_times = []
+
+ # Create subbeats by interpolating between main beats
+ subbeat_times = []
+
+ # Early return if no beats detected
+ if not beat_times or len(beat_times) < 2:
+ return {
+ "tempo": float(tempo) if tempo is not None else 120.0,
+ "beat_times": beat_times,
+ "subbeat_times": []
+ }
+
+ for i in range(len(beat_times) - 1):
+ # Get current and next beat time
+ try:
+ current_beat = float(beat_times[i])
+ next_beat = float(beat_times[i + 1])
+ except (IndexError, ValueError, TypeError):
+ continue
+
+ # Calculate time interval between beats
+ interval = (next_beat - current_beat) / subdivision
+
+ # Add the main beat
+ subbeat_times.append({
+ "time": float(current_beat),
+ "type": "main",
+ "strength": 1.0,
+ "beat_index": i
+ })
+
+ # Add subbeats
+ for j in range(1, subdivision):
+ subbeat_time = current_beat + j * interval
+            # Calculate strength based on position within the beat: the midpoint
+            # subdivision (the "and" of the beat) is stronger than the others
+            if j == subdivision // 2 and subdivision == 4:
+                strength = 0.8 # Stronger mid-beat subdivision
+ else:
+ strength = 0.5 # Weaker subbeat
+
+ subbeat_times.append({
+ "time": float(subbeat_time),
+ "type": "sub",
+ "strength": float(strength),
+ "beat_index": i,
+ "subbeat_index": j
+ })
+
+ # Add the last main beat
+ if beat_times:
+ try:
+ subbeat_times.append({
+ "time": float(beat_times[-1]),
+ "type": "main",
+ "strength": 1.0,
+ "beat_index": len(beat_times) - 1
+ })
+ except (ValueError, TypeError):
+ # Skip if conversion fails
+ pass
+
+ return {
+ "tempo": float(tempo) if tempo is not None else 120.0,
+ "beat_times": beat_times,
+ "subbeat_times": subbeat_times
+ }
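+
+# Hedged sketch for detect_beats_and_subbeats on a synthetic click track
+# (assumes librosa.clicks; subdivision=2 yields one interpolated "and" per beat).
+def _demo_subbeats():
+    sr_demo = 22050
+    y_demo = librosa.clicks(times=np.arange(0, 4, 0.5), sr=sr_demo, length=sr_demo * 4)
+    info = detect_beats_and_subbeats(y_demo, sr_demo, subdivision=2)
+    mains = [b for b in info["subbeat_times"] if b["type"] == "main"]
+    print(f"tempo={info['tempo']:.1f} BPM, main beats={len(mains)}, "
+          f"total grid points={len(info['subbeat_times'])}")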
+
+def map_beats_to_seconds(subbeat_times, duration, fps=1.0):
+ """
+ Map beats and subbeats to second-level intervals.
+
+ Parameters:
+ subbeat_times: List of dictionaries containing beat and subbeat information
+ duration: Total duration of the audio in seconds
+ fps: Frames per second (default: 1.0 for one-second intervals)
+
+ Returns:
+ List of dictionaries, each containing beats within a time window
+ """
+ # Safety check for input parameters
+ if not isinstance(subbeat_times, list):
+ print("Warning: subbeat_times is not a list")
+ subbeat_times = []
+
+ try:
+ duration = float(duration)
+ except (ValueError, TypeError):
+ print("Warning: duration is not convertible to float, defaulting to 30")
+ duration = 30.0
+
+ # Calculate number of time windows
+ num_windows = int(duration * fps) + 1
+
+ # Initialize time windows
+ time_windows = []
+
+ for i in range(num_windows):
+ # Calculate window boundaries
+ start_time = i / fps
+ end_time = (i + 1) / fps
+
+ # Find beats and subbeats within this window
+ window_beats = []
+
+ for beat in subbeat_times:
+ # Safety check for beat object
+ if not isinstance(beat, dict):
+ continue
+
+ # Safely access beat time
+ try:
+ beat_time = float(beat.get("time", 0))
+ except (ValueError, TypeError):
+ continue
+
+ if start_time <= beat_time < end_time:
+ # Safely extract beat properties with defaults
+ beat_type = beat.get("type", "sub")
+ if not isinstance(beat_type, str):
+ beat_type = "sub"
+
+ # Safely handle strength
+ try:
+ strength = float(beat.get("strength", 0.5))
+ except (ValueError, TypeError):
+ strength = 0.5
+
+ # Add beat to this window
+ window_beats.append({
+ "time": beat_time,
+ "type": beat_type,
+ "strength": strength,
+ "relative_pos": (beat_time - start_time) / (1/fps) # Position within window (0-1)
+ })
+
+ # Add window to list
+ time_windows.append({
+ "second": i,
+ "start": start_time,
+ "end": end_time,
+ "beats": window_beats
+ })
+
+ return time_windows
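+
+# Hedged sketch for map_beats_to_seconds using hand-written subbeat dicts
+# (hypothetical data, not app output) so the windowing logic is easy to eyeball.
+def _demo_second_mapping():
+    subbeats = [
+        {"time": 0.0, "type": "main", "strength": 1.0},
+        {"time": 0.5, "type": "sub", "strength": 0.5},
+        {"time": 1.0, "type": "main", "strength": 1.0},
+    ]
+    for window in map_beats_to_seconds(subbeats, duration=2.0):
+        print(window["second"], [b["time"] for b in window["beats"]])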
+
+def create_second_level_templates(sec_map, tempo, genre=None):
+ """
+ Create syllable templates for each second-level window.
+
+ Parameters:
+ sec_map: List of second-level time windows with beat information
+ tempo: Tempo in BPM
+ genre: Optional genre for genre-specific adjustments
+
+ Returns:
+ List of template strings, one for each second
+ """
+ # Helper function to map tempo to base syllable count
+ def tempo_to_syllable_base(tempo):
+ """Continuous function mapping tempo to syllable base count"""
+ # Sigmoid-like function that smoothly transitions between syllable counts
+ if tempo > 180:
+ return 1.0
+ elif tempo > 140:
+ return 1.0 + (180 - tempo) * 0.02 # Gradual increase 1.0 → 1.8
+ elif tempo > 100:
+ return 1.8 + (140 - tempo) * 0.01 # Gradual increase 1.8 → 2.2
+ elif tempo > 70:
+ return 2.2 + (100 - tempo) * 0.02 # Gradual increase 2.2 → 2.8
+ else:
+ return 2.8 + max(0, (70 - tempo) * 0.04) # Continue increasing for very slow tempos
+
+ # Calculate base syllable count from tempo
+ base_syllables = tempo_to_syllable_base(tempo)
+
+ # Apply genre-specific adjustments
+ genre_factor = 1.0
+ if genre:
+ genre_lower = genre.lower()
+ if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]):
+ genre_factor = 1.4 # Much higher syllable density for rap
+ elif any(term in genre_lower for term in ["folk", "country", "ballad"]):
+ genre_factor = 0.8 # Lower density for folk styles
+
+ # Create templates for each second
+ templates = []
+
+ for window in sec_map:
+ beats = window["beats"]
+
+ # If no beats in this second, create a default template
+ if not beats:
+ templates.append("w(0.5):1")
+ continue
+
+ # Create beat patterns for this second
+ beat_patterns = []
+
+ for beat in beats:
+ # Ensure we're dealing with a dictionary and that it has a "strength" key
+ if not isinstance(beat, dict):
+ continue # Skip this beat if it's not a dictionary
+
+ # Safely get beat type and strength
+ if "type" not in beat or not isinstance(beat["type"], str):
+ beat_type = "w" # Default to weak if type is missing or not a string
+ else:
+ beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w"
+
+ # Safely get strength value with fallback
+ try:
+ strength = float(beat.get("strength", 0.5))
+ except (ValueError, TypeError):
+ strength = 0.5 # Default if conversion fails
+
+ # Adjust syllable count based on beat type and strength
+ if beat_type == "S":
+ syllable_factor = 1.2 # More syllables for strong beats
+ elif beat_type == "m":
+ syllable_factor = 1.0 # Normal for medium beats
+ else:
+ syllable_factor = 0.8 # Fewer for weak beats
+
+ # Calculate final syllable count
+ syllable_count = base_syllables * syllable_factor * genre_factor
+
+ # Round to half-syllable precision
+ syllable_count = round(syllable_count * 2) / 2
+
+ # Ensure reasonable limits
+ syllable_count = max(0.5, min(4, syllable_count))
+
+ # Format with embedded strength value
+ strength_pct = round(strength * 100) / 100
+ beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}")
+
+ # Join patterns with dashes - ensure we have at least one pattern
+ if not beat_patterns:
+ templates.append("w(0.5):1") # Default if no valid patterns were created
+ else:
+ second_template = "-".join(beat_patterns)
+ templates.append(second_template)
+
+ return templates
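+
+# Hedged sketch for create_second_level_templates: the window list below is
+# hypothetical; in the app it comes from map_beats_to_seconds above.
+def _demo_second_templates():
+    windows = [
+        {"second": 0, "start": 0.0, "end": 1.0,
+         "beats": [{"time": 0.0, "type": "main", "strength": 1.0, "relative_pos": 0.0}]},
+        {"second": 1, "start": 1.0, "end": 2.0, "beats": []},
+    ]
+    # Expected shape: one template string per second, e.g. ["S(1.0):2.5", "w(0.5):1"]
+    print(create_second_level_templates(windows, tempo=120, genre="rock"))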
+
+def detect_sections(y, sr):
+ """
+ Detect musical segments without classifying them by type (verse, chorus, etc.).
+
+ Parameters:
+ y: Audio time series
+ sr: Sample rate
+
+ Returns:
+ A list of section dictionaries with start time, end time, and duration
+ """
+ # Step 1: Extract rich feature set for comprehensive analysis
+ # ----------------------------------------------------------------------
+ hop_length = 512 # Common hop length for feature extraction
+
+ # Spectral features
+ S = np.abs(librosa.stft(y, hop_length=hop_length))
+ contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
+
+ # Harmonic features with CQT-based chroma (better for harmonic analysis)
+ chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)
+
+ # Timbral features
+ mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)
+
+ # Energy features
+ rms = librosa.feature.rms(y=y, hop_length=hop_length)
+
+ # Harmonic-percussive source separation for better rhythm analysis
+ y_harmonic, y_percussive = librosa.effects.hpss(y)
+
+ # Step 2: Adaptive determination of segment count based on song complexity
+ # ----------------------------------------------------------------------
+ duration = librosa.get_duration(y=y, sr=sr)
+
+ # Feature preparation for adaptive segmentation
+ # Stack features with proper normalization (addressing the scale issue)
+ feature_stack = np.vstack([
+ librosa.util.normalize(contrast),
+ librosa.util.normalize(chroma),
+ librosa.util.normalize(mfcc),
+ librosa.util.normalize(rms)
+ ])
+
+ # Transpose to get time as first dimension
+ feature_matrix = feature_stack.T
+
+ # Step 3: Feature fusion using dimensionality reduction
+ # ----------------------------------------------------------------------
+ from sklearn.decomposition import PCA
+
+ # Handle very short audio files
+ n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1])
+
+ if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0:
+ try:
+ pca = PCA(n_components=n_components)
+ reduced_features = pca.fit_transform(feature_matrix)
+ except Exception as e:
+ print(f"PCA failed, falling back to original features: {e}")
+ # Fallback to simpler approach if PCA fails
+ reduced_features = feature_matrix
+ else:
+ # Not enough data for PCA
+ reduced_features = feature_matrix
+
+ # Step 4: Adaptive determination of optimal segment count
+ # ----------------------------------------------------------------------
+
+ # Initialize range of segment counts to try
+ min_segments = max(2, int(duration / 60)) # At least 2 segments, roughly 1 per minute
+ max_segments = min(10, int(duration / 20)) # At most 10 segments, roughly 1 per 20 seconds
+
+ # Ensure reasonable bounds
+ min_segments = max(2, min(min_segments, 4))
+ max_segments = max(min_segments + 1, min(max_segments, 8))
+
+ # Try different segment counts and evaluate with silhouette score
+ best_segments = min_segments
+ best_score = -1
+
+ from sklearn.metrics import silhouette_score
+ from sklearn.cluster import AgglomerativeClustering
+
+ # Only do this analysis if we have enough data
+ if reduced_features.shape[0] > max_segments:
+ for n_segments in range(min_segments, max_segments + 1):
+ try:
+ # Perform agglomerative clustering
+ clustering = AgglomerativeClustering(n_clusters=n_segments)
+ labels = clustering.fit_predict(reduced_features)
+
+ # Calculate silhouette score if we have enough samples
+ if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1:
+ score = silhouette_score(reduced_features, labels)
+
+ if score > best_score:
+ best_score = score
+ best_segments = n_segments
+ except Exception as e:
+ print(f"Clustering with {n_segments} segments failed: {e}")
+ continue
+
+ # Use the optimal segment count for final segmentation
+ n_segments = best_segments
+
+ # Step 5: Final segmentation using the optimal segment count
+ # ----------------------------------------------------------------------
+
+ # Method 1: Use agglomerative clustering on the reduced features
+ try:
+ clustering = AgglomerativeClustering(n_clusters=n_segments)
+ labels = clustering.fit_predict(reduced_features)
+
+ # Convert cluster labels to boundaries by finding where labels change
+ boundaries = [0] # Start with the beginning
+
+ for i in range(1, len(labels)):
+ if labels[i] != labels[i-1]:
+ boundaries.append(i)
+
+ boundaries.append(len(labels)) # Add the end
+
+ # Convert to frames
+ bounds_frames = np.array(boundaries)
+
+ except Exception as e:
+ print(f"Final clustering failed: {e}")
+ # Fallback to librosa's agglomerative clustering on original features
+ bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments)
+
+ # Step 6: Convert boundaries to time and create sections
+ # ----------------------------------------------------------------------
+ bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length)
+
+ # Create sections from the boundaries
+ sections = []
+
+ for i in range(len(bounds_times) - 1):
+ start = bounds_times[i]
+ end = bounds_times[i+1]
+        segment_duration = end - start
+        
+        # Skip extremely short sections
+        if segment_duration < 4 and i > 0 and i < len(bounds_times) - 2:
+            continue
+        
+        # Add section to the list (without classifying it as verse/chorus/etc.)
+        sections.append({
+            "type": "segment", # Generic type instead of verse/chorus/etc.
+            "start": start,
+            "end": end,
+            "duration": segment_duration
+ })
+
+ # Filter out any remaining extremely short sections
+ sections = [s for s in sections if s["duration"] >= 5]
+
+ return sections
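+
+# Hedged sketch for detect_sections on a synthetic two-part signal; real music
+# yields far more meaningful boundaries, this only shows the call end to end.
+def _demo_detect_sections():
+    sr_demo = 22050
+    t = np.linspace(0, 30, 30 * sr_demo, endpoint=False)
+    half = len(t) // 2
+    # Two timbrally distinct halves: a low tone, then a brighter mix
+    y_demo = np.concatenate([
+        np.sin(2 * np.pi * 110 * t[:half]),
+        0.5 * np.sin(2 * np.pi * 440 * t[half:]) + 0.3 * np.sin(2 * np.pi * 880 * t[half:]),
+    ])
+    for s in detect_sections(y_demo, sr_demo):
+        print(f"{s['start']:.1f}s → {s['end']:.1f}s ({s['duration']:.1f}s)")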
+
+def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'):
+ """
+ Create enhanced syllable templates based on beat patterns with improved musical intelligence.
+
+ Parameters:
+ beats_info: Dictionary containing beat analysis data
+ genre: Optional genre to influence template creation
+ phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation
+
+ Returns:
+ String of syllable templates with embedded strength values and flexible timing
+ """
+    from sklearn.cluster import KMeans # numpy (np) is already imported at module level
+
+ # Convert any numpy values to native Python types for safety - directly handle conversions
+ # Process the dictionary to convert numpy values to Python native types
+ if isinstance(beats_info, dict):
+ processed_beats_info = {}
+ for k, v in beats_info.items():
+ if isinstance(v, np.ndarray):
+ if v.size == 1:
+ processed_beats_info[k] = float(v.item())
+ else:
+ processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
+ elif isinstance(v, np.number):
+ processed_beats_info[k] = float(v)
+ elif isinstance(v, list):
+ processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
+ else:
+ processed_beats_info[k] = v
+ beats_info = processed_beats_info
+
+ # Extract basic beat information
+ beat_times = beats_info.get("beat_times", [])
+ beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
+ tempo = beats_info.get("tempo", 120)
+ time_signature = beats_info.get("time_signature", 4)
+
+ # Early return for insufficient data
+ if len(beat_times) < 2:
+ return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" # Default fallback pattern
+
+ # Step 1: Improved adaptive thresholding using k-means clustering
+ # ----------------------------------------------------------------------
+ if len(beat_strengths) >= 6: # Need enough data points for clustering
+ # Reshape for k-means
+ X = np.array(beat_strengths).reshape(-1, 1)
+
+ # Use k-means with 3 clusters for Strong, Medium, Weak classification
+ kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)
+
+ # Find the centroid values and sort them
+ centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_])
+
+ # Map to thresholds (using the midpoints between centroids)
+ if len(centroids) >= 3:
+ medium_threshold = (centroids[0] + centroids[1]) / 2
+ strong_threshold = (centroids[1] + centroids[2]) / 2
+ else:
+ # Fallback if clustering doesn't work well
+ medium_threshold = np.percentile(beat_strengths, 33)
+ strong_threshold = np.percentile(beat_strengths, 66)
+ else:
+ # For limited data, use percentile-based approach
+ medium_threshold = np.percentile(beat_strengths, 33)
+ strong_threshold = np.percentile(beat_strengths, 66)
+
+ # Step 2: Create or refine phrases based on mode
+ # ----------------------------------------------------------------------
+ phrases = beats_info.get("phrases", [])
+
+ if phrase_mode == 'auto' or not phrases:
+ # Create phrases based on time signature and beat strengths
+ phrases = []
+ current_phrase = []
+
+ for i in range(len(beat_times)):
+ current_phrase.append(i)
+
+ # Check for natural phrase endings
+ if (i + 1) % time_signature == 0 or i == len(beat_times) - 1:
+ if len(current_phrase) >= 2: # Ensure minimum phrase length
+ phrases.append(current_phrase)
+ current_phrase = []
+
+ # Add any remaining beats
+ if current_phrase and len(current_phrase) >= 2:
+ phrases.append(current_phrase)
+
+ # Step 3: Improved continuous tempo-to-syllable mapping function
+ # ----------------------------------------------------------------------
+    def tempo_to_syllable_base(tempo):
+        """Continuous logistic mapping from tempo to base syllable count."""
+        # Decreasing logistic curve L / (1 + e^(k(x - x0))); clamping the output
+        # (rather than branching on tempo) keeps the mapping continuous at the
+        # extremes instead of jumping to fixed values
+        L = 3.5   # Upper limit (very slow tempos)
+        k = 0.04  # Steepness of the curve
+        x0 = 120  # Inflection point at a moderate tempo
+        return float(np.clip(L / (1 + np.exp(k * (tempo - x0))), 0.8, L))
+
+ # Step 4: Generate enhanced templates with flexible timing
+ # ----------------------------------------------------------------------
+ syllable_templates = []
+
+ for phrase in phrases:
+ # Skip empty phrases
+ if not phrase:
+ continue
+
+ # Extract beat strengths for this phrase
+ phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)]
+ if not phrase_strengths:
+ phrase_strengths = [1.0] * len(phrase)
+
+ # Apply improved adaptive thresholding for stress pattern detection
+ stress_pattern = []
+ for i, strength in enumerate(phrase_strengths):
+ # Consider both strength and metrical position with improved weighting
+ metrical_position = i % time_signature
+
+ # Apply improved position boosting based on musical theory
+ # In common time signatures, first beat gets strong emphasis,
+ # third beat gets moderate emphasis (in 4/4)
+ if metrical_position == 0: # Downbeat (first beat)
+ position_boost = 0.18 # Stronger boost for downbeats
+ elif time_signature == 4 and metrical_position == 2: # Third beat in 4/4
+ position_boost = 0.1 # Moderate boost for third beat
+ elif time_signature == 3 and metrical_position == 1: # Second beat in 3/4
+ position_boost = 0.05 # Slight boost for second beat in 3/4
+ else:
+ position_boost = 0 # No boost for other beats
+
+ effective_strength = strength + position_boost
+
+ if effective_strength >= strong_threshold:
+ stress_pattern.append(("S", effective_strength)) # Strong beat with strength
+ elif effective_strength >= medium_threshold:
+ stress_pattern.append(("m", effective_strength)) # Medium beat with strength
+ else:
+ stress_pattern.append(("w", effective_strength)) # Weak beat with strength
+
+ # Step 5: Calculate syllable counts using improved continuous function
+ # ----------------------------------------------------------------------
+ detailed_template = []
+
+ for i, (stress_type, strength) in enumerate(stress_pattern):
+ # Get base syllable count from tempo with more nuanced mapping
+ base_syllables = tempo_to_syllable_base(tempo)
+
+ # Adjust based on both stress type AND metrical position
+ metrical_position = i % time_signature
+ position_factor = 1.2 if metrical_position == 0 else 1.0
+
+ # More nuanced adjustment based on stress type
+ if stress_type == "S":
+ syllable_factor = 1.2 * position_factor # Emphasize strong beats more
+ elif stress_type == "m":
+ syllable_factor = 1.0 * position_factor # Medium beats
+ else:
+ syllable_factor = 0.8 # Weak beats
+
+ # Apply improved genre-specific adjustments with more granular factors
+ genre_factor = 1.0
+ if genre:
+ genre = genre.lower()
+ if "rap" in genre or "hip" in genre:
+ genre_factor = 1.5 # Significantly higher syllable density for rap
+ elif "folk" in genre or "country" in genre or "ballad" in genre:
+ genre_factor = 0.7 # Lower density for folk styles
+ elif "metal" in genre or "rock" in genre:
+ genre_factor = 1.1 # Slightly higher density for rock/metal
+ elif "jazz" in genre:
+ genre_factor = 1.2 # Higher density for jazz (complex rhythms)
+ elif "classical" in genre:
+ genre_factor = 0.9 # More moderate for classical
+
+ # Calculate adjusted syllable count with scientific weighting
+ raw_count = base_syllables * syllable_factor * genre_factor
+
+ # Use more precise rounding that preserves subtle differences
+ # Round to quarters rather than halves for more precision
+ rounded_count = round(raw_count * 4) / 4
+
+ # Limit to reasonable range (0.5 to 4) with improved bounds
+ syllable_count = max(0.5, min(4, rounded_count))
+
+ # Format with embedded strength value for reversibility
+            # Round strength to two decimals for embedding in the template
+ strength_pct = round(strength * 100) / 100
+ detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
+
+ # Join beat templates for this phrase
+ phrase_template = "-".join(detailed_template)
+ syllable_templates.append(phrase_template)
+
+ # Step 6: Ensure valid output with improved defaults
+ # ----------------------------------------------------------------------
+ if not syllable_templates:
+ # Create sensible defaults based on time signature that reflect musical theory
+ if time_signature == 3: # 3/4 time - waltz pattern
+ syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] # 3/4 default
+ elif time_signature == 2: # 2/4 time - march pattern
+ syllable_templates = ["S(0.95):1.5-w(0.4):1"] # 2/4 default
+ else: # 4/4 time - common time
+ syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] # 4/4 default
+
+ # Join all phrase templates with the original separator for compatibility
+ return "|".join(syllable_templates)
+
+def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10,
+ structured_output=False, beat_types=None):
+ """
+ Convert technical syllable templates into clear, human-readable instructions with
+ enhanced flexibility and customization options.
+
+ Parameters:
+ syllable_templates: String or list of templates
+ arrow: Symbol to use between beats (default: "→")
+ line_wrap: Number of beats before automatic line wrapping (0 = no wrapping)
+ structured_output: If True, return structured data instead of text
+ beat_types: Custom mapping for beat types (default: None, uses standard mapping)
+
+ Returns:
+ Human-readable instructions or structured data depending on parameters
+ """
+ if not syllable_templates:
+ return {} if structured_output else ""
+
+ # Define standard beat type mapping (extensible)
+ default_beat_types = {
+ "S": {"name": "STRONG", "description": "stressed syllable"},
+ "m": {"name": "medium", "description": "medium-stressed syllable"},
+ "w": {"name": "weak", "description": "unstressed syllable"},
+ "X": {"name": "EXTRA", "description": "extra strong syllable"},
+ "L": {"name": "legato", "description": "connected/tied syllable"}
+ }
+
+ # Use custom mapping if provided, otherwise use default
+ beat_types = beat_types or default_beat_types
+
+ # Initialize structured output if requested
+ structured_data = {"lines": [], "explanations": []} if structured_output else None
+
+ # Improved format detection - more robust than just checking for "|"
+ is_enhanced_format = False
+
+ # Check if it's a string with enhanced format patterns
+ if isinstance(syllable_templates, str):
+ # Look for enhanced format patterns - check for beat type indicators
+ if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates
+ for bt in beat_types.keys()):
+ is_enhanced_format = True
+ # Secondary check for the "|" delimiter between phrases
+ elif "|" in syllable_templates:
+ is_enhanced_format = True
+
+ # Initialize the output with a brief explanatory header
+ output = []
+
+ if is_enhanced_format:
+ # Split into individual phrase templates
+ phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates]
+
+ # Process each phrase into human-readable instructions
+ for i, phrase in enumerate(phrases):
+ # Check for special annotations
+ has_swing = "(swing)" in phrase
+ if has_swing:
+ phrase = phrase.replace("(swing)", "") # Remove annotation for processing
+
+ beats = phrase.split("-")
+                beat_instructions = []
+                parsed_beats = []  # Parsed beat dicts, collected for structured output
+
+ # Process each beat in the phrase
+ for j, beat in enumerate(beats):
+ # Extract beat type and information
+ beat_info = {"original": beat, "type": None, "count": None, "strength": None}
+
+ # Handle enhanced format with embedded strength values: S(0.95):2
+ if "(" in beat and ")" in beat and ":" in beat:
+ parts = beat.split(":")
+ beat_type = parts[0].split("(")[0] # Extract beat type
+ strength = parts[0].split("(")[1].rstrip(")") # Extract strength value
+ count = parts[1] # Extract syllable count
+
+ beat_info["type"] = beat_type
+ beat_info["count"] = count
+ beat_info["strength"] = strength
+
+ # Handle simpler format: S2, m1, w1
+ elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1:
+ beat_type = beat[0]
+ count = beat[1:]
+
+ beat_info["type"] = beat_type
+ beat_info["count"] = count
+
+ # Fallback for any other format
+ else:
+ beat_instructions.append(beat)
+ continue
+
+ # Format the beat instruction based on type
+ if beat_info["type"] in beat_types:
+ type_name = beat_types[beat_info["type"]]["name"]
+ if beat_info["strength"]:
+ beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]")
+ else:
+ beat_instructions.append(f"{type_name}({beat_info['count']})")
+ else:
+ # Unknown beat type, use as-is
+ beat_instructions.append(beat)
+
+ # Handle line wrapping for readability
+ if line_wrap > 0 and len(beat_instructions) > line_wrap:
+ wrapped_instructions = []
+ for k in range(0, len(beat_instructions), line_wrap):
+ section = beat_instructions[k:k+line_wrap]
+ wrapped_instructions.append(f"{arrow} ".join(section))
+ line_desc = f"\n {arrow} ".join(wrapped_instructions)
+ else:
+ line_desc = f" {arrow} ".join(beat_instructions)
+
+ # Add swing notation if present
+ if has_swing:
+ line_desc += " [with swing feel]"
+
+ # Add to output
+ line_output = f"Line {i+1}: {line_desc}"
+ output.append(line_output)
+
+ if structured_output:
+ structured_data["lines"].append({
+ "line_number": i+1,
+ "beats": [{"original": beats[j],
+ "type": beat_info.get("type"),
+ "count": beat_info.get("count"),
+ "strength": beat_info.get("strength")}
+ for j, beat_info in enumerate([b for b in beats if isinstance(b, dict)])],
+ "has_swing": has_swing
+ })
+
+ # Add explanation of notation after the lines
+ explanation = [
+ "\n📝 UNDERSTANDING THE NOTATION:"
+ ]
+
+ # Add descriptions for each beat type that was actually used
+ used_beat_types = set()
+ for phrase in phrases:
+ for beat in phrase.split("-"):
+ for bt in beat_types.keys():
+ if beat.startswith(bt):
+ used_beat_types.add(bt)
+
+ for bt in used_beat_types:
+ if bt in beat_types:
+ name = beat_types[bt]["name"]
+ desc = beat_types[bt]["description"]
+ explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables")
+
+ explanation.extend([
+ f"- {arrow}: Indicates flow from one beat to the next",
+ "- [0.xx]: Beat strength value (higher = more emphasis needed)"
+ ])
+
+ output.extend(explanation)
+
+ if structured_output:
+ structured_data["explanations"] = explanation
+
+ # Add examples for half-syllable values if they appear in the templates
+        # Inspect the count portion (after ":") so strength values such as "(0.5)" do not trigger this
+        has_half_syllables = any(".5" in (beat.split(":")[-1] if ":" in beat else beat)
+                                 for phrase in phrases for beat in phrase.split("-"))
+ if has_half_syllables:
+ half_syllable_examples = [
+ "\n🎵 HALF-SYLLABLE EXAMPLES:",
+ "- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable",
+ " Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick",
+ "- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables",
+ " Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick"
+ ]
+ output.extend(half_syllable_examples)
+
+ if structured_output:
+ structured_data["half_syllable_examples"] = half_syllable_examples
+
+ # Add swing explanation if needed
+ if any("swing" in phrase for phrase in phrases):
+ swing_guide = [
+ "\n🎶 SWING RHYTHM GUIDE:",
+ "- In swing, syllables should be unevenly timed (long-short pattern)",
+ "- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay"
+ ]
+ output.extend(swing_guide)
+
+ if structured_output:
+ structured_data["swing_guide"] = swing_guide
+
+ # Handle the original format or segment dictionaries
+ else:
+ formatted_lines = []
+
+ if isinstance(syllable_templates, list):
+ for i, template in enumerate(syllable_templates):
+ if isinstance(template, dict) and "syllable_template" in template:
+ line = f"Line {i+1}: {template['syllable_template']} syllables"
+ formatted_lines.append(line)
+
+ if structured_output:
+ structured_data["lines"].append({
+ "line_number": i+1,
+ "syllable_count": template["syllable_template"]
+ })
+ elif isinstance(template, str):
+ line = f"Line {i+1}: {template} syllables"
+ formatted_lines.append(line)
+
+ if structured_output:
+ structured_data["lines"].append({
+ "line_number": i+1,
+ "syllable_count": template
+ })
+
+ output = formatted_lines
+ else:
+ output = [str(syllable_templates)]
+
+ if structured_output:
+ structured_data["raw_content"] = str(syllable_templates)
+
+ # Add general application advice
+ application_tips = [
+ "\n💡 APPLICATION TIPS:",
+ "1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")",
+ "2. Place important words on strong beats for natural emphasis",
+ "3. Vowel sounds work best for sustained or emphasized syllables",
+ "4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats"
+ ]
+ output.extend(application_tips)
+
+ if structured_output:
+ structured_data["application_tips"] = application_tips
+ return structured_data
+
+ return "\n".join(output)
+
+def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None):
+ """
+ Enhanced verification of syllable counts and stress patterns with precise alignment analysis
+ for both phrase-level and second-level templates.
+ """
+ import re
+ import pronouncing
+ import numpy as np
+ import functools
+ from itertools import chain
+
+ print(f"DEBUG: In verify_flexible_syllable_counts, type of lyrics={type(lyrics)}")
+ print(f"DEBUG: Type of templates={type(templates)}")
+
+ # Ensure lyrics is a string
+ if not isinstance(lyrics, str):
+ print(f"DEBUG: lyrics is not a string, it's {type(lyrics)}")
+ # Convert to string if possible
+ try:
+ lyrics = str(lyrics)
+ except Exception as e:
+ print(f"DEBUG: Cannot convert lyrics to string: {str(e)}")
+ return "Error: Cannot process non-string lyrics"
+
+ # Ensure templates is a list
+ if not isinstance(templates, list):
+ print(f"DEBUG: templates is not a list, it's {type(templates)}")
+ # If it's not a list, create a single-item list
+ if templates is not None:
+ templates = [templates]
+ else:
+ templates = []
+
+ # Split lyrics into lines
+ lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
+
+ # Initialize tracking variables
+ verification_notes = []
+ detailed_analysis = []
+ stress_misalignments = []
+ total_mismatch_count = 0
+
+ # Process each lyric line against its template
+ for i, line in enumerate(lines):
+ if i >= len(templates):
+ break
+
+ template = templates[i]
+ print(f"DEBUG: Processing template {i+1}, type={type(template)}")
+
+ # Extract the template string from different possible formats
+ template_str = None
+ if isinstance(template, dict) and "syllable_template" in template:
+ template_str = template["syllable_template"]
+ elif isinstance(template, str):
+ template_str = template
+ else:
+ print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
+ continue
+
+ if not isinstance(template_str, str):
+ print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
+ continue
+
+ # Handle multiple phrases in template - process ALL phrases, not just the first
+ template_phrases = [template_str]
+ if "|" in template_str:
+ template_phrases = template_str.split("|")
+
+ # Check against all phrases and find the best match
+ best_match_diff = float('inf')
+ best_match_phrase = None
+ best_phrase_beats = None
+ actual_count = count_syllables(line)
+
+ for phrase_idx, phrase in enumerate(template_phrases):
+ # Extract beat patterns and expected syllable counts from template
+ beats_info = []
+ total_expected = 0
+
+ # Enhanced template parsing
+ if "-" in phrase:
+ beat_templates = phrase.split("-")
+
+ # Parse each beat template
+ for beat in beat_templates:
+ beat_info = {"original": beat, "type": None, "count": 1, "strength": None}
+
+ # Handle templates with embedded strength values: S(0.95):2
+ if "(" in beat and ")" in beat and ":" in beat:
+ parts = beat.split(":")
+ beat_type = parts[0].split("(")[0]
+ try:
+ strength = float(parts[0].split("(")[1].rstrip(")"))
+ except ValueError:
+ strength = 1.0
+
+ # Handle potential float syllable counts
+ try:
+ count = float(parts[1])
+ # Convert to int if it's a whole number
+ if count == int(count):
+ count = int(count)
+ except ValueError:
+ count = 1
+
+ beat_info.update({
+ "type": beat_type,
+ "count": count,
+ "strength": strength
+ })
+
+ # Handle simple format: S2, m1, w1
+ elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]):
+ beat_type = beat[0]
+
+ # Extract count, supporting float values
+ try:
+ count_str = beat[1:]
+ count = float(count_str)
+ if count == int(count):
+ count = int(count)
+ except ValueError:
+ count = 1
+
+ beat_info.update({
+ "type": beat_type,
+ "count": count
+ })
+
+ # Legacy format - just numbers
+ else:
+ try:
+ count = float(beat)
+ if count == int(count):
+ count = int(count)
+ beat_info["count"] = count
+ except ValueError:
+ pass
+
+ beats_info.append(beat_info)
+ total_expected += beat_info["count"]
+
+ # Compare this phrase to actual syllable count
+ phrase_diff = abs(actual_count - total_expected)
+
+ # If this is the best match so far, store it
+ if phrase_diff < best_match_diff:
+ best_match_diff = phrase_diff
+ best_match_phrase = phrase
+ best_phrase_beats = beats_info
+
+ # For very simple templates without "-"
+ else:
+ try:
+ total_expected = float(phrase)
+ phrase_diff = abs(actual_count - total_expected)
+ if phrase_diff < best_match_diff:
+ best_match_diff = phrase_diff
+ best_match_phrase = phrase
+ best_phrase_beats = [{"count": total_expected}]
+ except ValueError:
+ pass
+
+ # If we found a reasonable match, proceed with analysis
+ if best_match_phrase and best_phrase_beats:
+ total_expected = sum(beat["count"] for beat in best_phrase_beats)
+
+ # Calculate adaptive threshold based on expected syllables
+ expected_ratio = 0.15 if total_expected > 10 else 0.25
+ threshold = max(1, round(total_expected * expected_ratio))
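+ # e.g. 12 expected syllables → max(1, round(12 * 0.15)) = 2 syllables of
+ # allowed drift; 8 expected → max(1, round(8 * 0.25)) = 2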
+
+ # Check if total syllable count is significantly off
+ if total_expected > 0 and best_match_diff > threshold:
+ verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}")
+ total_mismatch_count += 1
+
+ # Extract words and perform detailed alignment analysis
+ words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
+
+ # Get syllable count and stress for each word
+ word_analysis = []
+ cumulative_syllables = 0
+
+ for word in words:
+ syllable_count = count_syllables_for_word(word)
+
+ # Get stress pattern
+ stress_pattern = get_word_stress(word)
+
+ word_analysis.append({
+ "word": word,
+ "syllables": syllable_count,
+ "stress_pattern": stress_pattern,
+ "position": cumulative_syllables
+ })
+
+ cumulative_syllables += syllable_count
+
+ # Analyze alignment with beats - only if there are beat types
+ if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b):
+ # Identify positions where strong syllables should fall
+ strong_positions = []
+ current_pos = 0
+
+ for beat in best_phrase_beats:
+ if beat.get("type") == "S":
+ strong_positions.append(current_pos)
+ current_pos += beat.get("count", 1)
+
+ # Check if strong syllables align with strong beats
+ alignment_issues = []
+
+ for pos in strong_positions:
+ # Find which word contains this position
+
+ for word_info in word_analysis:
+ word_start = word_info["position"]
+ word_end = word_start + word_info["syllables"]
+
+ if word_start <= pos < word_end:
+ # Check if a stressed syllable falls on this position
+ syllable_in_word = int(pos - word_start) # floor: beat counts may be fractional
+
+ # Get stress pattern for this word
+ stress = word_info["stress_pattern"]
+
+ # If we have stress information and this syllable isn't stressed
+ if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
+ alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
+ stress_misalignments.append({
+ "line": i+1,
+ "word": word_info["word"],
+ "position": pos,
+ "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
+ })
+ break
+
+ if alignment_issues:
+ verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
+
+ # Generate a visual alignment map for better understanding
+ alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis)
+ if alignment_map:
+ detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}")
+ else:
+ # If no matching template was found
+ verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")
+
+ # Add second-level verification if templates are provided
+ if second_level_templates:
+ verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n")
+
+ # Check each second against corresponding line
+ for i, template in enumerate(second_level_templates):
+ if i >= len(lines):
+ break
+
+ line = lines[i]
+
+ # Skip section headers
+ if line.startswith('[') and ']' in line:
+ continue
+
+ actual_count = count_syllables(line)
+
+ # Parse template to get expected syllable count
+ total_expected = 0
+ beat_patterns = []
+
+ # Handle templates with beat patterns like "S(0.95):2-w(0.4):1"
+ if isinstance(template, str) and "-" in template:
+ for beat in template.split("-"):
+ if ":" in beat:
+ try:
+ count_part = beat.split(":")[1]
+ count = float(count_part)
+ total_expected += count
+
+ # Extract beat type for alignment check
+ beat_type = beat.split("(")[0] if "(" in beat else beat[0]
+ beat_patterns.append((beat_type, count))
+ except (IndexError, ValueError):
+ pass
+
+ # Compare actual vs expected count
+ if total_expected > 0:
+ # Calculate adaptive threshold based on expected syllables
+ expected_ratio = 0.2 # More strict at second level
+ threshold = max(0.5, round(total_expected * expected_ratio))
+
+ difference = abs(actual_count - total_expected)
+
+ if difference > threshold:
+ verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}")
+ total_mismatch_count += 1
+
+ # Check for stress misalignment in this second
+ words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
+ word_analysis = []
+ cumulative_syllables = 0
+
+ for word in words:
+ syllable_count = count_syllables_for_word(word)
+ stress_pattern = get_word_stress(word)
+
+ word_analysis.append({
+ "word": word,
+ "syllables": syllable_count,
+ "stress_pattern": stress_pattern,
+ "position": cumulative_syllables
+ })
+
+ cumulative_syllables += syllable_count
+
+ # Check if stressed syllables align with strong beats
+ if beat_patterns:
+ strong_positions = []
+ current_pos = 0
+
+ for beat_type, count in beat_patterns:
+ if beat_type == "S":
+ strong_positions.append(current_pos)
+ current_pos += count
+
+ # Look for misalignments
+ for pos in strong_positions:
+ for word_info in word_analysis:
+ word_start = word_info["position"]
+ word_end = word_start + word_info["syllables"]
+
+ if word_start <= pos < word_end:
+ # Check if a stressed syllable falls on this position
+ syllable_in_word = int(pos - word_start)
+ stress = word_info["stress_pattern"]
+
+ if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
+ verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat")
+ break
+
+ # Append the verification notes and detailed analysis to the lyrics only if mismatches were found
+ if verification_notes:
+ lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
+ lyrics += "\n".join(verification_notes)
+
+ if detailed_analysis:
+ lyrics += "\n\n[Detailed Alignment Analysis:]\n"
+ lyrics += "\n\n".join(detailed_analysis)
+
+ lyrics += "\n\n[How to fix rhythm mismatches:]\n"
+ lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n"
+ lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n"
+ lyrics += "3. Try using words where natural stress aligns with musical rhythm\n"
+
+ # Add specific word substitution suggestions if we found stress misalignments
+ if stress_misalignments:
+ lyrics += "\n[Specific word replacement suggestions:]\n"
+ for issue in stress_misalignments[:5]: # Limit to first 5 issues
+ if issue["suggestion"]:
+ lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n"
+
+ return lyrics
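+
+# Minimal usage sketch (hypothetical template string; the function returns the
+# lyrics with any mismatch notes appended, or the lyrics unchanged):
+# annotated = verify_flexible_syllable_counts("Hear the music", ["S1-w1-m1-w1"])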
+
+def generate_alignment_visualization(line, beats_info, word_analysis):
+ """Generate a visual representation of syllable alignment with beats."""
+ if not beats_info or not word_analysis:
+ return None
+
+ # Create a syllable breakdown with stress information
+ syllable_breakdown = []
+ syllable_stresses = []
+
+ for word_info in word_analysis:
+ word = word_info["word"]
+ syllables = word_info["syllables"]
+ stress = word_info["stress_pattern"] or ""
+
+ # Extend stress pattern if needed
+ while len(stress) < syllables:
+ stress += "0"
+
+ # Get syllable breakdown
+ parts = naive_syllable_split(word, syllables)
+
+ for i, part in enumerate(parts):
+ syllable_breakdown.append(part)
+ if i < len(stress):
+ syllable_stresses.append(stress[i])
+ else:
+ syllable_stresses.append("0")
+
+ # Create beat pattern
+ beat_types = []
+ current_pos = 0
+
+ for beat in beats_info:
+ beat_type = beat.get("type", "-")
+ count = beat.get("count", 1)
+
+ # Handle whole numbers and half syllables
+ if isinstance(count, int):
+ beat_types.extend([beat_type] * count)
+ else:
+ # For half syllables, round up and use markers
+ whole_part = int(count)
+ frac_part = count - whole_part
+
+ if whole_part > 0:
+ beat_types.extend([beat_type] * whole_part)
+
+ if frac_part > 0:
+ beat_types.append(f"{beat_type}½")
+
+ # Ensure we have enough beat types
+ while len(beat_types) < len(syllable_breakdown):
+ beat_types.append("-")
+
+ # Trim beat types if too many
+ beat_types = beat_types[:len(syllable_breakdown)]
+
+ # Generate the visualization with highlighted misalignments
+ result = []
+
+ # First line: syllable breakdown with stress indicators
+ syllable_display = []
+ for i, syllable in enumerate(syllable_breakdown):
+ if i < len(syllable_stresses) and syllable_stresses[i] == "1":
+ syllable_display.append(syllable.upper()) # Uppercase for stressed syllables
+ else:
+ syllable_display.append(syllable.lower()) # Lowercase for unstressed
+
+ result.append(" - ".join(syllable_display))
+
+ # Second line: beat indicators with highlighting for misalignments
+ beat_indicators = []
+ for stress, beat_type in zip(syllable_stresses, beat_types):
+ if beat_type.startswith("S"):
+ if stress == "1":
+ beat_indicators.append("↑") # Aligned strong beat
+ else:
+ beat_indicators.append("❌") # Misaligned strong beat
+ elif beat_type.startswith("m"):
+ beat_indicators.append("•") # Medium beat
+ elif beat_type.startswith("w"):
+ beat_indicators.append("·") # Weak beat
+ else:
+ beat_indicators.append(" ")
+
+ result.append(" ".join(beat_indicators))
+
+ # Third line: beat types
+ result.append(" - ".join(beat_types))
+
+ return "\n".join(result)
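+
+# Illustrative output for beats S,w,m,w (count 1 each) on "hear the music",
+# assuming get_word_stress returns "1" for "hear" and "10" for "music":
+# HEAR - the - MU - sic
+# ↑ · • ·
+# S - w - m - w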
+
+@functools.lru_cache(maxsize=256)
+def naive_syllable_split(word, syllable_count):
+ """Naively split a word into the specified number of syllables, with caching for performance."""
+ if syllable_count <= 1:
+ return [word]
+
+ # Common syllable break patterns
+ vowels = "aeiouy"
+ consonants = "bcdfghjklmnpqrstvwxz"
+
+ # Find potential split points
+ splits = []
+ for i in range(1, len(word) - 1):
+ if word[i] in consonants and word[i-1] in vowels:
+ splits.append(i)
+ elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants:
+ splits.append(i+1)
+
+ # Ensure we have enough split points; stop once every position is taken
+ # so short words cannot cause an infinite loop
+ while len(splits) < syllable_count - 1:
+ added = False
+ for i in range(1, len(word)):
+ if i not in splits:
+ splits.append(i)
+ added = True
+ break
+ if not added:
+ break
+
+ # Sort and limit
+ splits.sort()
+ splits = splits[:syllable_count - 1]
+
+ # Split the word
+ result = []
+ prev = 0
+ for pos in splits:
+ result.append(word[prev:pos])
+ prev = pos
+
+ result.append(word[prev:])
+ return result
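+
+# Example, traced through the rules above:
+# naive_syllable_split("lover", 2) -> ["lo", "ver"]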
+
+def get_stress_aligned_alternatives(word, position_to_stress):
+ """Suggest alternative words with proper stress at the required position."""
+ # This would ideally use a more sophisticated dictionary lookup,
+ # but here's a simple implementation with common word patterns
+ syllable_count = count_syllables_for_word(word)
+
+ # Common synonyms/replacements by syllable count with stress position
+ if syllable_count == 2:
+ if position_to_stress == 0: # Need stress on first syllable
+ first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing",
+ "heart-beat", "sun-light", "moon-light", "star-light"]
+ return ", ".join(first_stress[:3])
+ else: # Need stress on second syllable
+ second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE",
+ "a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"]
+ return ", ".join(second_stress[:3])
+ elif syllable_count == 3:
+ if position_to_stress == 0: # First syllable stress
+ return "MEM-o-ry, WON-der-ful, BEAU-ti-ful"
+ elif position_to_stress == 1: # Second syllable stress
+ return "a-MAZE-ing, to-GE-ther, for-EV-er"
+ else: # Third syllable stress
+ return "un-der-STAND, o-ver-COME, ne-ver-MORE"
+
+ # For other cases, just provide general guidance
+ return f"a word with stress on syllable {position_to_stress + 1}"
+
+def generate_lyrics(genre, duration, emotion_results, song_structure=None):
+ """
+ Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
+
+ This version builds rhythm templates from the audio analysis, embeds them in
+ the prompt, verifies the generated lyrics against them, and runs an optional
+ refinement pass when serious stress misalignments are detected.
+
+ Parameters:
+ genre: Musical genre of the audio
+ duration: Duration of the audio in seconds
+ emotion_results: Dictionary containing emotional analysis results
+ song_structure: Optional dictionary containing song structure analysis
+
+ Returns:
+ Generated lyrics aligned with the rhythm patterns of the music
+ """
+ # Safety check for strings
+ def is_safe_dict_access(obj, key):
+ """Safe dictionary key access with type checking"""
+ if not isinstance(obj, dict):
+ print(f"WARNING: Attempted to access key '{key}' on non-dictionary object of type {type(obj)}")
+ return False
+ return key in obj
+
+ # Ensure emotion_results is a dictionary with the expected structure
+ if not isinstance(emotion_results, dict):
+ emotion_results = {
+ "emotion_analysis": {"primary_emotion": "Unknown"},
+ "theme_analysis": {"primary_theme": "Unknown"},
+ "rhythm_analysis": {"tempo": 0},
+ "tonal_analysis": {"key": "Unknown", "mode": ""},
+ "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
+ }
+
+ # Ensure song_structure is properly structured
+ if song_structure is not None and not isinstance(song_structure, dict):
+ print(f"WARNING: song_structure is not a dict, it's {type(song_structure)}")
+ song_structure = None
+
+ print(f"DEBUG: Starting generate_lyrics with genre={genre}, duration={duration}")
+ print(f"DEBUG: Type of song_structure={type(song_structure)}")
+ print(f"DEBUG: Type of emotion_results={type(emotion_results)}")
+
+ # Helper function to safely access dictionary with string keys
+ def safe_dict_get(d, key, default=None):
+ """Safely get a value from a dictionary, handling non-dictionary objects."""
+ if not isinstance(d, dict):
+ print(f"WARNING: Attempted to access key '{key}' in non-dictionary object of type {type(d)}")
+ return default
+ return d.get(key, default)
+
+ # Extract emotion and theme data with safe defaults
+ primary_emotion = safe_dict_get(safe_dict_get(emotion_results, "emotion_analysis", {}), "primary_emotion", "Unknown")
+ primary_theme = safe_dict_get(safe_dict_get(emotion_results, "theme_analysis", {}), "primary_theme", "Unknown")
+
+ # Extract numeric values safely with fallbacks
+ try:
+ tempo = float(safe_dict_get(safe_dict_get(emotion_results, "rhythm_analysis", {}), "tempo", 0.0))
+ except (ValueError, TypeError):
+ tempo = 0.0
+
+ key = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "key", "Unknown")
+ mode = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "mode", "")
+
+ # Format syllable templates for the prompt
+ syllable_guidance = ""
+ templates_for_verification = []
+
+ # Create a structure visualization to help with lyrics-music matching
+ structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n"
+ structure_visualization += f"Song Duration: {duration:.1f} seconds\n"
+ structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n"
+
+ # Add second-level template guidance if available
+ if song_structure and is_safe_dict_access(song_structure, "second_level") and is_safe_dict_access(song_structure.get("second_level", {}), "templates"):
+ print(f"DEBUG: Using second-level templates")
+ second_level_templates = song_structure.get("second_level", {}).get("templates", [])
+
+ # Create second-level guidance
+ second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n"
+ second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n"
+
+ # Format each second's template
+ formatted_second_templates = []
+ for i, template in enumerate(second_level_templates):
+ if i < min(60, len(second_level_templates)): # Limit to 60 seconds to avoid overwhelming the LLM
+ formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0)
+ formatted_second_templates.append(f"Second {i+1}: {formatted_template}")
+
+ second_level_guidance += "\n".join(formatted_second_templates)
+
+ # Add critical instructions for second-level alignment
+ second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern."
+ second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics."
+ second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on."
+
+ # Add to syllable guidance
+ syllable_guidance = second_level_guidance
+
+ # Store templates for verification
+ templates_for_verification = second_level_templates
+
+ elif song_structure:
+ print(f"DEBUG: Checking flexible structure")
+ # Try to use flexible structure if available
+ if is_safe_dict_access(song_structure, "flexible_structure"):
+ print(f"DEBUG: Using flexible structure")
+ flexible = song_structure.get("flexible_structure", {})
+ if is_safe_dict_access(flexible, "segments") and len(flexible.get("segments", [])) > 0:
+ print(f"DEBUG: Found segments in flexible structure")
+ # Get the segments
+ segments = flexible.get("segments", [])
+
+ # Add structure visualization
+ structure_visualization += f"Total segments: {len(segments)}\n"
+ structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n"
+
+ # Process each segment to create enhanced rhythmic templates
+ enhanced_templates = []
+
+ for i, segment in enumerate(segments):
+ if i < 30: # Extend limit to 30 lines to handle longer songs
+ # Get the beat information for this segment
+ segment_start = segment["start"]
+ segment_end = segment["end"]
+
+ # Add segment info to visualization
+ structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n"
+
+ # Find beats within this segment
+ segment_beats = []
+
+ # Add type checking for beat_times access
+ print(f"DEBUG: Checking beat_times in flexible structure")
+ if is_safe_dict_access(flexible, "beats") and is_safe_dict_access(flexible.get("beats", {}), "beat_times"):
+ beat_times = flexible.get("beats", {}).get("beat_times", [])
+ if isinstance(beat_times, list):
+ beat_strengths = flexible.get("beats", {}).get("beat_strengths", [])
+
+ for j, beat_time in enumerate(beat_times):
+ if segment_start <= beat_time < segment_end:
+ # Add this beat to the segment
+ segment_beats.append(j)
+
+ # Create segment-specific beat info
+ segment_beats_info = {
+ "beat_times": [beat_times[j] for j in segment_beats if j < len(beat_times)],
+ "tempo": flexible.get("beats", {}).get("tempo", 120)
+ }
+
+ if beat_strengths and isinstance(beat_strengths, list):
+ segment_beats_info["beat_strengths"] = [
+ beat_strengths[j] for j in segment_beats
+ if j < len(beat_strengths)
+ ]
+
+ # Create a phrase structure for this segment
+ segment_beats_info["phrases"] = [segment_beats]
+
+ # Generate enhanced template with genre awareness and auto phrasing
+ print(f"DEBUG: Creating flexible syllable template for segment {i+1}")
+ enhanced_template = create_flexible_syllable_templates(
+ segment_beats_info,
+ genre=genre,
+ phrase_mode='auto' if i == 0 else 'default'
+ )
+ enhanced_templates.append(enhanced_template)
+ templates_for_verification.append(enhanced_template)
+
+ # Add template to visualization
+ structure_visualization += f" Template: {enhanced_template}\n"
+ else:
+ print(f"DEBUG: beat_times is not a list, it's {type(beat_times)}")
+ else:
+ print(f"DEBUG: beats or beat_times not found in flexible structure")
+ # Skip segment if we don't have beat information
+ continue
+
+ # Use these templates to determine rhythm patterns, without classifying as verse/chorus
+ pattern_groups = {}
+
+ for i, template in enumerate(enhanced_templates):
+ # Create simplified version for pattern matching
+ simple_pattern = template.replace("(", "").replace(")", "").replace(":", "")
+
+ # Check if this pattern is similar to any we've seen
+ found_match = False
+ for group, patterns in pattern_groups.items():
+ if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns):
+ pattern_groups[group].append(template)
+ found_match = True
+ break
+
+ if not found_match:
+ # New pattern type
+ group_name = f"Group_{len(pattern_groups) + 1}"
+ pattern_groups[group_name] = [template]
+
+ # Format templates with improved formatting for the prompt
+ syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n"
+ syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n"
+ syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n"
+
+ # Add formatted templates without section labels
+ formatted_templates = []
+ for i, template in enumerate(enhanced_templates):
+ formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8))
+
+ syllable_guidance += "\n".join(formatted_templates)
+
+ # Store info for later use in traditional sections approach
+ use_sections = True
+
+ # Derive fallback section line counts from the number of segment templates
+ # so the traditional-sections path has sane values if it is selected later
+ total_lines = max(4, len(enhanced_templates))
+ verse_lines = total_lines // 2
+ chorus_lines = total_lines // 3
+ bridge_lines = total_lines // 6 if total_lines > 12 else 0
+
+ # Fallback to traditional sections if needed
+ elif song_structure and is_safe_dict_access(song_structure, "syllables") and song_structure.get("syllables"):
+ syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n"
+ syllable_guidance += "Follow these syllable patterns for each section. Each line should match ONE phrase:\n\n"
+
+ # Count sections for visualization
+ section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0}
+
+ for section in song_structure.get("syllables", []):
+ if not isinstance(section, dict):
+ continue
+
+ section_type = section.get("type", "verse")
+ section_counts[section_type] = section_counts.get(section_type, 0) + 1
+
+ if is_safe_dict_access(section, "syllable_template"):
+ # Process to create enhanced template
+ if is_safe_dict_access(song_structure, "beats") and is_safe_dict_access(song_structure.get("beats", {}), "beat_times"):
+ section_beats_info = {
+ "beat_times": [beat for beat in song_structure.get("beats", {}).get("beat_times", [])
+ if section.get("start", 0) <= beat < section.get("end", 0)],
+ "tempo": song_structure.get("beats", {}).get("tempo", 120)
+ }
+
+ if is_safe_dict_access(song_structure.get("beats", {}), "beat_strengths"):
+ section_beats_info["beat_strengths"] = [
+ strength for i, strength in enumerate(song_structure.get("beats", {}).get("beat_strengths", []))
+ if i < len(song_structure.get("beats", {}).get("beat_times", [])) and
+ section.get("start", 0) <= song_structure.get("beats", {}).get("beat_times", [])[i] < section.get("end", 0)
+ ]
+
+ # Create a phrase structure for this section
+ section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))]
+
+ # Generate enhanced template with genre awareness
+ enhanced_template = create_flexible_syllable_templates(
+ section_beats_info,
+ genre=genre,
+ phrase_mode='auto' if section_type == 'verse' else 'default'
+ )
+
+ syllable_guidance += f"[{section['type'].capitalize()}]:\n"
+ syllable_guidance += format_syllable_templates_for_prompt(
+ enhanced_template,
+ arrow="→",
+ line_wrap=6
+ ) + "\n\n"
+ templates_for_verification.append(section)
+ elif "syllable_count" in section:
+ syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n"
+
+ # Create structure visualization
+ structure_visualization += "Using traditional section-based structure:\n"
+ for section_type, count in section_counts.items():
+ if count > 0:
+ structure_visualization += f"{section_type.capitalize()}: {count} sections\n"
+
+ # Set traditional section counts
+ verse_lines = max(2, section_counts.get("verse", 0) * 4)
+ chorus_lines = max(2, section_counts.get("chorus", 0) * 4)
+ bridge_lines = max(0, section_counts.get("bridge", 0) * 2)
+
+ # Use sections approach
+ use_sections = True
+
+ # If we couldn't get specific templates, use general guidance
+ if not syllable_guidance:
+ syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n"
+ syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n"
+ syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n"
+ syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n"
+ syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n"
+ syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n"
+ syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n"
+
+ # Create basic structure visualization
+ structure_visualization += "Using estimated structure (no detailed analysis available):\n"
+
+ # Calculate rough section counts based on duration
+ estimated_lines = max(8, int(duration / 10))
+ structure_visualization += f"Estimated total lines: {estimated_lines}\n"
+
+ # Set traditional section counts based on duration
+ verse_lines = estimated_lines // 2
+ chorus_lines = estimated_lines // 3
+ bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0
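+ # e.g. a 180 s track → max(8, int(180 / 10)) = 18 lines: 9 verse, 6 chorus, 3 bridge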
+
+ # Use sections approach
+ use_sections = True
+
+ # Add examples of syllable-beat alignment with enhanced format
+ syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n"
+ syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n"
+ syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S w m w <- BEAT TYPE\n\n"
+
+ syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n"
+ syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S S w S w w <- BEAT TYPE\n\n"
+
+ syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n"
+ syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S m m S w w <- BEAT TYPE\n\n"
+
+ # Add genre-specific guidance based on the detected genre
+ genre_guidance = ""
+ if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n"
+ genre_guidance += "- Use more syllables per beat for rapid-fire sections\n"
+ genre_guidance += "- Create internal rhymes within lines, not just at line endings\n"
+ genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n"
+ elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n"
+ genre_guidance += "- Use repetitive phrases that build and release tension\n"
+ genre_guidance += "- Match syllables precisely to the beat grid\n"
+ genre_guidance += "- Use short, percussive words on strong beats\n"
+ elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n"
+ genre_guidance += "- Use powerful, emotive words on downbeats\n"
+ genre_guidance += "- Create contrast between verse and chorus energy levels\n"
+ genre_guidance += "- Emphasize hooks with simple, memorable phrases\n"
+ elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n"
+ genre_guidance += "- Focus on storytelling with clear narrative flow\n"
+ genre_guidance += "- Use natural speech patterns that flow conversationally\n"
+ genre_guidance += "- Place important words at the start of phrases\n"
+
+ # Add genre guidance to the main guidance
+ syllable_guidance += genre_guidance
+
+ # Store the syllable guidance for later use
+ syllable_guidance_text = syllable_guidance
+
+ # Determine if we should use traditional sections or second-level alignment
+ use_sections = True
+ use_second_level = False
+
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
+ use_second_level = True
+ # If we have second-level templates, prioritize those over traditional sections
+ if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]:
+ templates = song_structure["second_level"]["templates"]
+ if isinstance(templates, list) and len(templates) > 0:
+ use_sections = False
+ elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
+ # If we have more than 4 segments, it's likely not a traditional song structure
+ if "segments" in song_structure["flexible_structure"]:
+ segments = song_structure["flexible_structure"]["segments"]
+ if len(segments) > 4:
+ use_sections = False
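+
+ # Net effect: second-level templates (when present) take priority; a
+ # flexible structure with more than 4 segments also disables the
+ # traditional section-based prompt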
+
+ # Create enhanced prompt with better rhythm alignment instructions
+ if use_second_level:
+ # Second-level approach with per-second alignment
+ content = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
+
+IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
+
+Music analysis has detected the following qualities:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+
+{syllable_guidance}
+
+CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
+1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)
+2. Natural word stress patterns must match the beat strength (strong words on strong beats)
+3. Line breaks should occur at phrase endings for natural breathing
+4. Consonant clusters should be avoided on fast notes and strong beats
+5. Open vowels (a, e, o) work better for sustained notes and syllables
+6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
+7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
+
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Be completely original
+- Maintain a consistent theme throughout
+- Match the audio segment duration of {duration:.1f} seconds
+
+Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
+
+IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
+
+IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
+where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
+even if there are no rhythm issues. Include the following in your analysis:
+1. Syllable counts for each line and how they match the rhythm pattern
+2. Where stressed syllables align with strong beats
+3. Any potential misalignments or improvements
+
+Your lyrics:
+"""
+ elif use_sections:
+ # Traditional approach with sections
+ content = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.
+
+IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
+
+Music analysis has detected the following qualities in the music:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+
+{syllable_guidance}
+
+CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
+1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)
+2. Natural word stress patterns must match the beat strength (strong words on strong beats)
+3. Line breaks should occur at phrase endings for natural breathing
+4. Consonant clusters should be avoided on fast notes and strong beats
+5. Open vowels (a, e, o) work better for sustained notes and syllables
+6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
+7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
+
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Follow the structure patterns provided above
+- Be completely original
+- Match the song duration of {duration:.1f} seconds
+
+IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
+
+IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
+where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
+even if there are no rhythm issues. Include the following in your analysis:
+1. Syllable counts for each line and how they match the rhythm pattern
+2. Where stressed syllables align with strong beats
+3. Any potential misalignments or improvements
+
+Your lyrics:
+"""
+ else:
+ # Flexible approach without traditional sections
+ content = f"""
+You are a talented songwriter who specializes in {genre} music.
+Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
+
+IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics.
+
+Music analysis has detected the following qualities:
+- Tempo: {tempo:.1f} BPM
+- Key: {key} {mode}
+- Primary emotion: {primary_emotion}
+- Primary theme: {primary_theme}
+
+{syllable_guidance}
+
+CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:
+1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)
+2. Natural word stress patterns must match the beat strength (strong words on strong beats)
+3. Line breaks should occur at phrase endings for natural breathing
+4. Consonant clusters should be avoided on fast notes and strong beats
+5. Open vowels (a, e, o) work better for sustained notes and syllables
+6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis)
+7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels
+
+The lyrics should:
+- Perfectly capture the essence and style of {genre} music
+- Express the {primary_emotion} emotion and {primary_theme} theme
+- Be completely original
+- Maintain a consistent theme throughout
+- Match the audio segment duration of {duration:.1f} seconds
+
+Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
+Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
+
+IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
+
+IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
+where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
+even if there are no rhythm issues. Include the following in your analysis:
+1. Syllable counts for each line and how they match the rhythm pattern
+2. Where stressed syllables align with strong beats
+3. Any potential misalignments or improvements
+
+Your lyrics:
+"""
+
+ # Format as a chat message for the LLM
+ messages = [
+ {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
+ {"role": "user", "content": content}
+ ]
+
+ # Apply standard chat template without thinking enabled
+ text = llm_tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+ )
+
+ # Generate lyrics using the LLM
+ model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
+
+ # Configure generation parameters based on model capability
+ generation_params = {
+ "do_sample": True,
+ "temperature": 0.5, # Lower for more consistent and direct output
+ "top_p": 0.85, # Slightly lower for more predictable responses
+ "top_k": 50,
+ "repetition_penalty": 1.2,
+ "max_new_tokens": 2048,
+ "num_return_sequences": 1
+ }
+
+ # Best-effort: add stop sequences to curb "thinking" preambles. Guarded with
+ # hasattr because a standard GenerationConfig exposes no stopping_criteria
+ # list, in which case this block is a no-op.
+ if hasattr(llm_model.generation_config, "stopping_criteria"):
+ thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"]
+ for stop in thinking_stops:
+ if stop not in llm_model.generation_config.stopping_criteria:
+ llm_model.generation_config.stopping_criteria.append(stop)
+
+ # Generate output
+ generated_ids = llm_model.generate(
+ **model_inputs,
+ **generation_params
+ )
+
+ # Extract output tokens
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+ # Get the raw output and strip any thinking process
+ lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+
+ # Enhanced thinking process removal - handle multiple formats
+ # First check for standard <think>...</think> tags (Qwen-style reasoning output)
+ if "<think>" in lyrics and "</think>" in lyrics:
+ lyrics = lyrics.split("</think>")[1].strip()
+
+ # Check for alternative thinking indicators with improved detection
+ thinking_markers = [
+ "", "",
+ "[thinking]", "[/thinking]",
+ "I'll think step by step:",
+ "First, I need to understand",
+ "Let me think about",
+ "Let's tackle this query",
+ "Okay, let's tackle this query",
+ "First, I need to understand the requirements",
+ "Looking at the rhythm patterns"
+ ]
+
+ # First try to find clear section breaks
+ for marker in thinking_markers:
+ if marker in lyrics:
+ parts = lyrics.split(marker)
+ if len(parts) > 1:
+ lyrics = parts[-1].strip() # Take the last part after any thinking marker
+
+ # Look for long analytical sections followed by clear lyrics
+ analytical_patterns = [
+ "Let me analyze",
+ "I need to understand",
+ "The tempo is",
+ "First, let's look at",
+ "Wait, maybe",
+ "Considering the emotional tone",
+ "Starting with the first line",
+ "Let me check the examples"
+ ]
+
+ # Check if lyrics begin with any analytical patterns
+ for pattern in analytical_patterns:
+ if lyrics.startswith(pattern):
+ # Try to find where the actual lyrics start - look for common lyrics markers
+ lyrics_markers = [
+ "\n\n[Verse",
+ "\n\n[Chorus",
+ "\n\nVerse",
+ "\n\nChorus",
+ "\n\n[Verse 1]",
+ "\n\n[Intro]"
+ ]
+
+ for marker in lyrics_markers:
+ if marker in lyrics:
+ lyrics = lyrics[lyrics.index(marker):].strip()
+ break
+
+ # One last effort to clean up - if the text is very long and contains obvious thinking
+ # before getting to actual lyrics, try to find a clear starting point
+ if len(lyrics.split()) > 100 and "\n\n" in lyrics:
+ paragraphs = lyrics.split("\n\n")
+ for i, paragraph in enumerate(paragraphs):
+ # Look for typical song structure indicators in a paragraph
+ if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]):
+ lyrics = "\n\n".join(paragraphs[i:])
+ break
+
+ # Clean up any remaining thinking artifacts at the beginning
+ lines = lyrics.split('\n')
+ clean_lines = []
+ lyrics_started = False
+
+ for line in lines:
+ # Skip initial commentary/thinking lines until we hit what looks like lyrics
+ if not lyrics_started:
+ if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]):
+ lyrics_started = True
+
+ if lyrics_started:
+ clean_lines.append(line)
+
+ # Only use the cleaning logic if we found some actual lyrics
+ if clean_lines:
+ lyrics = '\n'.join(clean_lines)
+
+ # Special handling for second-level templates
+ second_level_verification = None
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
+ if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]:
+ second_level_verification = song_structure["second_level"]["templates"]
+ if not isinstance(second_level_verification, list):
+ second_level_verification = None
+
+ # Verify syllable counts with enhanced verification - pass second-level templates if available
+ if templates_for_verification:
+ # Convert any NumPy values to native types before verification - directly handle conversions
+ # Simple conversion for basic templates (non-recursive)
+ if isinstance(templates_for_verification, list):
+ safe_templates = []
+ for template in templates_for_verification:
+ if isinstance(template, dict):
+ processed_template = {}
+ for k, v in template.items():
+ if isinstance(v, np.ndarray):
+ if v.size == 1:
+ processed_template[k] = float(v.item())
+ else:
+ processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v]
+ elif isinstance(v, np.number):
+ processed_template[k] = float(v)
+ else:
+ processed_template[k] = v
+ safe_templates.append(processed_template)
+ else:
+ safe_templates.append(template)
+ else:
+ safe_templates = templates_for_verification
+
+ # Wrap verification in try-except to handle any potential string indices errors
+ try:
+ print(f"DEBUG: Calling verify_flexible_syllable_counts")
+ print(f"DEBUG: Type of lyrics: {type(lyrics)}")
+ print(f"DEBUG: Type of safe_templates: {type(safe_templates)}")
+ print(f"DEBUG: Type of second_level_verification: {type(second_level_verification)}")
+
+ verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification)
+ print(f"DEBUG: Type of verified_lyrics: {type(verified_lyrics)}")
+
+ except Exception as e:
+ print(f"ERROR in verify_flexible_syllable_counts: {str(e)}")
+ # Return the original lyrics if verification fails
+ return {
+ "lyrics": lyrics if isinstance(lyrics, str) else str(lyrics),
+ "rhythm_analysis": f"Error in rhythm analysis: {str(e)}",
+ "syllable_analysis": "Error performing syllable analysis",
+ "prompt_template": "Error generating prompt template"
+ }
+
+ if isinstance(verified_lyrics, str) and "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics:
+ # Extract the original lyrics (before the notes section)
+ original_lyrics = lyrics.split("[Note:")[0].strip() if isinstance(lyrics, str) else str(lyrics)
+
+ # Extract the analysis
+ analysis = verified_lyrics.split("[Note:")[1] if "[Note:" in verified_lyrics else ""
+
+ # If we have serious alignment issues, consider a refinement step
+ if "stress misalignments" in analysis and len(templates_for_verification) > 0:
+ # Add a refinement prompt with the specific analysis
+ refinement_prompt = f"""
+You need to fix rhythm issues in these lyrics. Here's the analysis of the problems:
+
+{analysis}
+
+Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme.
+Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats.
+
+Original lyrics:
+{original_lyrics}
+
+Improved lyrics with fixed rhythm:
+"""
+ # Format as a chat message for refinement
+ refinement_messages = [
+ {"role": "user", "content": refinement_prompt}
+ ]
+
+ # Use standard template for refinement (no thinking mode needed)
+ refinement_text = llm_tokenizer.apply_chat_template(
+ refinement_messages,
+ tokenize=False,
+ add_generation_prompt=True
+ )
+
+ try:
+ # Generate refined lyrics with more focus on rhythm alignment
+ refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device)
+
+ # Use stricter parameters for refinement
+ refinement_params = {
+ "do_sample": True,
+ "temperature": 0.4, # Lower temperature for more precise refinement
+ "top_p": 0.9,
+ "repetition_penalty": 1.3,
+ "max_new_tokens": 1024
+ }
+
+ refined_ids = llm_model.generate(
+ **refinement_inputs,
+ **refinement_params
+ )
+
+ # Extract refined lyrics
+ refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist()
+ refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip()
+
+ # Verify the refined lyrics
+ try:
+ refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification)
+
+ # Only use refined lyrics if they're better (fewer notes)
+ if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics:
+ lyrics = refined_lyrics
+ elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"):
+ lyrics = refined_verified_lyrics
+ else:
+ lyrics = verified_lyrics
+ except Exception as e:
+ print(f"Error in refined lyrics verification: {str(e)}")
+ lyrics = verified_lyrics
+ except Exception as e:
+ print(f"Error in lyrics refinement: {str(e)}")
+ lyrics = verified_lyrics
+ else:
+ # Minor issues, just use the verification notes
+ lyrics = verified_lyrics
+ else:
+ # No significant issues detected
+ lyrics = verified_lyrics
+
+ # Check if we have the [RHYTHM_ANALYSIS_SECTION] tag
+ if "[RHYTHM_ANALYSIS_SECTION]" in lyrics:
+ # Split at our custom marker
+ parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]")
+ clean_lyrics = parts[0].strip()
+ rhythm_analysis = parts[1].strip()
+
+ # Add our standard marker for compatibility with existing code
+ lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis
+
+ # For backwards compatibility - if we have the old format, still handle it
+ elif "[Note: Potential rhythm mismatches" in lyrics:
+ # Keep it as is, the existing parsing code can handle this format
+ pass
+ else:
+ # No analysis found, add a minimal one
+ lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern."
+
+ # Before returning, add syllable analysis and prompt template
+ if isinstance(lyrics, str):
+ # Extract clean lyrics and analysis
+ if "[Note: Rhythm Analysis]" in lyrics:
+ clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
+ rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1]
+ elif "[Note: Potential rhythm mismatches" in lyrics:
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
+ rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1]
+ else:
+ clean_lyrics = lyrics
+ rhythm_analysis = "No rhythm analysis available"
+
+ # Create syllable analysis
+ syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n"
+ if templates_for_verification:
+ syllable_analysis += "Template Analysis:\n"
+ for i, template in enumerate(templates_for_verification):
+ if i < min(len(templates_for_verification), 30): # Limit to 30 to avoid overwhelming output
+ syllable_analysis += f"Line {i+1}:\n"
+ if isinstance(template, dict):
+ if "syllable_template" in template:
+ syllable_analysis += f" Template: {template['syllable_template']}\n"
+ if "syllable_count" in template:
+ syllable_analysis += f" Expected syllables: {template['syllable_count']}\n"
+ elif isinstance(template, str):
+ syllable_analysis += f" Template: {template}\n"
+ syllable_analysis += "\n"
+
+ if len(templates_for_verification) > 30:
+ syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n"
+
+ # Add second-level analysis if available
+ if second_level_verification:
+ syllable_analysis += "\nSecond-Level Template Analysis:\n"
+ for i, template in enumerate(second_level_verification):
+ if i < min(len(second_level_verification), 30): # Limit to 30 seconds
+ syllable_analysis += f"Second {i+1}: {template}\n"
+
+ if len(second_level_verification) > 30:
+ syllable_analysis += f"... and {len(second_level_verification) - 30} more seconds\n"
+
+ # Add structure visualization to syllable analysis
+ syllable_analysis += "\n" + structure_visualization
+
+ # Create prompt template
+ prompt_template = "=== PROMPT TEMPLATE ===\n\n"
+ prompt_template += "Genre: " + genre + "\n"
+ prompt_template += f"Duration: {duration:.1f} seconds\n"
+ prompt_template += f"Tempo: {tempo:.1f} BPM\n"
+ prompt_template += f"Key: {key} {mode}\n"
+ prompt_template += f"Primary Emotion: {primary_emotion}\n"
+ prompt_template += f"Primary Theme: {primary_theme}\n\n"
+ prompt_template += "Syllable Guidance:\n" + syllable_guidance_text
+
+ # Return all components
+ return {
+ "lyrics": clean_lyrics,
+ "rhythm_analysis": rhythm_analysis,
+ "syllable_analysis": syllable_analysis,
+ "prompt_template": prompt_template
+ }
+
+ return {
+ "lyrics": lyrics,
+ "rhythm_analysis": "No rhythm analysis available",
+ "syllable_analysis": "No syllable analysis available",
+ "prompt_template": "No prompt template available"
+ }
+
+def process_audio(audio_file):
+ """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
+ if audio_file is None:
+ return "Please upload an audio file.", None, None
+
+ try:
+ print("Step 1/5: Extracting audio features...")
+ # Extract audio features
+ audio_data = extract_audio_features(audio_file)
+
+ print("Step 2/5: Verifying audio contains music...")
+ # First check if it's music
+ try:
+ is_music, ast_results = detect_music(audio_data)
+ except Exception as e:
+ print(f"Error in music detection: {str(e)}")
+ return f"Error in music detection: {str(e)}", None, ast_results
+
+ if not is_music:
+ return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
+
+ print("Step 3/5: Classifying music genre...")
+ # Classify genre
+ try:
+ top_genres = classify_genre(audio_data)
+ if not isinstance(top_genres, list) or len(top_genres) == 0:
+ # Fallback if we don't have valid top_genres
+ top_genres = [("rock", 1.0)]
+ # Format genre results using utility function
+ genre_results = format_genre_results(top_genres)
+ except Exception as e:
+ print(f"Error in genre classification: {str(e)}")
+ return f"Error in genre classification: {str(e)}", None, ast_results
+
+ # Initialize default values
+ ast_results = ast_results if ast_results else []
+ song_structure = None
+ emotion_results = {
+ "emotion_analysis": {"primary_emotion": "Unknown"},
+ "theme_analysis": {"primary_theme": "Unknown"},
+ "rhythm_analysis": {"tempo": 0},
+ "tonal_analysis": {"key": "Unknown", "mode": ""},
+ "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
+ }
+
+ print("Step 4/5: Analyzing music emotions, themes, and structure...")
+ # Analyze music emotions and themes
+ try:
+ emotion_results = music_analyzer.analyze_music(audio_file)
+ except Exception as e:
+ print(f"Error in emotion analysis: {str(e)}")
+ # Continue with default emotion_results
+
+ # Calculate detailed song structure for better lyrics alignment
+ try:
+ # Load audio data
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
+
+ # Analyze beats and phrases for music-aligned lyrics
+ beats_info = detect_beats(y, sr)
+ sections_info = detect_sections(y, sr)
+
+ # Create structured segments for precise line-by-line matching
+ segments = []
+
+ # Try to break audio into meaningful segments based on sections
+ # Each segment will correspond to one line of lyrics
+ if sections_info and len(sections_info) > 1:
+ min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
+
+ for section in sections_info:
+ section_start = section["start"]
+ section_end = section["end"]
+ section_duration = section["duration"]
+
+ # For very short sections, add as a single segment
+ if section_duration < min_segment_duration * 1.5:
+ segments.append({
+ "start": section_start,
+ "end": section_end
+ })
+ else:
+ # Calculate ideal number of segments for this section
+ # based on its duration - aiming for 2-4 second segments
+ ideal_segment_duration = 3.0 # Target 3 seconds per segment
+ segment_count = max(1, int(section_duration / ideal_segment_duration))
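+ # e.g. a 9 s section → max(1, int(9 / 3.0)) = 3 segments of 3 s each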
+
+ # Create evenly-spaced segments within this section
+ segment_duration = section_duration / segment_count
+ for i in range(segment_count):
+ segment_start = section_start + i * segment_duration
+ segment_end = segment_start + segment_duration
+ segments.append({
+ "start": segment_start,
+ "end": segment_end
+ })
+ # If no good sections found, create segments based on beats
+ elif beats_info and len(beats_info["beat_times"]) > 4:
+ beats = beats_info["beat_times"]
+ time_signature = beats_info.get("time_signature", 4)
+
+ # Target one segment per musical measure (typically 4 beats)
+ measure_size = time_signature
+ for i in range(0, len(beats), measure_size):
+ if i + 1 < len(beats): # Need at least 2 beats for a meaningful segment
+ measure_start = beats[i]
+ # If we have enough beats for the full measure
+ if i + measure_size < len(beats):
+ measure_end = beats[i + measure_size]
+ else:
+ # Use available beats and extrapolate for the last measure
+ if i > 0:
+ beat_interval = beats[i] - beats[i-1]
+ measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i)))
+ else:
+ measure_end = audio_data["duration"]
+
+ segments.append({
+ "start": measure_start,
+ "end": measure_end
+ })
+ # Last resort: simple time-based segments
+ else:
+ # Create segments of approximately 3 seconds each
+ segment_duration = 3.0
+ total_segments = max(4, int(audio_data["duration"] / segment_duration))
+ segment_duration = audio_data["duration"] / total_segments
+
+ for i in range(total_segments):
+ segment_start = i * segment_duration
+ segment_end = segment_start + segment_duration
+ segments.append({
+ "start": segment_start,
+ "end": segment_end
+ })
+
+ # Create flexible structure with the segments
+ flexible_structure = {
+ "beats": beats_info,
+ "segments": segments
+ }
+
+ # Create song structure object
+ song_structure = {
+ "beats": beats_info,
+ "sections": sections_info,
+ "flexible_structure": flexible_structure,
+ "syllables": []
+ }
+
+ # Add syllable counts to each section
+ for section in sections_info:
+ # Create syllable templates for sections
+ section_beats_info = {
+ "beat_times": [beat for beat in beats_info["beat_times"]
+ if section["start"] <= beat < section["end"]],
+ "tempo": beats_info.get("tempo", 120)
+ }
+ if "beat_strengths" in beats_info:
+ section_beats_info["beat_strengths"] = [
+ strength for i, strength in enumerate(beats_info["beat_strengths"])
+ if i < len(beats_info["beat_times"]) and
+ section["start"] <= beats_info["beat_times"][i] < section["end"]
+ ]
+
+ # Get a syllable count based on section duration and tempo
+ syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5)
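+ # e.g. a 10 s section at 120 BPM → int(10 * 2.0 * 1.5) = 30 syllables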
+
+ section_info = {
+ "type": section["type"],
+ "start": section["start"],
+ "end": section["end"],
+ "duration": section["duration"],
+ "syllable_count": syllable_count,
+ "beat_count": len(section_beats_info["beat_times"])
+ }
+
+ # Try to create a more detailed syllable template
+ if len(section_beats_info["beat_times"]) >= 2:
+ # Ensure top_genres is a list with at least one element
+ if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
+ genre_name = top_genres[0][0]
+ else:
+ genre_name = "unknown" # Default genre if top_genres is invalid
+
+ section_info["syllable_template"] = create_flexible_syllable_templates(
+ section_beats_info,
+ genre=genre_name
+ )
+
+ song_structure["syllables"].append(section_info)
+
+ # Add second-level beat analysis
+ try:
+ # Get enhanced beat information with subbeats
+ subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
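+ # subdivision=4 splits each beat interval into four ticks (three
+ # interpolated sub-beats between consecutive main beats)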
+
+ # Map beats to second-level windows
+ sec_map = map_beats_to_seconds(
+ subbeat_info["subbeat_times"],
+ audio_data["duration"]
+ )
+
+ # Create second-level templates
+ # Ensure top_genres is a list with at least one element
+ genre_name = "unknown"
+ if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
+ genre_name = top_genres[0][0]
+
+ second_level_templates = create_second_level_templates(
+ sec_map,
+ subbeat_info["tempo"],
+ genre_name # Use top genre with safety check
+ )
+
+ # Add to song structure
+ song_structure["second_level"] = {
+ "sec_map": sec_map,
+ "templates": second_level_templates
+ }
+
+ except Exception as e:
+ print(f"Error in second-level beat analysis: {str(e)}")
+ # Continue without second-level data
+
+ except Exception as e:
+ print(f"Error analyzing song structure: {str(e)}")
+ # Continue without song structure
+
+ print("Step 5/5: Generating rhythmically aligned lyrics...")
+ # Generate lyrics based on top genre, emotion analysis, and song structure
+ try:
+ # Ensure top_genres is a list with at least one element before accessing
+ primary_genre = "unknown"
+ if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
+ primary_genre, _ = top_genres[0]
+
+ # Sanitize song_structure so malformed entries (e.g. a string where a
+ # dict or list is expected) cannot raise "string indices" errors downstream
+ sanitized_song_structure = None
+ if song_structure:
+ sanitized_song_structure = {}
+
+ # Safely copy beats data
+ if "beats" in song_structure and isinstance(song_structure["beats"], dict):
+ sanitized_song_structure["beats"] = song_structure["beats"]
+
+ # Safely copy sections data
+ if "sections" in song_structure and isinstance(song_structure["sections"], list):
+ sanitized_song_structure["sections"] = song_structure["sections"]
+
+ # Safely handle flexible structure
+ if "flexible_structure" in song_structure and isinstance(song_structure["flexible_structure"], dict):
+ flex_struct = song_structure["flexible_structure"]
+ sanitized_flex = {}
+
+ # Safely handle segments
+ if "segments" in flex_struct and isinstance(flex_struct["segments"], list):
+ sanitized_flex["segments"] = flex_struct["segments"]
+
+ # Safely handle beats
+ if "beats" in flex_struct and isinstance(flex_struct["beats"], dict):
+ sanitized_flex["beats"] = flex_struct["beats"]
+
+ sanitized_song_structure["flexible_structure"] = sanitized_flex
+
+ # Safely handle syllables
+ if "syllables" in song_structure and isinstance(song_structure["syllables"], list):
+ sanitized_song_structure["syllables"] = song_structure["syllables"]
+
+ # Safely handle second-level
+ if "second_level" in song_structure and isinstance(song_structure["second_level"], dict):
+ second_level = song_structure["second_level"]
+ sanitized_second = {}
+
+ if "templates" in second_level and isinstance(second_level["templates"], list):
+ sanitized_second["templates"] = second_level["templates"]
+
+ if "sec_map" in second_level and isinstance(second_level["sec_map"], list):
+ sanitized_second["sec_map"] = second_level["sec_map"]
+
+ sanitized_song_structure["second_level"] = sanitized_second
+
+ try:
+ print("Calling generate_lyrics function...")
+ lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, sanitized_song_structure)
+ print(f"Type of lyrics_result: {type(lyrics_result)}")
+
+ # Handle both old and new return formats with robust type checking
+ if isinstance(lyrics_result, dict) and "lyrics" in lyrics_result:
+ lyrics = lyrics_result.get("lyrics", "No lyrics generated")
+ rhythm_analysis = lyrics_result.get("rhythm_analysis", "No rhythm analysis available")
+ syllable_analysis = lyrics_result.get("syllable_analysis", "No syllable analysis available")
+ prompt_template = lyrics_result.get("prompt_template", "No prompt template available")
+ else:
+ # Convert to string regardless of the type
+ lyrics = str(lyrics_result) if lyrics_result is not None else "No lyrics generated"
+ rhythm_analysis = "No detailed rhythm analysis available"
+ syllable_analysis = "No syllable analysis available"
+ prompt_template = "No prompt template available"
+ except Exception as inner_e:
+ print(f"Inner error in lyrics generation: {str(inner_e)}")
+ # Create a simplified fallback result with just the error message
+ lyrics = f"Error generating lyrics: {str(inner_e)}"
+ rhythm_analysis = "Error in rhythm analysis"
+ syllable_analysis = "Error in syllable analysis"
+ prompt_template = "Error in prompt template generation"
+
+ except Exception as e:
+ print(f"Outer error in lyrics generation: {str(e)}")
+ lyrics = f"Error generating lyrics: {str(e)}"
+ rhythm_analysis = "No rhythm analysis available"
+ syllable_analysis = "No syllable analysis available"
+ prompt_template = "No prompt template available"
+ # Prepare results dictionary with additional rhythm analysis
+ results = {
+ "genre_results": genre_results,
+ "lyrics": lyrics,
+ "rhythm_analysis": rhythm_analysis,
+ "syllable_analysis": syllable_analysis,
+ "prompt_template": prompt_template,
+ "ast_results": ast_results
+ }
+
+ return results
+
+ except Exception as e:
+ error_msg = f"Error processing audio: {str(e)}"
+ print(error_msg)
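+ # Legacy 3-tuple error shape; display_results() inspects element 0 for error text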
+ return error_msg, None, []
+
+def format_complete_beat_timeline(audio_file, lyrics=None):
+ """Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation"""
+ if audio_file is None:
+ return "Please upload an audio file to see beat timeline."
+
+ try:
+ # Extract audio data
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
+
+ # Get beat information
+ beats_info = detect_beats(y, sr)
+
+ # Helper to coerce numpy scalars/arrays to plain Python floats for formatting
+ def ensure_float(value):
+ if isinstance(value, (np.ndarray, np.number)):
+ return float(value)
+ return value
+
+ # Format the timeline with enhanced scientific headers
+ timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n"
+
+ tempo = ensure_float(beats_info['tempo'])
+ tempo_confidence = ensure_float(beats_info.get('tempo_confidence', 90.0))
+ time_sig_confidence = ensure_float(beats_info.get('time_sig_confidence', 85.0))
+ beat_periodicity = ensure_float(beats_info.get('beat_periodicity', 60 / tempo))
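+ # Beat periodicity is seconds per beat (60 / BPM), e.g. 0.5 s at 120 BPM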
+
+ timeline += f"Tempo: {tempo:.1f} BPM (±{tempo_confidence:.1f}%)\n"
+ timeline += f"Time Signature: {beats_info['time_signature']}/4 (Confidence: {time_sig_confidence:.1f}%)\n"
+ timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
+ timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
+ timeline += f"Total Beats: {beats_info['beat_count']}\n"
+
+ # Add musicological context based on tempo classification
+ if tempo < 60:
+ tempo_class = "Largo (very slow)"
+ elif tempo < 76:
+ tempo_class = "Adagio (slow)"
+ elif tempo < 108:
+ tempo_class = "Andante (walking pace)"
+ elif tempo < 132:
+ tempo_class = "Moderato (moderate)"
+ elif tempo < 168:
+ tempo_class = "Allegro (fast)"
+ else:
+ tempo_class = "Presto (very fast)"
+
+ timeline += f"Tempo Classification: {tempo_class}\n\n"
+
+ # Create an enhanced table header with better column descriptions
+ timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n"
+ timeline += "|--------|----------|--------------|------------------|\n"
+
+ # Add beat-by-beat information with improved classification
+ for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])):
+ # Convert numpy values to Python float if needed
+ time = ensure_float(time)
+ strength = ensure_float(strength)
+
+ # More scientific determination of beat type based on both strength and metrical position
+ metrical_position = i % beats_info['time_signature']
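+ # In 4/4 the positions cycle 0,1,2,3: 0 is the downbeat, 2 the secondary accent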
+
+ if metrical_position == 0: # Downbeat (first beat of measure)
+ beat_type = "STRONG"
+ syllable_value = 1.5
+ elif metrical_position == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2:
+ # Secondary strong beat (e.g., beat 3 in 4/4 time)
+ beat_type = "MEDIUM" if strength < 0.8 else "STRONG"
+ syllable_value = 1.0 if strength < 0.8 else 1.5
+ else:
+ # Other beats - classified by actual strength value
+ if strength >= 0.8:
+ beat_type = "STRONG"
+ syllable_value = 1.5
+ elif strength >= 0.5:
+ beat_type = "MEDIUM"
+ syllable_value = 1.0
+ else:
+ beat_type = "WEAK"
+ syllable_value = 1.0
+
+ # Determine pattern letter based on beat type for consistency
+ if beat_type == "STRONG":
+ pattern = "S"
+ elif beat_type == "MEDIUM":
+ pattern = "m"
+ else:
+ pattern = "w"
+
+ # Add row to table with the correct beat classification
+ timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{syllable_value} |\n"
+
+ # No truncation - show all beats
+
+ # Add a visual timeline of beats
+ timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n"
+ timeline += "Each character represents 0.5 seconds. Beats are marked as:\n"
+ timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
+
+ # Calculate total duration and create time markers
+ if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
+ # Get the max value safely
+ max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']])
+ total_duration = max_beat_time + 2 # Add 2 seconds of padding
+ else:
+ total_duration = 30 # Default duration if no beats found
+
+ time_markers = ""
+ for i in range(0, int(total_duration) + 1, 5):
+ time_markers += f"{i:<11}" # 10 beat columns plus the group spacer per 5-second block
+ timeline += time_markers + "(seconds)\n"
+
+ # Create a ruler aligned with the 2-characters-per-second beat row below
+ ruler = ""
+ for i in range(0, int(total_duration) + 1, 5):
+ ruler += "+" + "-" * 9 + " " # "+" marks each 5-second boundary
+ timeline += ruler + "\n"
+
+ # Create a visualization of beats with symbols
+ beat_line = ["·"] * int(total_duration * 2) # 2 characters per second
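+ # 0.5 s per cell: a beat at t seconds lands in cell int(t * 2), e.g. 3.2 s → cell 6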
+
+ for i, time in enumerate(beats_info['beat_times']):
+ if i >= len(beats_info['beat_strengths']):
+ break
+
+ # Convert to float if it's a numpy array
+ time_val = ensure_float(time)
+
+ # Determine position in the timeline
+ pos = int(time_val * 2) # Convert to position in the beat_line
+ if pos >= len(beat_line):
+ continue
+
+ # Determine beat type based on strength and position
+ strength = beats_info['beat_strengths'][i]
+ # Convert to float if it's a numpy array
+ strength = ensure_float(strength)
+
+ if i % beats_info['time_signature'] == 0:
+ beat_line[pos] = "S" # Strong beat at start of measure
+ elif strength >= 0.8:
+ beat_line[pos] = "S" # Strong beat
+ elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2:
+ beat_line[pos] = "m" # Medium beat (e.g. 3rd beat in 4/4), matching the table classification above
+ elif strength >= 0.5:
+ beat_line[pos] = "m" # Medium beat
+ else:
+ beat_line[pos] = "w" # Weak beat
+
+ # Format and add to timeline
+ beat_visualization = ""
+ for i in range(0, len(beat_line), 10):
+ beat_visualization += "".join(beat_line[i:i+10])
+ if i + 10 < len(beat_line):
+ beat_visualization += " " # Add space every 5 seconds
+ timeline += beat_visualization + "\n\n"
+
+ # Add measure markers
+ timeline += "=== MEASURE MARKERS ===\n\n"
+
+ # Create a list to track measure start times
+ measure_starts = []
+ for i, time in enumerate(beats_info['beat_times']):
+ if i % beats_info['time_signature'] == 0: # Start of measure
+ # Convert to float if it's a numpy array
+ time_val = ensure_float(time)
+ measure_starts.append((i // beats_info['time_signature'] + 1, time_val))
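+ # e.g. in 4/4 at 120 BPM a new measure begins every 4 beats ≈ every 2.0 s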
+
+ # Format measure information
+ if measure_starts:
+ timeline += "| Measure # | Start Time | Duration |\n"
+ timeline += "|-----------|------------|----------|\n"
+
+ for i in range(len(measure_starts)):
+ measure_num, start_time = measure_starts[i]
+
+ # Calculate end time (start of next measure or end of song)
+ if i < len(measure_starts) - 1:
+ end_time = measure_starts[i+1][1]
+ elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
+ # Get the last beat time and convert to float if needed
+ last_beat = beats_info['beat_times'][-1]
+ end_time = ensure_float(last_beat)
+ else:
+ end_time = start_time + 2.0 # Default 2 seconds if no next measure
+
+ duration = end_time - start_time
+
+ timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n"
+
+ # No truncation - show all measures
+
+ # Add phrase information
+ if 'phrases' in beats_info and beats_info['phrases']:
+ timeline += "\n=== MUSICAL PHRASES ===\n\n"
+ for i, phrase in enumerate(beats_info['phrases']):
+ # Show all phrases, not just the first 10
+ # Skip empty phrases and guard against an empty beat list
+ if not phrase or not beats_info['beat_times']:
+ continue
+
+ start_beat = min(phrase[0], len(beats_info['beat_times'])-1)
+ end_beat = min(phrase[-1], len(beats_info['beat_times'])-1)
+
+ # Convert to float if needed
+ phrase_start = ensure_float(beats_info['beat_times'][start_beat])
+ phrase_end = ensure_float(beats_info['beat_times'][end_beat])
+
+ timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n"
+
+ # Create syllable template for this phrase with simplified numpy handling
+ phrase_beats = {
+ "beat_times": [ensure_float(beats_info['beat_times'][j])
+ for j in phrase if j < len(beats_info['beat_times'])],
+ "beat_strengths": [ensure_float(beats_info['beat_strengths'][j])
+ for j in phrase if j < len(beats_info['beat_strengths'])],
+ "tempo": ensure_float(beats_info['tempo']),
+ "time_signature": beats_info['time_signature'],
+ "phrases": [list(range(len(phrase)))]
+ }
+
+ template = create_flexible_syllable_templates(phrase_beats)
+ timeline += f" Syllable Template: {template}\n"
+
+ # Create a visual representation of this phrase
+ if phrase_start < total_duration and phrase_end < total_duration:
+ # Create a timeline for this phrase
+ phrase_visualization = ["·"] * int(total_duration * 2)
+
+ # Mark the phrase boundaries
+ start_pos = int(phrase_start * 2)
+ end_pos = int(phrase_end * 2)
+
+ if start_pos < len(phrase_visualization):
+ phrase_visualization[start_pos] = "["
+
+ if end_pos < len(phrase_visualization):
+ phrase_visualization[end_pos] = "]"
+
+ # Mark the beats in this phrase
+ for j in phrase:
+ if j < len(beats_info['beat_times']):
+ beat_time = ensure_float(beats_info['beat_times'][j])
+ beat_pos = int(beat_time * 2)
+
+ if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos:
+ # Determine beat type
+ if j % beats_info['time_signature'] == 0:
+ phrase_visualization[beat_pos] = "S"
+ elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2:
+ phrase_visualization[beat_pos] = "m"
+ else:
+ phrase_visualization[beat_pos] = "w"
+
+ # Format and add visualization
+ phrase_visual = ""
+ for k in range(0, len(phrase_visualization), 10):
+ phrase_visual += "".join(phrase_visualization[k:k+10])
+ if k + 10 < len(phrase_visualization):
+ phrase_visual += " "
+
+ timeline += f" Timeline: {phrase_visual}\n\n"
+
+ # Add second-level script display
+ try:
+ # Get second-level beat information
+ subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4)
+ duration = librosa.get_duration(y=y, sr=sr)
+
+ # Map to seconds
+ sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration)
+
+ # Create templates
+ templates = create_second_level_templates(sec_map, subbeat_info["tempo"])
+
+ # Add to timeline
+ timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n"
+ timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n"
+ timeline += "| Second | Beat Pattern | Lyric Content |\n"
+ timeline += "|--------|-------------|---------------|\n"
+
+ # Get clean lyrics (without analysis notes)
+ clean_lyrics = lyrics
+ if isinstance(lyrics, str):
+ if "[Note: Rhythm Analysis]" in lyrics:
+ clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
+ elif "[Note: Potential rhythm mismatches" in lyrics:
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
+
+ # Get lyric lines
+ lines = clean_lyrics.strip().split('\n') if clean_lyrics else []
+
+ for i, template in enumerate(templates):
+ # Get corresponding lyric line if available
+ lyric = lines[i] if i < len(lines) else ""
+ if lyric.startswith('[') and ']' in lyric:
+ lyric = "" # Skip section headers
+
+ # Format nicely for display
+ timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n"
+
+ # Add ASCII visualization of second-level beats
+ timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n"
+ timeline += "Each row represents ONE SECOND. Beat types:\n"
+ timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
+
+ for i, window in enumerate(sec_map):
+ beats = window["beats"]
+
+ # Create ASCII visualization
+ beat_viz = ["·"] * 20 # 20 columns for visualization
+
+ for beat in beats:
+ # Calculate position in visualization
+ pos = int(beat["relative_pos"] * 19) # Map 0-1 to 0-19
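+ # e.g. a beat halfway through its one-second window maps to column int(0.5 * 19) = 9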
+ if 0 <= pos < len(beat_viz):
+ # Set marker based on beat type
+ if beat["type"] == "main":
+ beat_viz[pos] = "S"
+ elif beat["strength"] >= 0.7:
+ beat_viz[pos] = "m"
+ else:
+ beat_viz[pos] = "w"
+
+ # Get corresponding lyric
+ lyric = lines[i] if i < len(lines) else ""
+ if lyric.startswith('[') and ']' in lyric:
+ lyric = ""
+
+ # Format visualization line
+ viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]"
+ if lyric:
+ viz_line += f" → {lyric[:40]}"
+
+ timeline += viz_line + "\n"
+
+ except Exception as e:
+ timeline += f"\n[Error generating second-level analysis: {str(e)}]"
+
+ # Add a section showing alignment if lyrics were generated
+ if lyrics and isinstance(lyrics, str):
+ timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n"
+ # Remove rhythm analysis notes from lyrics if present
+ if "[Note:" in lyrics:
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
+ else:
+ clean_lyrics = lyrics
+
+ lines = clean_lyrics.strip().split('\n')
+
+ # Show alignment for ALL lines, not just the first 10
+ for i, line in enumerate(lines):
+ if not line.strip() or line.startswith('['):
+ continue
+
+ timeline += f"Line: \"{line}\"\n"
+
+ # Count syllables
+ syllable_count = count_syllables(line)
+ timeline += f" Syllables: {syllable_count}\n"
+
+ # Create adaptive phrase matching - if we don't have a direct phrase match,
+ # try to find the closest matching phrase by time or measure
+ matching_phrase = None
+ if 'phrases' in beats_info and beats_info['phrases']:
+ # First try direct index matching
+ if i < len(beats_info['phrases']) and beats_info['phrases'][i]:
+ matching_phrase = beats_info['phrases'][i]
+ else:
+ # If no direct match, try to find a phrase by musical position
+ # Calculate which section of the song we're in
+ if len(beats_info['phrases']) > 0:
+ section_size = max(1, len(beats_info['phrases']) // 4)
+ section_index = min(i // section_size, 3) # Limit to 4 sections
+ section_start = section_index * section_size
+ section_end = min(section_start + section_size, len(beats_info['phrases']))
+
+ # Try to find a phrase within this section
+ candidate_phrases = [phrase for j, phrase in enumerate(beats_info['phrases'])
+ if section_start <= j < section_end and phrase]
+
+ if candidate_phrases:
+ matching_phrase = candidate_phrases[min(i % section_size, len(candidate_phrases)-1)]
+ elif beats_info['phrases']:
+ # Fallback to cycling through available phrases
+ phrase_index = i % len(beats_info['phrases'])
+ if beats_info['phrases'][phrase_index]:
+ matching_phrase = beats_info['phrases'][phrase_index]
+
+ # Show timing and detailed alignment if we found a matching phrase
+ if matching_phrase and beats_info['beat_times']:
+ # Indices are clamped below, so out-of-range phrase entries cannot crash the lookup
+ start_beat = min(matching_phrase[0], len(beats_info['beat_times'])-1)
+ end_beat = min(matching_phrase[-1], len(beats_info['beat_times'])-1)
+
+ start_time = ensure_float(beats_info['beat_times'][start_beat])
+ end_time = ensure_float(beats_info['beat_times'][end_beat])
+
+ timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n"
+
+ # Create an enhanced visualization of syllable alignment
+ timeline += " Alignment: "
+
+ # Create a timeline focused on just this phrase
+ phrase_duration = end_time - start_time
+ syllable_viz = []
+
+ # Initialize with beat markers for this phrase using improved algorithm
+ for j, beat_idx in enumerate(matching_phrase):
+ if beat_idx < len(beats_info['beat_times']):
+ beat_time = ensure_float(beats_info['beat_times'][beat_idx])
+
+ # Handle edge case where phrase_duration is very small
+ if phrase_duration > 0.001: # Avoid division by very small numbers
+ # Use non-linear mapping for more musical alignment
+ # This accounts for natural speech rhythms not being strictly linear
+ normalized_pos = (beat_time - start_time) / phrase_duration
+ # Apply slight curve to map syllable positions more naturally
+ curved_pos = min(1.0, normalized_pos * (1.0 + 0.1 * (normalized_pos - 0.5)))
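+ # e.g. normalized_pos 0.8 → 0.8 * (1 + 0.1 * 0.3) = 0.824, nudging
+ # late-phrase syllables slightly later than a purely linear mapping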
+ relative_pos = int(curved_pos * syllable_count)
+ else:
+ relative_pos = j # Default to sequential if duration is too small
+
+ # Ensure we have enough space
+ while len(syllable_viz) <= relative_pos:
+ syllable_viz.append("·")
+
+ # Determine beat type with metrical context
+ metrical_pos = beat_idx % beats_info['time_signature']
+ beat_strength = beats_info['beat_strengths'][beat_idx] if beat_idx < len(beats_info['beat_strengths']) else 0
+
+ if metrical_pos == 0 or beat_strength >= 0.8:
+ syllable_viz[relative_pos] = "S" # Strong beat
+ elif metrical_pos == beats_info['time_signature'] // 2 or beat_strength >= 0.5:
+ syllable_viz[relative_pos] = "m" # Medium beat
+ else:
+ syllable_viz[relative_pos] = "w" # Weak beat
+
+ # Fill in any gaps
+ while len(syllable_viz) < syllable_count:
+ syllable_viz.append("·")
+
+ # Trim if too long
+ syllable_viz = syllable_viz[:syllable_count]
+
+ # Add alignment visualization with word stress analysis
+ timeline += "".join(syllable_viz) + "\n"
+
+ # Add word stress analysis
+ words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
+ if words:
+ word_stresses = []
+ cumulative_syllables = 0
+
+ for word in words:
+ syllable_count_word = count_syllables_for_word(word)
+ stress_pattern = get_word_stress(word)
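+ # stress_pattern is a per-syllable digit string ("1" = stressed, "0" = unstressed),
+ # e.g. "guitar" → "01": the second syllable carries the primary stress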
+
+ # Ensure stress pattern is as long as syllable count
+ while len(stress_pattern) < syllable_count_word:
+ stress_pattern += "0"
+
+ for j in range(syllable_count_word):
+ stress_char = "S" if j < len(stress_pattern) and stress_pattern[j] == "1" else "_"
+ word_stresses.append(stress_char)
+
+ cumulative_syllables += syllable_count_word
+
+ # Add word stress information
+ timeline += " Word stress: " + "".join(word_stresses) + "\n"
+
+ # Check if stressed syllables align with strong beats
+ alignment_score = 0
+ alignment_issues = []
+
+ for j, (stress, beat) in enumerate(zip(word_stresses, syllable_viz)):
+ if (stress == "S" and beat == "S") or (stress != "S" and beat != "S"):
+ alignment_score += 1
+ elif stress == "S" and beat != "S":
+ alignment_issues.append(f"Syllable {j+1} has stress but weak beat")
+ elif stress != "S" and beat == "S":
+ alignment_issues.append(f"Syllable {j+1} has no stress but strong beat")
+
+ if word_stresses:
+ alignment_percent = (alignment_score / len(word_stresses)) * 100
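+ # e.g. 6 matching positions out of 8 syllables → 75.0% stress alignment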
+ timeline += f" Stress alignment: {alignment_percent:.1f}% match\n"
+
+ if alignment_issues and len(alignment_issues) <= 3:
+ timeline += " Issues: " + "; ".join(alignment_issues) + "\n"
+ else:
+ timeline += " No matching phrase found for alignment\n"
+
+ timeline += "\n"
+
+ return timeline
+
+ except Exception as e:
+ print(f"Error generating complete beat timeline: {str(e)}")
+ return f"Error generating complete beat timeline: {str(e)}"
+
+def display_results(audio_file):
+ """Process audio file and return formatted results for display in the UI."""
+ # Default error response
+ error_response = ("Please upload an audio file.",
+ "No emotion analysis available.",
+ "No audio classification available.",
+ "No lyrics generated.",
+ "No beat timeline available.")
+
+ if audio_file is None:
+ return error_response
+
+ try:
+ # Process audio and get results
+ results = process_audio(audio_file)
+
+ # Check if we got an error message
+ if isinstance(results, str) and "Error" in results:
+ return results, *error_response[1:]
+ elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
+ return results[0], *error_response[1:]
+
+ # Extract results
+ if isinstance(results, dict):
+ # New format
+ genre_results = results.get("genre_results", "Genre classification failed")
+ lyrics = results.get("lyrics", "Lyrics generation failed")
+ ast_results = results.get("ast_results", [])
+ else:
+ # Old tuple format
+ genre_results, lyrics, ast_results = results
+
+ # Get clean lyrics (without analysis notes)
+ clean_lyrics = lyrics
+ if isinstance(lyrics, str):
+ if "[Note: Rhythm Analysis]" in lyrics:
+ clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
+ elif "[Note: Potential rhythm mismatches" in lyrics:
+ clean_lyrics = lyrics.split("[Note:")[0].strip()
+
+ # Generate beat timeline - use the complete timeline function that shows all beats
+ beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics)
+
+ # Format emotion analysis results
+ emotion_text = "No emotion analysis available."
+ try:
+ emotion_results = music_analyzer.analyze_music(audio_file)
+ emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
+ f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
+ f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
+ f"Primary Theme: {emotion_results['summary']['primary_theme']}")
+
+ # Keep basic beat analysis without section information
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
+ beats_info = detect_beats(y, sr)
+
+ # Add beat analysis info
+ emotion_text += f"\n\nBeat Analysis:\n"
+ emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n"
+ emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
+ emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
+
+ except Exception as e:
+ print(f"Error in emotion analysis: {str(e)}")
+
+ # Format audio classification results
+ ast_text = "No valid audio classification results available."
+ if ast_results and isinstance(ast_results, list):
+ ast_text = "Audio Classification Results:\n"
+ for result in ast_results[:5]: # Show top 5 results
+ ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
+
+ # Return all results
+ return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline
+
+ except Exception as e:
+ error_msg = f"Error: {str(e)}"
+ print(error_msg)
+ return error_msg, *error_response[1:]
+
+# Create enhanced Gradio interface with tabs for better organization
+with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
+ gr.Markdown("# Music Genre Classifier & Lyrics Generator")
+ gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.")
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ audio_input = gr.Audio(label="Upload Music", type="filepath")
+ submit_btn = gr.Button("Analyze & Generate", variant="primary")
+
+ # Add genre info box
+ with gr.Accordion("About Music Genres", open=False):
+ gr.Markdown("""
+ The system recognizes various music genres including:
+ - Pop, Rock, Hip-Hop, R&B
+ - Electronic, Dance, Techno, House
+ - Jazz, Blues, Classical
+ - Folk, Country, Acoustic
+ - Metal, Punk, Alternative
+ - And many others!
+
+ For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
+ """)
+
+ with gr.Column(scale=2):
+ # Use tabs for better organization of outputs
+ with gr.Tabs():
+ with gr.TabItem("Analysis Results"):
+ genre_output = gr.Textbox(label="Detected Genres", lines=4)
+
+ # Create 2 columns for emotion and audio classification
+ with gr.Row():
+ with gr.Column():
+ emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8)
+ with gr.Column():
+ ast_output = gr.Textbox(label="Audio Classification", lines=8)
+
+ with gr.TabItem("Generated Lyrics"):
+ lyrics_output = gr.Textbox(label="Lyrics", lines=18)
+
+ with gr.TabItem("Beat & Syllable Timeline"):
+ beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
+
+ # Connect the button to the display function with updated outputs
+ submit_btn.click(
+ fn=display_results,
+ inputs=[audio_input],
+ outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
+ )
+
+ # Enhanced explanation of how the system works
+ with gr.Accordion("How it works", open=False):
+ gr.Markdown("""
+ ## Advanced Lyrics Generation Process
+
+ 1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models.
+
+ 2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio.
+
+ 3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music.
+
+ 4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying:
+ - Strong and weak beats
+ - Natural phrase boundaries
+ - Time signature and tempo variations
+ - Beat subdivisions (half and quarter beats)
+
+ 5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating precise templates for tight alignment.
+
+ 6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect:
+ - Beat stress patterns (strong, medium, weak)
+ - Appropriate syllable counts based on tempo
+ - Genre-specific rhythmic qualities
+ - Half-beat and quarter-beat subdivisions
+
+ 7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
+ - Match the emotional quality of the music
+ - Follow the precise syllable templates for each second
+ - Align stressed syllables with strong beats
+ - Maintain genre-appropriate style and themes
+
+ 8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
+ - Syllable count accuracy
+ - Stress alignment with strong beats
+ - Word stress patterns
+ - Second-by-second alignment precision
+
+ 9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment.
+
+ This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
+ """)
+
+# Launch the app
+demo.launch()
\ No newline at end of file