diff --git "a/appp.py" "b/appp.py" new file mode 100644--- /dev/null +++ "b/appp.py" @@ -0,0 +1,3950 @@ +import os +import io +import gradio as gr +import torch +import numpy as np +import re +import pronouncing # Add this to requirements.txt for syllable counting +import functools # Add this for lru_cache functionality +from transformers import ( + AutoModelForAudioClassification, + AutoFeatureExtractor, + AutoTokenizer, + pipeline, + AutoModelForCausalLM, + BitsAndBytesConfig +) +from huggingface_hub import login +from utils import ( + load_audio, + extract_audio_duration, + extract_mfcc_features, + format_genre_results, + ensure_cuda_availability +) +from emotionanalysis import MusicAnalyzer +import librosa + +# Login to Hugging Face Hub if token is provided +if "HF_TOKEN" in os.environ: + login(token=os.environ["HF_TOKEN"]) + +# Constants +GENRE_MODEL_NAME = "dima806/music_genres_classification" +MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593" +LLM_MODEL_NAME = "Qwen/Qwen3-14B" +SAMPLE_RATE = 22050 # Standard sample rate for audio processing + +# Check CUDA availability (for informational purposes) +CUDA_AVAILABLE = ensure_cuda_availability() + +# Create music detection pipeline +print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}") +try: + music_detector = pipeline( + "audio-classification", + model=MUSIC_DETECTION_MODEL, + device=0 if CUDA_AVAILABLE else -1 + ) + print("Successfully loaded music detection pipeline") +except Exception as e: + print(f"Error creating music detection pipeline: {str(e)}") + # Fallback to manual loading + try: + music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL) + music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL) + print("Successfully loaded music detection model and feature extractor") + except Exception as e2: + print(f"Error loading music detection model components: {str(e2)}") + raise RuntimeError(f"Could not load music detection model: {str(e2)}") + +# Create genre classification pipeline +print(f"Loading audio classification model: {GENRE_MODEL_NAME}") +try: + genre_classifier = pipeline( + "audio-classification", + model=GENRE_MODEL_NAME, + device=0 if CUDA_AVAILABLE else -1 + ) + print("Successfully loaded audio classification pipeline") +except Exception as e: + print(f"Error creating pipeline: {str(e)}") + # Fallback to manual loading + try: + genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) + genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME) + print("Successfully loaded audio classification model and feature extractor") + except Exception as e2: + print(f"Error loading model components: {str(e2)}") + raise RuntimeError(f"Could not load genre classification model: {str(e2)}") + +# Load LLM with appropriate quantization for T4 GPU +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, +) + +llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) +llm_model = AutoModelForCausalLM.from_pretrained( + LLM_MODEL_NAME, + device_map="auto", + quantization_config=bnb_config, + torch_dtype=torch.float16, +) + +# Create LLM pipeline +llm_pipeline = pipeline( + "text-generation", + model=llm_model, + tokenizer=llm_tokenizer, + max_new_tokens=512, +) + +# Initialize music emotion analyzer +music_analyzer = MusicAnalyzer() + +# New global function moved outside of verify_flexible_syllable_counts +@functools.lru_cache(maxsize=512) +def 
cached_phones_for_word(word): + """Get word pronunciations with caching for better performance.""" + return pronouncing.phones_for_word(word) + +@functools.lru_cache(maxsize=512) +def count_syllables_for_word(word): + """Count syllables in a single word with caching for performance.""" + # Try using pronouncing library first + pronunciations = cached_phones_for_word(word.lower()) + if pronunciations: + return pronouncing.syllable_count(pronunciations[0]) + + # Fallback method for words not in the pronouncing dictionary + vowels = "aeiouy" + word = word.lower() + count = 0 + prev_is_vowel = False + + for char in word: + is_vowel = char in vowels + if is_vowel and not prev_is_vowel: + count += 1 + prev_is_vowel = is_vowel + + # Handle special cases + if word.endswith('e') and not word.endswith('le'): + count -= 1 + if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: + count += 1 + if count == 0: + count = 1 + + return count + +@functools.lru_cache(maxsize=512) +def get_word_stress(word): + """Get the stress pattern for a word with improved fallback handling.""" + pronunciations = cached_phones_for_word(word.lower()) + if pronunciations: + return pronouncing.stresses(pronunciations[0]) + + # Enhanced fallback for words not in the dictionary + syllables = count_syllables_for_word(word) + + # Common English stress patterns by word length + if syllables == 1: + return "1" # Single syllable words are stressed + elif syllables == 2: + # Most 2-syllable nouns and adjectives stress first syllable + # Common endings that indicate second-syllable stress + second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"] + if any(word.endswith(ending) for ending in second_syllable_stress): + return "01" + else: + return "10" # Default for 2-syllable words + elif syllables == 3: + # Common endings for specific stress patterns in 3-syllable words + if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): + return "100" # First syllable stress + elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): + return "010" # Middle syllable stress + else: + return "100" # Default for 3-syllable words + else: + # For longer words, use common English patterns + return "1" + "0" * (syllables - 1) + +# New function: Count syllables in text +def count_syllables(text): + """Count syllables in a given text using the pronouncing library.""" + words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) + syllable_count = 0 + + for word in words: + syllable_count += count_syllables_for_word(word) + + return syllable_count + +def extract_audio_features(audio_file): + """Extract audio features from an audio file.""" + try: + # Load the audio file using utility function + y, sr = load_audio(audio_file, SAMPLE_RATE) + + if y is None or sr is None: + raise ValueError("Failed to load audio data") + + # Get audio duration in seconds + duration = extract_audio_duration(y, sr) + + # Extract MFCCs for genre classification (may not be needed with the pipeline) + mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20) + + return { + "features": mfccs_mean, + "duration": duration, + "waveform": y, + "sample_rate": sr, + "path": audio_file # Keep path for the pipeline + } + except Exception as e: + print(f"Error extracting audio features: {str(e)}") + raise ValueError(f"Failed to extract audio features: {str(e)}") + +def classify_genre(audio_data): + """Classify the genre of the audio using the loaded model.""" + try: + # First attempt: Try using the pipeline if 
available + if 'genre_classifier' in globals(): + results = genre_classifier(audio_data["path"]) + # Transform pipeline results to our expected format + top_genres = [(result["label"], result["score"]) for result in results[:3]] + return top_genres + + # Second attempt: Use manually loaded model components + elif 'genre_processor' in globals() and 'genre_model' in globals(): + # Process audio input with feature extractor + inputs = genre_processor( + audio_data["waveform"], + sampling_rate=audio_data["sample_rate"], + return_tensors="pt" + ) + + with torch.no_grad(): + outputs = genre_model(**inputs) + predictions = outputs.logits.softmax(dim=-1) + + # Get the top 3 genres + values, indices = torch.topk(predictions, 3) + + # Map indices to genre labels + genre_labels = genre_model.config.id2label + + top_genres = [] + for i, (value, index) in enumerate(zip(values[0], indices[0])): + genre = genre_labels[index.item()] + confidence = value.item() + top_genres.append((genre, confidence)) + + return top_genres + + else: + raise ValueError("No genre classification model available") + + except Exception as e: + print(f"Error in genre classification: {str(e)}") + # Fallback: return a default genre if everything fails + return [("rock", 1.0)] + +def detect_music(audio_data): + """Detect if the audio is music using the MIT AST model.""" + try: + # First attempt: Try using the pipeline if available + if 'music_detector' in globals(): + results = music_detector(audio_data["path"]) + # Look for music-related classes in the results + music_confidence = 0.0 + for result in results: + label = result["label"].lower() + if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): + music_confidence = max(music_confidence, result["score"]) + return music_confidence >= 0.2, results + + # Second attempt: Use manually loaded model components + elif 'music_processor' in globals() and 'music_model' in globals(): + # Process audio input with feature extractor + inputs = music_processor( + audio_data["waveform"], + sampling_rate=audio_data["sample_rate"], + return_tensors="pt" + ) + + with torch.no_grad(): + outputs = music_model(**inputs) + predictions = outputs.logits.softmax(dim=-1) + + # Get the top predictions + values, indices = torch.topk(predictions, 5) + + # Map indices to labels + labels = music_model.config.id2label + + # Check for music-related classes + music_confidence = 0.0 + results = [] + + for i, (value, index) in enumerate(zip(values[0], indices[0])): + label = labels[index.item()].lower() + score = value.item() + results.append({"label": label, "score": score}) + + if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): + music_confidence = max(music_confidence, score) + + return music_confidence >= 0.2, results + + else: + raise ValueError("No music detection model available") + + except Exception as e: + print(f"Error in music detection: {str(e)}") + return False, [] + +def detect_beats(y, sr): + """Enhanced beat detection with adaptive threshold analysis, improved time signature detection and scientific confidence metrics.""" + # STEP 1: Improved pre-processing with robustness for quiet sections + # Apply a small floor to avoid division-by-zero issues + y = np.clip(y, 1e-10, None) # Prevent extreme quiet sections from causing NaN + + # Separate harmonic and percussive components + y_harmonic, y_percussive = librosa.effects.hpss(y) + + # Generate multiple onset envelopes with smoothing for stability + onset_env_full = 
librosa.onset.onset_strength(y=y, sr=sr) + onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr) + + # Apply small smoothing to handle quiet sections + onset_env_full = np.maximum(onset_env_full, 1e-6) # Minimum threshold to avoid NaN + onset_env_perc = np.maximum(onset_env_perc, 1e-6) + + # Create weighted combination + combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7 + + # STEP 2: Multi-strategy tempo and beat detection with confidence tracking + tempo_candidates = [] + beat_candidates = [] + consistency_metrics = [] + + # Strategy 1: Standard detection + tempo1, beats1 = librosa.beat.beat_track( + onset_envelope=combined_onset, + sr=sr, + tightness=100 # More sensitive tracking + ) + tempo_candidates.append(tempo1) + beat_candidates.append(beats1) + + # Calculate autocorrelation-based confidence for this tempo + ac = librosa.autocorrelate(combined_onset) + estimated_period = int(sr * 60.0 / (tempo1 * librosa.get_duration(y=y, sr=sr) / len(combined_onset))) + if estimated_period < len(ac) and estimated_period > 0: + # Measure peak height relative to surroundings + local_ac = ac[max(0, estimated_period-5):min(len(ac), estimated_period+6)] + if np.max(local_ac) > 0: + tempo1_confidence = ac[estimated_period] / np.max(local_ac) + else: + tempo1_confidence = 0.5 + else: + tempo1_confidence = 0.5 + consistency_metrics.append(tempo1_confidence) + + # Strategy 2: Try with different tempo range for complex signatures + tempo2, beats2 = librosa.beat.beat_track( + onset_envelope=combined_onset, + sr=sr, + tightness=100, + start_bpm=60 # Lower starting BPM helps find different time signatures + ) + tempo_candidates.append(tempo2) + beat_candidates.append(beats2) + + # Calculate confidence for the second tempo estimate + estimated_period2 = int(sr * 60.0 / (tempo2 * librosa.get_duration(y=y, sr=sr) / len(combined_onset))) + if estimated_period2 < len(ac) and estimated_period2 > 0: + local_ac2 = ac[max(0, estimated_period2-5):min(len(ac), estimated_period2+6)] + if np.max(local_ac2) > 0: + tempo2_confidence = ac[estimated_period2] / np.max(local_ac2) + else: + tempo2_confidence = 0.5 + else: + tempo2_confidence = 0.5 + consistency_metrics.append(tempo2_confidence) + + # Strategy 3: Use dynamic programming for beat tracking + try: + tempo3, beats3 = librosa.beat.beat_track( + onset_envelope=combined_onset, + sr=sr, + tightness=300, # Higher tightness for more structured detection + trim=False + ) + tempo_candidates.append(tempo3) + beat_candidates.append(beats3) + + # Calculate DP-based confidence + if len(beats3) > 1: + beat_times3 = librosa.frames_to_time(beats3, sr=sr) + intervals3 = np.diff(beat_times3) + tempo3_consistency = 1.0 / (1.0 + np.std(intervals3)/np.mean(intervals3)) if np.mean(intervals3) > 0 else 0.5 + else: + tempo3_consistency = 0.5 + consistency_metrics.append(tempo3_consistency) + except Exception: + # Skip if this approach fails + pass + + # Select the best strategy based on improved consistency measurement + beat_consistency = [] + for i, beats in enumerate(beat_candidates): + if len(beats) <= 1: + beat_consistency.append(0) + continue + + times = librosa.frames_to_time(beats, sr=sr) + intervals = np.diff(times) + + # Comprehensive consistency metrics with better statistical justification + if np.mean(intervals) > 0: + # Combine coefficient of variation with autocorrelation confidence + cv = np.std(intervals)/np.mean(intervals) # Lower is better + + # Add adjustments for beat count reasonability + duration = librosa.get_duration(y=y, sr=sr) + 
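+            # Worked example of the ratio heuristic below (comment only): at
+            # 120 BPM over a 30 s clip we expect 30 * 120 / 60 = 60 beats; if a
+            # candidate produced only 45, beats_ratio = min(45/60, 60/45) = 0.75.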
expected_beats = duration * tempo_candidates[i] / 60 + beats_ratio = min(len(beats) / expected_beats, expected_beats / len(beats)) if expected_beats > 0 else 0.5 + + # Combine metrics with scientific weighting + consistency = (0.7 * (1.0 / (1.0 + cv))) + (0.3 * consistency_metrics[i]) + (0.2 * beats_ratio) + beat_consistency.append(consistency) + else: + beat_consistency.append(0) + + # Select best model with scientific confidence calculation + if beat_consistency: + best_idx = np.argmax(beat_consistency) + best_confidence = beat_consistency[best_idx] * 100 # Convert to percentage + else: + best_idx = 0 + best_confidence = 50.0 # Default 50% confidence if no good metrics + + tempo = tempo_candidates[best_idx] + beat_frames = beat_candidates[best_idx] + + # Calculate beat entropy - scientific measure of beat pattern predictability + beat_entropy = 0.0 + if len(beat_frames) > 2: + times = librosa.frames_to_time(beat_frames, sr=sr) + intervals = np.diff(times) + + # Quantize intervals to detect patterns + if len(intervals) > 0 and np.std(intervals) > 0: + quantized = np.round(intervals / np.min(intervals)) + # Count frequencies of each interval type + unique, counts = np.unique(quantized, return_counts=True) + probs = counts / np.sum(counts) + # Calculate Shannon entropy + beat_entropy = -np.sum(probs * np.log2(probs)) + + # STEP 3: Improved beat strength extraction + beat_times = librosa.frames_to_time(beat_frames, sr=sr) + + # Vectorized extraction of beat strengths with improved error handling + beat_strengths = [] + if len(beat_frames) > 0: + # Filter out beat frames that exceed the onset envelope length + valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)] + if valid_frames: + # Vectorized extraction with normalization for consistency + raw_strengths = combined_onset[valid_frames] + + # Normalize strengths to [0,1] for scientific consistency + if np.max(raw_strengths) > 0: + normalized_strengths = raw_strengths / np.max(raw_strengths) + else: + normalized_strengths = np.ones_like(raw_strengths) + + beat_strengths = normalized_strengths.tolist() + + # Handle remaining beats with interpolation instead of constant values + if len(beat_times) > len(beat_strengths): + missing_count = len(beat_times) - len(beat_strengths) + # Use linear interpolation for more scientific approach + if beat_strengths: + last_strength = beat_strengths[-1] + decay_factor = 0.9 # Gradual decay for trailing beats + beat_strengths.extend([last_strength * (decay_factor ** (i+1)) + for i in range(missing_count)]) + else: + beat_strengths = [1.0] * len(beat_times) + else: + beat_strengths = [1.0] * len(beat_times) + else: + beat_strengths = [1.0] * len(beat_times) + + # STEP 4: Calculate intervals between beats + intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else [] + + # STEP 5: Improved time signature detection with scientific confidence + # Start with default assumption + time_signature = 4 + time_sig_confidence = 70.0 # Default confidence + + if len(beat_strengths) > 8: + # Use autocorrelation to find periodicity in beat strengths + if len(beat_strengths) > 4: + # Normalize beat strengths for better pattern detection + norm_strengths = np.array(beat_strengths) + if np.max(norm_strengths) > 0: + norm_strengths = norm_strengths / np.max(norm_strengths) + + # Compute autocorrelation to find periodic patterns (N) + ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2) + + # Find peaks in autocorrelation (indicates periodicity) + if len(ac) > 3: # Need 
enough data for peak picking + # Find peaks after lag 0 + peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1) + peaks = peaks + 1 # Adjust for the removed lag 0 + + if len(peaks) > 0: + # Get the first significant peak position (cycle length N) + peak_idx = peaks[0] + N = peak_idx + + # Calculate confidence based on peak prominence + if peak_idx < len(ac): + peak_height = ac[peak_idx] + local_prominence = peak_height / np.mean(ac[max(0, peak_idx-2):min(len(ac), peak_idx+3)]) + time_sig_confidence = min(95, 60 + 35 * local_prominence) # Scale between 60-95% + + # Map common cycle lengths to time signatures with improved musical theory + if N == 2: + time_signature = 2 # Clear binary meter (2/4, 2/2, etc.) + time_sig_confidence += 5 # Boost for simple meter + elif N == 3: + time_signature = 3 # Clear triple meter (3/4, 3/8, etc.) + time_sig_confidence += 5 # Boost for simple meter + elif 4 <= N <= 5: + time_signature = N # Direct mapping for common cases (4/4 or 5/4) + elif N == 6: + # Could be 6/8 (compound duple) or 3/4 with subdivisions + # Further analyze to distinguish + group_3_count = 0 + for i in range(0, len(beat_strengths) - 6, 3): + if i + 2 < len(beat_strengths): + if beat_strengths[i] > beat_strengths[i+1] and beat_strengths[i] > beat_strengths[i+2]: + group_3_count += 1 + + group_2_count = 0 + for i in range(0, len(beat_strengths) - 4, 2): + if i + 1 < len(beat_strengths): + if beat_strengths[i] > beat_strengths[i+1]: + group_2_count += 1 + + # Determine if it's grouped in 2s or 3s + time_signature = 3 if group_3_count > group_2_count else 6 + elif N == 8: + time_signature = 4 # 4/4 with embellishments + elif N == 5 or N == 7: + time_signature = N # Odd time signatures like 5/4 or 7/8 + + # STEP 6: Enhanced phrase detection with adaptive thresholds and scientific justification + phrases = [] + current_phrase = [] + + if len(beat_times) > 0: + # Calculate adaptive thresholds using percentiles instead of fixed ratios + if len(beat_strengths) > 4: + # Define thresholds based on distribution rather than fixed values + strong_threshold = np.percentile(beat_strengths, 75) # Top 25% are "strong" beats + # For gaps, calculate significant deviation using z-scores if we have intervals + if intervals: + mean_interval = np.mean(intervals) + std_interval = np.std(intervals) + # A significant gap is > 1.5 standard deviations above mean (95th percentile) + significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3 + else: + significant_gap = 0 + else: + # Fallback for limited data + strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0 + significant_gap = 0 + + # Identify phrase boundaries with improved musical heuristics + for i in range(len(beat_times)): + current_phrase.append(i) + + # Check for phrase boundary conditions + if i < len(beat_times) - 1: + # Strong beat coming up (using adaptive threshold) + is_stronger_next = False + if i < len(beat_strengths) - 1: + is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1 + + # Significant gap (using adaptive threshold) + is_longer_gap = False + if i < len(beat_times) - 1 and intervals and i < len(intervals): + is_longer_gap = intervals[i] > significant_gap + + # Measure boundary based on time signature + is_measure_boundary = (i + 1) % time_signature == 0 and i > 0 + + # Check for significant dip in onset strength (phrase boundary often has reduced energy) + is_energy_dip = 
False + if i < len(beat_strengths) - 1: + onset_ratio = beat_strengths[i+1] / max(beat_strengths[i], 0.001) + is_energy_dip = onset_ratio < 0.6 + + # Combined decision for phrase boundary with scientific weighting + phrase_boundary_score = ( + (1.5 if is_stronger_next else 0) + + (2.0 if is_longer_gap else 0) + + (1.0 if is_measure_boundary else 0) + + (0.5 if is_energy_dip else 0) + ) + + if (phrase_boundary_score >= 1.5 and len(current_phrase) >= 2) or \ + (is_measure_boundary and len(current_phrase) >= time_signature): + phrases.append(current_phrase) + current_phrase = [] + + # Add the last phrase if not empty + if current_phrase and len(current_phrase) >= 2: + phrases.append(current_phrase) + + # Ensure we have at least one phrase + if not phrases and len(beat_times) >= 2: + # Default to grouping by measures based on detected time signature + for i in range(0, len(beat_times), time_signature): + end = min(i + time_signature, len(beat_times)) + if end - i >= 2: # Ensure at least 2 beats per phrase + phrases.append(list(range(i, end))) + + # Calculate beat periodicity (average time between beats) + beat_periodicity = np.mean(intervals) if intervals else (60 / tempo) + + # Return enhanced results with scientific confidence metrics + return { + "tempo": tempo, + "tempo_confidence": best_confidence, # New scientific confidence metric + "time_signature": time_signature, + "time_sig_confidence": time_sig_confidence, # New scientific confidence metric + "beat_frames": beat_frames, + "beat_times": beat_times, + "beat_count": len(beat_times), + "beat_strengths": beat_strengths, + "intervals": intervals, + "phrases": phrases, + "beat_periodicity": beat_periodicity, + "beat_entropy": beat_entropy # New scientific measure of rhythm complexity + } + +def detect_beats_and_subbeats(y, sr, subdivision=4): + """ + Detect main beats and interpolate subbeats between consecutive beats. 
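+
+    Example (illustrative; uses librosa's downloadable example clip, and the
+    detected beats naturally vary with the recording):
+
+        >>> import librosa
+        >>> y, sr = librosa.load(librosa.example("trumpet"), duration=10.0)
+        >>> info = detect_beats_and_subbeats(y, sr, subdivision=4)
+        >>> sorted(info.keys())
+        ['beat_times', 'subbeat_times', 'tempo']
+        >>> info["subbeat_times"][0]["type"]  # the list opens on a main beat
+        'main'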
+ + Parameters: + y: Audio time series + sr: Sample rate + subdivision: Number of subdivisions between beats (default: 4 for quarter beats) + + Returns: + Dictionary containing beat times, subbeat times, and tempo information + """ + # Detect main beats using librosa + try: + tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr) + beat_times = librosa.frames_to_time(beat_frames, sr=sr) + + # Convert numpy values to native Python types + if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number): + tempo = float(tempo) + + # Convert beat_times to a list of floats + if isinstance(beat_times, np.ndarray): + beat_times = [float(t) for t in beat_times] + except Exception as e: + print(f"Error in beat detection: {e}") + # Default fallbacks + tempo = 120.0 + beat_times = [] + + # Create subbeats by interpolating between main beats + subbeat_times = [] + + # Early return if no beats detected + if not beat_times or len(beat_times) < 2: + return { + "tempo": float(tempo) if tempo is not None else 120.0, + "beat_times": beat_times, + "subbeat_times": [] + } + + for i in range(len(beat_times) - 1): + # Get current and next beat time + try: + current_beat = float(beat_times[i]) + next_beat = float(beat_times[i + 1]) + except (IndexError, ValueError, TypeError): + continue + + # Calculate time interval between beats + interval = (next_beat - current_beat) / subdivision + + # Add the main beat + subbeat_times.append({ + "time": float(current_beat), + "type": "main", + "strength": 1.0, + "beat_index": i + }) + + # Add subbeats + for j in range(1, subdivision): + subbeat_time = current_beat + j * interval + # Calculate strength based on position + # For 4/4 time, beat 3 is stronger than beats 2 and 4 + if j == subdivision // 2 and subdivision == 4: + strength = 0.8 # Stronger subbeat (e.g., beat 3 in 4/4) + else: + strength = 0.5 # Weaker subbeat + + subbeat_times.append({ + "time": float(subbeat_time), + "type": "sub", + "strength": float(strength), + "beat_index": i, + "subbeat_index": j + }) + + # Add the last main beat + if beat_times: + try: + subbeat_times.append({ + "time": float(beat_times[-1]), + "type": "main", + "strength": 1.0, + "beat_index": len(beat_times) - 1 + }) + except (ValueError, TypeError): + # Skip if conversion fails + pass + + return { + "tempo": float(tempo) if tempo is not None else 120.0, + "beat_times": beat_times, + "subbeat_times": subbeat_times + } + +def map_beats_to_seconds(subbeat_times, duration, fps=1.0): + """ + Map beats and subbeats to second-level intervals. 
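+
+    Example (illustrative; hand-built beat dicts in the shape produced by
+    detect_beats_and_subbeats):
+
+        >>> beats = [{"time": 0.25, "type": "main", "strength": 1.0},
+        ...          {"time": 0.75, "type": "sub", "strength": 0.5},
+        ...          {"time": 1.10, "type": "main", "strength": 1.0}]
+        >>> windows = map_beats_to_seconds(beats, duration=2.0, fps=1.0)
+        >>> [len(w["beats"]) for w in windows]  # beats per one-second window
+        [2, 1, 0]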
+ + Parameters: + subbeat_times: List of dictionaries containing beat and subbeat information + duration: Total duration of the audio in seconds + fps: Frames per second (default: 1.0 for one-second intervals) + + Returns: + List of dictionaries, each containing beats within a time window + """ + # Safety check for input parameters + if not isinstance(subbeat_times, list): + print("Warning: subbeat_times is not a list") + subbeat_times = [] + + try: + duration = float(duration) + except (ValueError, TypeError): + print("Warning: duration is not convertible to float, defaulting to 30") + duration = 30.0 + + # Calculate number of time windows + num_windows = int(duration * fps) + 1 + + # Initialize time windows + time_windows = [] + + for i in range(num_windows): + # Calculate window boundaries + start_time = i / fps + end_time = (i + 1) / fps + + # Find beats and subbeats within this window + window_beats = [] + + for beat in subbeat_times: + # Safety check for beat object + if not isinstance(beat, dict): + continue + + # Safely access beat time + try: + beat_time = float(beat.get("time", 0)) + except (ValueError, TypeError): + continue + + if start_time <= beat_time < end_time: + # Safely extract beat properties with defaults + beat_type = beat.get("type", "sub") + if not isinstance(beat_type, str): + beat_type = "sub" + + # Safely handle strength + try: + strength = float(beat.get("strength", 0.5)) + except (ValueError, TypeError): + strength = 0.5 + + # Add beat to this window + window_beats.append({ + "time": beat_time, + "type": beat_type, + "strength": strength, + "relative_pos": (beat_time - start_time) / (1/fps) # Position within window (0-1) + }) + + # Add window to list + time_windows.append({ + "second": i, + "start": start_time, + "end": end_time, + "beats": window_beats + }) + + return time_windows + +def create_second_level_templates(sec_map, tempo, genre=None): + """ + Create syllable templates for each second-level window. 
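+
+    Example (illustrative; a single window holding a strong downbeat and a
+    weak subbeat, rendered at 120 BPM with no genre adjustment):
+
+        >>> sec_map = [{"second": 0, "start": 0.0, "end": 1.0, "beats": [
+        ...     {"time": 0.0, "type": "main", "strength": 1.0},
+        ...     {"time": 0.5, "type": "sub", "strength": 0.5}]}]
+        >>> create_second_level_templates(sec_map, tempo=120)
+        ['S(1.0):2.5-w(0.5):1.5']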
+ + Parameters: + sec_map: List of second-level time windows with beat information + tempo: Tempo in BPM + genre: Optional genre for genre-specific adjustments + + Returns: + List of template strings, one for each second + """ + # Helper function to map tempo to base syllable count + def tempo_to_syllable_base(tempo): + """Continuous function mapping tempo to syllable base count""" + # Sigmoid-like function that smoothly transitions between syllable counts + if tempo > 180: + return 1.0 + elif tempo > 140: + return 1.0 + (180 - tempo) * 0.02 # Gradual increase 1.0 → 1.8 + elif tempo > 100: + return 1.8 + (140 - tempo) * 0.01 # Gradual increase 1.8 → 2.2 + elif tempo > 70: + return 2.2 + (100 - tempo) * 0.02 # Gradual increase 2.2 → 2.8 + else: + return 2.8 + max(0, (70 - tempo) * 0.04) # Continue increasing for very slow tempos + + # Calculate base syllable count from tempo + base_syllables = tempo_to_syllable_base(tempo) + + # Apply genre-specific adjustments + genre_factor = 1.0 + if genre: + genre_lower = genre.lower() + if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]): + genre_factor = 1.4 # Much higher syllable density for rap + elif any(term in genre_lower for term in ["folk", "country", "ballad"]): + genre_factor = 0.8 # Lower density for folk styles + + # Create templates for each second + templates = [] + + for window in sec_map: + beats = window["beats"] + + # If no beats in this second, create a default template + if not beats: + templates.append("w(0.5):1") + continue + + # Create beat patterns for this second + beat_patterns = [] + + for beat in beats: + # Ensure we're dealing with a dictionary and that it has a "strength" key + if not isinstance(beat, dict): + continue # Skip this beat if it's not a dictionary + + # Safely get beat type and strength + if "type" not in beat or not isinstance(beat["type"], str): + beat_type = "w" # Default to weak if type is missing or not a string + else: + beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w" + + # Safely get strength value with fallback + try: + strength = float(beat.get("strength", 0.5)) + except (ValueError, TypeError): + strength = 0.5 # Default if conversion fails + + # Adjust syllable count based on beat type and strength + if beat_type == "S": + syllable_factor = 1.2 # More syllables for strong beats + elif beat_type == "m": + syllable_factor = 1.0 # Normal for medium beats + else: + syllable_factor = 0.8 # Fewer for weak beats + + # Calculate final syllable count + syllable_count = base_syllables * syllable_factor * genre_factor + + # Round to half-syllable precision + syllable_count = round(syllable_count * 2) / 2 + + # Ensure reasonable limits + syllable_count = max(0.5, min(4, syllable_count)) + + # Format with embedded strength value + strength_pct = round(strength * 100) / 100 + beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}") + + # Join patterns with dashes - ensure we have at least one pattern + if not beat_patterns: + templates.append("w(0.5):1") # Default if no valid patterns were created + else: + second_template = "-".join(beat_patterns) + templates.append(second_template) + + return templates + +def detect_sections(y, sr): + """ + Detect musical segments without classifying them by type (verse, chorus, etc.). 
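+
+    Example (illustrative sketch only; "song.wav" is a placeholder path and
+    the boundary values shown are invented):
+
+        >>> y, sr = librosa.load("song.wav", sr=22050)  # doctest: +SKIP
+        >>> sections = detect_sections(y, sr)           # doctest: +SKIP
+        >>> sections[0]                                 # doctest: +SKIP
+        {'type': 'segment', 'start': 0.0, 'end': 32.4, 'duration': 32.4}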
+ + Parameters: + y: Audio time series + sr: Sample rate + + Returns: + A list of section dictionaries with start time, end time, and duration + """ + # Step 1: Extract rich feature set for comprehensive analysis + # ---------------------------------------------------------------------- + hop_length = 512 # Common hop length for feature extraction + + # Spectral features + S = np.abs(librosa.stft(y, hop_length=hop_length)) + contrast = librosa.feature.spectral_contrast(S=S, sr=sr) + + # Harmonic features with CQT-based chroma (better for harmonic analysis) + chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) + + # Timbral features + mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length) + + # Energy features + rms = librosa.feature.rms(y=y, hop_length=hop_length) + + # Harmonic-percussive source separation for better rhythm analysis + y_harmonic, y_percussive = librosa.effects.hpss(y) + + # Step 2: Adaptive determination of segment count based on song complexity + # ---------------------------------------------------------------------- + duration = librosa.get_duration(y=y, sr=sr) + + # Feature preparation for adaptive segmentation + # Stack features with proper normalization (addressing the scale issue) + feature_stack = np.vstack([ + librosa.util.normalize(contrast), + librosa.util.normalize(chroma), + librosa.util.normalize(mfcc), + librosa.util.normalize(rms) + ]) + + # Transpose to get time as first dimension + feature_matrix = feature_stack.T + + # Step 3: Feature fusion using dimensionality reduction + # ---------------------------------------------------------------------- + from sklearn.decomposition import PCA + + # Handle very short audio files + n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1]) + + if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0: + try: + pca = PCA(n_components=n_components) + reduced_features = pca.fit_transform(feature_matrix) + except Exception as e: + print(f"PCA failed, falling back to original features: {e}") + # Fallback to simpler approach if PCA fails + reduced_features = feature_matrix + else: + # Not enough data for PCA + reduced_features = feature_matrix + + # Step 4: Adaptive determination of optimal segment count + # ---------------------------------------------------------------------- + + # Initialize range of segment counts to try + min_segments = max(2, int(duration / 60)) # At least 2 segments, roughly 1 per minute + max_segments = min(10, int(duration / 20)) # At most 10 segments, roughly 1 per 20 seconds + + # Ensure reasonable bounds + min_segments = max(2, min(min_segments, 4)) + max_segments = max(min_segments + 1, min(max_segments, 8)) + + # Try different segment counts and evaluate with silhouette score + best_segments = min_segments + best_score = -1 + + from sklearn.metrics import silhouette_score + from sklearn.cluster import AgglomerativeClustering + + # Only do this analysis if we have enough data + if reduced_features.shape[0] > max_segments: + for n_segments in range(min_segments, max_segments + 1): + try: + # Perform agglomerative clustering + clustering = AgglomerativeClustering(n_clusters=n_segments) + labels = clustering.fit_predict(reduced_features) + + # Calculate silhouette score if we have enough samples + if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1: + score = silhouette_score(reduced_features, labels) + + if score > best_score: + best_score = score + best_segments = n_segments + except Exception as e: + 
print(f"Clustering with {n_segments} segments failed: {e}") + continue + + # Use the optimal segment count for final segmentation + n_segments = best_segments + + # Step 5: Final segmentation using the optimal segment count + # ---------------------------------------------------------------------- + + # Method 1: Use agglomerative clustering on the reduced features + try: + clustering = AgglomerativeClustering(n_clusters=n_segments) + labels = clustering.fit_predict(reduced_features) + + # Convert cluster labels to boundaries by finding where labels change + boundaries = [0] # Start with the beginning + + for i in range(1, len(labels)): + if labels[i] != labels[i-1]: + boundaries.append(i) + + boundaries.append(len(labels)) # Add the end + + # Convert to frames + bounds_frames = np.array(boundaries) + + except Exception as e: + print(f"Final clustering failed: {e}") + # Fallback to librosa's agglomerative clustering on original features + bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments) + + # Step 6: Convert boundaries to time and create sections + # ---------------------------------------------------------------------- + bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length) + + # Create sections from the boundaries + sections = [] + + for i in range(len(bounds_times) - 1): + start = bounds_times[i] + end = bounds_times[i+1] + duration = end - start + + # Skip extremely short sections + if duration < 4 and i > 0 and i < len(bounds_times) - 2: + continue + + # Add section to the list (without classifying as verse/chorus/etc) + sections.append({ + "type": "segment", # Generic type instead of verse/chorus/etc + "start": start, + "end": end, + "duration": duration + }) + + # Filter out any remaining extremely short sections + sections = [s for s in sections if s["duration"] >= 5] + + return sections + +def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'): + """ + Create enhanced syllable templates based on beat patterns with improved musical intelligence. 
+ + Parameters: + beats_info: Dictionary containing beat analysis data + genre: Optional genre to influence template creation + phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation + + Returns: + String of syllable templates with embedded strength values and flexible timing + """ + import numpy as np + from sklearn.cluster import KMeans + + # Convert any numpy values to native Python types for safety - directly handle conversions + # Process the dictionary to convert numpy values to Python native types + if isinstance(beats_info, dict): + processed_beats_info = {} + for k, v in beats_info.items(): + if isinstance(v, np.ndarray): + if v.size == 1: + processed_beats_info[k] = float(v.item()) + else: + processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] + elif isinstance(v, np.number): + processed_beats_info[k] = float(v) + elif isinstance(v, list): + processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] + else: + processed_beats_info[k] = v + beats_info = processed_beats_info + + # Extract basic beat information + beat_times = beats_info.get("beat_times", []) + beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times)) + tempo = beats_info.get("tempo", 120) + time_signature = beats_info.get("time_signature", 4) + + # Early return for insufficient data + if len(beat_times) < 2: + return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" # Default fallback pattern + + # Step 1: Improved adaptive thresholding using k-means clustering + # ---------------------------------------------------------------------- + if len(beat_strengths) >= 6: # Need enough data points for clustering + # Reshape for k-means + X = np.array(beat_strengths).reshape(-1, 1) + + # Use k-means with 3 clusters for Strong, Medium, Weak classification + kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X) + + # Find the centroid values and sort them + centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_]) + + # Map to thresholds (using the midpoints between centroids) + if len(centroids) >= 3: + medium_threshold = (centroids[0] + centroids[1]) / 2 + strong_threshold = (centroids[1] + centroids[2]) / 2 + else: + # Fallback if clustering doesn't work well + medium_threshold = np.percentile(beat_strengths, 33) + strong_threshold = np.percentile(beat_strengths, 66) + else: + # For limited data, use percentile-based approach + medium_threshold = np.percentile(beat_strengths, 33) + strong_threshold = np.percentile(beat_strengths, 66) + + # Step 2: Create or refine phrases based on mode + # ---------------------------------------------------------------------- + phrases = beats_info.get("phrases", []) + + if phrase_mode == 'auto' or not phrases: + # Create phrases based on time signature and beat strengths + phrases = [] + current_phrase = [] + + for i in range(len(beat_times)): + current_phrase.append(i) + + # Check for natural phrase endings + if (i + 1) % time_signature == 0 or i == len(beat_times) - 1: + if len(current_phrase) >= 2: # Ensure minimum phrase length + phrases.append(current_phrase) + current_phrase = [] + + # Add any remaining beats + if current_phrase and len(current_phrase) >= 2: + phrases.append(current_phrase) + + # Step 3: Improved continuous tempo-to-syllable mapping function + # ---------------------------------------------------------------------- + def tempo_to_syllable_base(tempo): + """Continuous function mapping tempo to syllable base count with scientific curve""" + # Sigmoid-like function with 
more scientific parameters + # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions + if tempo < 40: # Very slow tempos + return 3.5 # Maximum syllables for extremely slow tempos + elif tempo > 200: # Very fast tempos + return 0.8 # Minimum syllables for extremely fast tempos + else: + # Scientific logistic function for middle range (40-200 BPM) + L = 3.5 # Upper limit + k = 0.04 # Steepness of curve + x0 = 120 # Midpoint (inflection point at normal tempo) + return L / (1 + np.exp(k * (tempo - x0))) + + # Step 4: Generate enhanced templates with flexible timing + # ---------------------------------------------------------------------- + syllable_templates = [] + + for phrase in phrases: + # Skip empty phrases + if not phrase: + continue + + # Extract beat strengths for this phrase + phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)] + if not phrase_strengths: + phrase_strengths = [1.0] * len(phrase) + + # Apply improved adaptive thresholding for stress pattern detection + stress_pattern = [] + for i, strength in enumerate(phrase_strengths): + # Consider both strength and metrical position with improved weighting + metrical_position = i % time_signature + + # Apply improved position boosting based on musical theory + # In common time signatures, first beat gets strong emphasis, + # third beat gets moderate emphasis (in 4/4) + if metrical_position == 0: # Downbeat (first beat) + position_boost = 0.18 # Stronger boost for downbeats + elif time_signature == 4 and metrical_position == 2: # Third beat in 4/4 + position_boost = 0.1 # Moderate boost for third beat + elif time_signature == 3 and metrical_position == 1: # Second beat in 3/4 + position_boost = 0.05 # Slight boost for second beat in 3/4 + else: + position_boost = 0 # No boost for other beats + + effective_strength = strength + position_boost + + if effective_strength >= strong_threshold: + stress_pattern.append(("S", effective_strength)) # Strong beat with strength + elif effective_strength >= medium_threshold: + stress_pattern.append(("m", effective_strength)) # Medium beat with strength + else: + stress_pattern.append(("w", effective_strength)) # Weak beat with strength + + # Step 5: Calculate syllable counts using improved continuous function + # ---------------------------------------------------------------------- + detailed_template = [] + + for i, (stress_type, strength) in enumerate(stress_pattern): + # Get base syllable count from tempo with more nuanced mapping + base_syllables = tempo_to_syllable_base(tempo) + + # Adjust based on both stress type AND metrical position + metrical_position = i % time_signature + position_factor = 1.2 if metrical_position == 0 else 1.0 + + # More nuanced adjustment based on stress type + if stress_type == "S": + syllable_factor = 1.2 * position_factor # Emphasize strong beats more + elif stress_type == "m": + syllable_factor = 1.0 * position_factor # Medium beats + else: + syllable_factor = 0.8 # Weak beats + + # Apply improved genre-specific adjustments with more granular factors + genre_factor = 1.0 + if genre: + genre = genre.lower() + if "rap" in genre or "hip" in genre: + genre_factor = 1.5 # Significantly higher syllable density for rap + elif "folk" in genre or "country" in genre or "ballad" in genre: + genre_factor = 0.7 # Lower density for folk styles + elif "metal" in genre or "rock" in genre: + genre_factor = 1.1 # Slightly higher density for rock/metal + elif "jazz" in genre: + genre_factor = 1.2 # Higher density for jazz (complex 
rhythms) + elif "classical" in genre: + genre_factor = 0.9 # More moderate for classical + + # Calculate adjusted syllable count with scientific weighting + raw_count = base_syllables * syllable_factor * genre_factor + + # Use more precise rounding that preserves subtle differences + # Round to quarters rather than halves for more precision + rounded_count = round(raw_count * 4) / 4 + + # Limit to reasonable range (0.5 to 4) with improved bounds + syllable_count = max(0.5, min(4, rounded_count)) + + # Format with embedded strength value for reversibility + # Convert strength to 2-decimal precision percentage + strength_pct = round(strength * 100) / 100 + detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}") + + # Join beat templates for this phrase + phrase_template = "-".join(detailed_template) + syllable_templates.append(phrase_template) + + # Step 6: Ensure valid output with improved defaults + # ---------------------------------------------------------------------- + if not syllable_templates: + # Create sensible defaults based on time signature that reflect musical theory + if time_signature == 3: # 3/4 time - waltz pattern + syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] # 3/4 default + elif time_signature == 2: # 2/4 time - march pattern + syllable_templates = ["S(0.95):1.5-w(0.4):1"] # 2/4 default + else: # 4/4 time - common time + syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] # 4/4 default + + # Join all phrase templates with the original separator for compatibility + return "|".join(syllable_templates) + +def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10, + structured_output=False, beat_types=None): + """ + Convert technical syllable templates into clear, human-readable instructions with + enhanced flexibility and customization options. 
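+
+    Example (illustrative; one enhanced-format phrase with embedded strengths):
+
+        >>> text = format_syllable_templates_for_prompt("S(0.95):2-w(0.4):1")
+        >>> text.splitlines()[0]
+        'Line 1: STRONG(2) [0.95] → weak(1) [0.4]'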
+ + Parameters: + syllable_templates: String or list of templates + arrow: Symbol to use between beats (default: "→") + line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) + structured_output: If True, return structured data instead of text + beat_types: Custom mapping for beat types (default: None, uses standard mapping) + + Returns: + Human-readable instructions or structured data depending on parameters + """ + if not syllable_templates: + return {} if structured_output else "" + + # Define standard beat type mapping (extensible) + default_beat_types = { + "S": {"name": "STRONG", "description": "stressed syllable"}, + "m": {"name": "medium", "description": "medium-stressed syllable"}, + "w": {"name": "weak", "description": "unstressed syllable"}, + "X": {"name": "EXTRA", "description": "extra strong syllable"}, + "L": {"name": "legato", "description": "connected/tied syllable"} + } + + # Use custom mapping if provided, otherwise use default + beat_types = beat_types or default_beat_types + + # Initialize structured output if requested + structured_data = {"lines": [], "explanations": []} if structured_output else None + + # Improved format detection - more robust than just checking for "|" + is_enhanced_format = False + + # Check if it's a string with enhanced format patterns + if isinstance(syllable_templates, str): + # Look for enhanced format patterns - check for beat type indicators + if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates + for bt in beat_types.keys()): + is_enhanced_format = True + # Secondary check for the "|" delimiter between phrases + elif "|" in syllable_templates: + is_enhanced_format = True + + # Initialize the output with a brief explanatory header + output = [] + + if is_enhanced_format: + # Split into individual phrase templates + phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] + + # Process each phrase into human-readable instructions + for i, phrase in enumerate(phrases): + # Check for special annotations + has_swing = "(swing)" in phrase + if has_swing: + phrase = phrase.replace("(swing)", "") # Remove annotation for processing + + beats = phrase.split("-") + beat_instructions = [] + + # Process each beat in the phrase + for j, beat in enumerate(beats): + # Extract beat type and information + beat_info = {"original": beat, "type": None, "count": None, "strength": None} + + # Handle enhanced format with embedded strength values: S(0.95):2 + if "(" in beat and ")" in beat and ":" in beat: + parts = beat.split(":") + beat_type = parts[0].split("(")[0] # Extract beat type + strength = parts[0].split("(")[1].rstrip(")") # Extract strength value + count = parts[1] # Extract syllable count + + beat_info["type"] = beat_type + beat_info["count"] = count + beat_info["strength"] = strength + + # Handle simpler format: S2, m1, w1 + elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: + beat_type = beat[0] + count = beat[1:] + + beat_info["type"] = beat_type + beat_info["count"] = count + + # Fallback for any other format + else: + beat_instructions.append(beat) + continue + + # Format the beat instruction based on type + if beat_info["type"] in beat_types: + type_name = beat_types[beat_info["type"]]["name"] + if beat_info["strength"]: + beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]") + else: + beat_instructions.append(f"{type_name}({beat_info['count']})") + else: + # Unknown beat 
type, use as-is + beat_instructions.append(beat) + + # Handle line wrapping for readability + if line_wrap > 0 and len(beat_instructions) > line_wrap: + wrapped_instructions = [] + for k in range(0, len(beat_instructions), line_wrap): + section = beat_instructions[k:k+line_wrap] + wrapped_instructions.append(f"{arrow} ".join(section)) + line_desc = f"\n {arrow} ".join(wrapped_instructions) + else: + line_desc = f" {arrow} ".join(beat_instructions) + + # Add swing notation if present + if has_swing: + line_desc += " [with swing feel]" + + # Add to output + line_output = f"Line {i+1}: {line_desc}" + output.append(line_output) + + if structured_output: + structured_data["lines"].append({ + "line_number": i+1, + "beats": [{"original": beats[j], + "type": beat_info.get("type"), + "count": beat_info.get("count"), + "strength": beat_info.get("strength")} + for j, beat_info in enumerate([b for b in beats if isinstance(b, dict)])], + "has_swing": has_swing + }) + + # Add explanation of notation after the lines + explanation = [ + "\n📝 UNDERSTANDING THE NOTATION:" + ] + + # Add descriptions for each beat type that was actually used + used_beat_types = set() + for phrase in phrases: + for beat in phrase.split("-"): + for bt in beat_types.keys(): + if beat.startswith(bt): + used_beat_types.add(bt) + + for bt in used_beat_types: + if bt in beat_types: + name = beat_types[bt]["name"] + desc = beat_types[bt]["description"] + explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") + + explanation.extend([ + f"- {arrow}: Indicates flow from one beat to the next", + "- [0.xx]: Beat strength value (higher = more emphasis needed)" + ]) + + output.extend(explanation) + + if structured_output: + structured_data["explanations"] = explanation + + # Add examples for half-syllable values if they appear in the templates + has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) + if has_half_syllables: + half_syllable_examples = [ + "\n🎵 HALF-SYLLABLE EXAMPLES:", + "- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", + " Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", + "- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", + " Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" + ] + output.extend(half_syllable_examples) + + if structured_output: + structured_data["half_syllable_examples"] = half_syllable_examples + + # Add swing explanation if needed + if any("swing" in phrase for phrase in phrases): + swing_guide = [ + "\n🎶 SWING RHYTHM GUIDE:", + "- In swing, syllables should be unevenly timed (long-short pattern)", + "- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" + ] + output.extend(swing_guide) + + if structured_output: + structured_data["swing_guide"] = swing_guide + + # Handle the original format or segment dictionaries + else: + formatted_lines = [] + + if isinstance(syllable_templates, list): + for i, template in enumerate(syllable_templates): + if isinstance(template, dict) and "syllable_template" in template: + line = f"Line {i+1}: {template['syllable_template']} syllables" + formatted_lines.append(line) + + if structured_output: + structured_data["lines"].append({ + "line_number": i+1, + "syllable_count": template["syllable_template"] + }) + elif isinstance(template, str): + line = f"Line {i+1}: {template} syllables" + formatted_lines.append(line) + + if structured_output: + 
structured_data["lines"].append({ + "line_number": i+1, + "syllable_count": template + }) + + output = formatted_lines + else: + output = [str(syllable_templates)] + + if structured_output: + structured_data["raw_content"] = str(syllable_templates) + + # Add general application advice + application_tips = [ + "\n💡 APPLICATION TIPS:", + "1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", + "2. Place important words on strong beats for natural emphasis", + "3. Vowel sounds work best for sustained or emphasized syllables", + "4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" + ] + output.extend(application_tips) + + if structured_output: + structured_data["application_tips"] = application_tips + return structured_data + + return "\n".join(output) + +def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None): + """ + Enhanced verification of syllable counts and stress patterns with precise alignment analysis + for both phrase-level and second-level templates. + """ + import re + import pronouncing + import numpy as np + import functools + from itertools import chain + + print(f"DEBUG: In verify_flexible_syllable_counts, type of lyrics={type(lyrics)}") + print(f"DEBUG: Type of templates={type(templates)}") + + # Ensure lyrics is a string + if not isinstance(lyrics, str): + print(f"DEBUG: lyrics is not a string, it's {type(lyrics)}") + # Convert to string if possible + try: + lyrics = str(lyrics) + except Exception as e: + print(f"DEBUG: Cannot convert lyrics to string: {str(e)}") + return "Error: Cannot process non-string lyrics" + + # Ensure templates is a list + if not isinstance(templates, list): + print(f"DEBUG: templates is not a list, it's {type(templates)}") + # If it's not a list, create a single-item list + if templates is not None: + templates = [templates] + else: + templates = [] + + # Split lyrics into lines + lines = [line.strip() for line in lyrics.split("\n") if line.strip()] + + # Initialize tracking variables + verification_notes = [] + detailed_analysis = [] + stress_misalignments = [] + total_mismatch_count = 0 + + # Process each lyric line against its template + for i, line in enumerate(lines): + if i >= len(templates): + break + + template = templates[i] + print(f"DEBUG: Processing template {i+1}, type={type(template)}") + + # Extract the template string from different possible formats + template_str = None + if isinstance(template, dict) and "syllable_template" in template: + template_str = template["syllable_template"] + elif isinstance(template, str): + template_str = template + else: + print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template") + continue + + if not isinstance(template_str, str): + print(f"DEBUG: template_str is not a string, it's {type(template_str)}") + continue + + # Handle multiple phrases in template - process ALL phrases, not just the first + template_phrases = [template_str] + if "|" in template_str: + template_phrases = template_str.split("|") + + # Check against all phrases and find the best match + best_match_diff = float('inf') + best_match_phrase = None + best_phrase_beats = None + actual_count = count_syllables(line) + + for phrase_idx, phrase in enumerate(template_phrases): + # Extract beat patterns and expected syllable counts from template + beats_info = [] + total_expected = 0 + + # Enhanced template parsing + if "-" in phrase: + beat_templates = phrase.split("-") + + # Parse each beat template + for beat in beat_templates: + 
beat_info = {"original": beat, "type": None, "count": 1, "strength": None} + + # Handle templates with embedded strength values: S(0.95):2 + if "(" in beat and ")" in beat and ":" in beat: + parts = beat.split(":") + beat_type = parts[0].split("(")[0] + try: + strength = float(parts[0].split("(")[1].rstrip(")")) + except ValueError: + strength = 1.0 + + # Handle potential float syllable counts + try: + count = float(parts[1]) + # Convert to int if it's a whole number + if count == int(count): + count = int(count) + except ValueError: + count = 1 + + beat_info.update({ + "type": beat_type, + "count": count, + "strength": strength + }) + + # Handle simple format: S2, m1, w1 + elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): + beat_type = beat[0] + + # Extract count, supporting float values + try: + count_str = beat[1:] + count = float(count_str) + if count == int(count): + count = int(count) + except ValueError: + count = 1 + + beat_info.update({ + "type": beat_type, + "count": count + }) + + # Legacy format - just numbers + else: + try: + count = float(beat) + if count == int(count): + count = int(count) + beat_info["count"] = count + except ValueError: + pass + + beats_info.append(beat_info) + total_expected += beat_info["count"] + + # Compare this phrase to actual syllable count + phrase_diff = abs(actual_count - total_expected) + + # Adaptive threshold based on expected syllables + expected_ratio = 0.15 if total_expected > 10 else 0.25 + phrase_threshold = max(1, round(total_expected * expected_ratio)) + + # If this is the best match so far, store it + if phrase_diff < best_match_diff: + best_match_diff = phrase_diff + best_match_phrase = phrase + best_phrase_beats = beats_info + + # For very simple templates without "-" + else: + try: + total_expected = float(phrase) + phrase_diff = abs(actual_count - total_expected) + if phrase_diff < best_match_diff: + best_match_diff = phrase_diff + best_match_phrase = phrase + best_phrase_beats = [{"count": total_expected}] + except ValueError: + pass + + # If we found a reasonable match, proceed with analysis + if best_match_phrase and best_phrase_beats: + total_expected = sum(beat["count"] for beat in best_phrase_beats) + + # Calculate adaptive threshold based on expected syllables + expected_ratio = 0.15 if total_expected > 10 else 0.25 + threshold = max(1, round(total_expected * expected_ratio)) + + # Check if total syllable count is significantly off + if total_expected > 0 and best_match_diff > threshold: + verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") + total_mismatch_count += 1 + + # Extract words and perform detailed alignment analysis + words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) + + # Get syllable count and stress for each word + word_analysis = [] + cumulative_syllables = 0 + + for word in words: + syllable_count = count_syllables_for_word(word) + + # Get stress pattern + stress_pattern = get_word_stress(word) + + word_analysis.append({ + "word": word, + "syllables": syllable_count, + "stress_pattern": stress_pattern, + "position": cumulative_syllables + }) + + cumulative_syllables += syllable_count + + # Analyze alignment with beats - only if there are beat types + if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): + # Identify positions where strong syllables should fall + strong_positions = [] + current_pos = 0 + + for beat in best_phrase_beats: + if beat.get("type") == "S": + strong_positions.append(current_pos) + 
current_pos += beat.get("count", 1) + + # Check if strong syllables align with strong beats + alignment_issues = [] + + for pos in strong_positions: + # Find which word contains this position + misaligned_word = None + + for word_info in word_analysis: + word_start = word_info["position"] + word_end = word_start + word_info["syllables"] + + if word_start <= pos < word_end: + # Check if a stressed syllable falls on this position + syllable_in_word = pos - word_start + + # Get stress pattern for this word + stress = word_info["stress_pattern"] + + # If we have stress information and this syllable isn't stressed + if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': + misaligned_word = word_info["word"] + alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") + stress_misalignments.append({ + "line": i+1, + "word": word_info["word"], + "position": pos, + "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) + }) + break + + if alignment_issues: + verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") + + # Generate a visual alignment map for better understanding + alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) + if alignment_map: + detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") + else: + # If no matching template was found + verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") + + # Add second-level verification if templates are provided + if second_level_templates: + verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n") + + # Check each second against corresponding line + for i, template in enumerate(second_level_templates): + if i >= len(lines): + break + + line = lines[i] + + # Skip section headers + if line.startswith('[') and ']' in line: + continue + + actual_count = count_syllables(line) + + # Parse template to get expected syllable count + total_expected = 0 + beat_patterns = [] + + # Handle templates with beat patterns like "S(0.95):2-w(0.4):1" + if isinstance(template, str) and "-" in template: + for beat in template.split("-"): + if ":" in beat: + try: + count_part = beat.split(":")[1] + count = float(count_part) + total_expected += count + + # Extract beat type for alignment check + beat_type = beat.split("(")[0] if "(" in beat else beat[0] + beat_patterns.append((beat_type, count)) + except (IndexError, ValueError): + pass + + # Compare actual vs expected count + if total_expected > 0: + # Calculate adaptive threshold based on expected syllables + expected_ratio = 0.2 # More strict at second level + threshold = max(0.5, round(total_expected * expected_ratio)) + + difference = abs(actual_count - total_expected) + + if difference > threshold: + verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}") + total_mismatch_count += 1 + + # Check for stress misalignment in this second + words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) + word_analysis = [] + cumulative_syllables = 0 + + for word in words: + syllable_count = count_syllables_for_word(word) + stress_pattern = get_word_stress(word) + + word_analysis.append({ + "word": word, + "syllables": syllable_count, + "stress_pattern": stress_pattern, + "position": cumulative_syllables + }) + + cumulative_syllables += syllable_count + + # Check if stressed syllables align with strong beats + if beat_patterns: + strong_positions = [] + current_pos = 0 + + for 
beat_type, count in beat_patterns: + if beat_type == "S": + strong_positions.append(current_pos) + current_pos += count + + # Look for misalignments + for pos in strong_positions: + for word_info in word_analysis: + word_start = word_info["position"] + word_end = word_start + word_info["syllables"] + + if word_start <= pos < word_end: + # Check if a stressed syllable falls on this position + syllable_in_word = int(pos - word_start) + stress = word_info["stress_pattern"] + + if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': + verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat") + break + + # Only add detailed analysis if we have rhythm mismatches + if verification_notes: + lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" + lyrics += "\n".join(verification_notes) + + if detailed_analysis: + lyrics += "\n\n[Detailed Alignment Analysis:]\n" + lyrics += "\n\n".join(detailed_analysis) + + lyrics += "\n\n[How to fix rhythm mismatches:]\n" + lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" + lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" + lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" + + # Add specific word substitution suggestions if we found stress misalignments + if stress_misalignments: + lyrics += "\n[Specific word replacement suggestions:]\n" + for issue in stress_misalignments[:5]: # Limit to first 5 issues + if issue["suggestion"]: + lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" + + return lyrics + +def generate_alignment_visualization(line, beats_info, word_analysis): + """Generate a visual representation of syllable alignment with beats.""" + if not beats_info or not word_analysis: + return None + + # Create a syllable breakdown with stress information + syllable_breakdown = [] + syllable_stresses = [] + + for word_info in word_analysis: + word = word_info["word"] + syllables = word_info["syllables"] + stress = word_info["stress_pattern"] or "" + + # Extend stress pattern if needed + while len(stress) < syllables: + stress += "0" + + # Get syllable breakdown + parts = naive_syllable_split(word, syllables) + + for i, part in enumerate(parts): + syllable_breakdown.append(part) + if i < len(stress): + syllable_stresses.append(stress[i]) + else: + syllable_stresses.append("0") + + # Create beat pattern + beat_types = [] + current_pos = 0 + + for beat in beats_info: + beat_type = beat.get("type", "-") + count = beat.get("count", 1) + + # Handle whole numbers and half syllables + if isinstance(count, int): + beat_types.extend([beat_type] * count) + else: + # For half syllables, round up and use markers + whole_part = int(count) + frac_part = count - whole_part + + if whole_part > 0: + beat_types.extend([beat_type] * whole_part) + + if frac_part > 0: + beat_types.append(f"{beat_type}½") + + # Ensure we have enough beat types + while len(beat_types) < len(syllable_breakdown): + beat_types.append("-") + + # Trim beat types if too many + beat_types = beat_types[:len(syllable_breakdown)] + + # Generate the visualization with highlighted misalignments + result = [] + + # First line: syllable breakdown with stress indicators + syllable_display = [] + for i, syllable in enumerate(syllable_breakdown): + if i < len(syllable_stresses) and syllable_stresses[i] == "1": + 
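# Sketch of the three-row alignment map built here, for an assumed input
+ # "HEAR the MU-sic" against beats S-w-m-w:
+ # HEAR - the - MU - sic
+ # ↑ · • ·
+ # S - w - m - w +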
syllable_display.append(syllable.upper()) # Uppercase for stressed syllables
+ else:
+ syllable_display.append(syllable.lower()) # Lowercase for unstressed
+
+ result.append(" - ".join(syllable_display))
+
+ # Second line: beat indicators with highlighting for misalignments
+ beat_indicators = []
+ for i, (syllable, beat_type) in enumerate(zip(syllable_stresses, beat_types)):
+ if beat_type == "S" or beat_type.startswith("S"):
+ if syllable == "1":
+ beat_indicators.append("↑") # Aligned strong beat
+ else:
+ beat_indicators.append("❌") # Misaligned strong beat
+ elif beat_type == "m" or beat_type.startswith("m"):
+ beat_indicators.append("•") # Medium beat
+ elif beat_type == "w" or beat_type.startswith("w"):
+ beat_indicators.append("·") # Weak beat
+ else:
+ beat_indicators.append(" ")
+
+ result.append(" ".join(beat_indicators))
+
+ # Third line: beat types
+ result.append(" - ".join(beat_types))
+
+ return "\n".join(result)
+
+@functools.lru_cache(maxsize=256)
+def naive_syllable_split(word, syllable_count):
+ """Naively split a word into the specified number of syllables, with caching for performance."""
+ if syllable_count <= 1:
+ return [word]
+
+ # Common syllable break patterns
+ vowels = "aeiouy"
+ consonants = "bcdfghjklmnpqrstvwxz"
+
+ # Find potential split points
+ splits = []
+ for i in range(1, len(word) - 1):
+ if word[i] in consonants and word[i-1] in vowels:
+ splits.append(i)
+ elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants:
+ splits.append(i+1)
+
+ # Ensure we have enough split points, bailing out when no unused positions
+ # remain (e.g. one-letter words whose pronunciation has several syllables),
+ # so this loop cannot spin forever
+ while len(splits) < syllable_count - 1:
+ for i in range(1, len(word)):
+ if i not in splits:
+ splits.append(i)
+ break
+ else:
+ break
+
+ # Sort and limit
+ splits.sort()
+ splits = splits[:syllable_count - 1]
+
+ # Split the word
+ result = []
+ prev = 0
+ for pos in splits:
+ result.append(word[prev:pos])
+ prev = pos
+
+ result.append(word[prev:])
+ return result
+
+def get_stress_aligned_alternatives(word, position_to_stress):
+ """Suggest alternative words with proper stress at the required position."""
+ # This would ideally use a more sophisticated dictionary lookup,
+ # but here's a simple implementation with common word patterns
+ syllable_count = count_syllables_for_word(word)
+
+ # Common synonyms/replacements by syllable count with stress position
+ if syllable_count == 2:
+ if position_to_stress == 0: # Need stress on first syllable
+ first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing",
+ "heart-beat", "sun-light", "moon-light", "star-light"]
+ return ", ".join(first_stress[:3])
+ else: # Need stress on second syllable
+ second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE",
+ "a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"]
+ return ", ".join(second_stress[:3])
+ elif syllable_count == 3:
+ if position_to_stress == 0: # First syllable stress
+ return "MEM-o-ry, WON-der-ful, BEAU-ti-ful"
+ elif position_to_stress == 1: # Second syllable stress
+ return "a-MAZE-ing, to-GE-ther, for-EV-er"
+ else: # Third syllable stress
+ return "un-der-STAND, o-ver-COME, ne-ver-MORE"
+
+ # For other cases, just provide general guidance
+ return f"a word with stress on syllable {position_to_stress + 1}"
+
+def generate_lyrics(genre, duration, emotion_results, song_structure=None):
+ """
+ Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
+ + This improved version uses advanced template creation, better formatting, and verification with + potential refinement for lyrics that perfectly match the musical rhythm patterns. + + Parameters: + genre: Musical genre of the audio + duration: Duration of the audio in seconds + emotion_results: Dictionary containing emotional analysis results + song_structure: Optional dictionary containing song structure analysis + + Returns: + Generated lyrics aligned with the rhythm patterns of the music + """ + # Safety check for strings + def is_safe_dict_access(obj, key): + """Safe dictionary key access with type checking""" + if not isinstance(obj, dict): + print(f"WARNING: Attempted to access key '{key}' on non-dictionary object of type {type(obj)}") + return False + return key in obj + + # Ensure emotion_results is a dictionary with the expected structure + if not isinstance(emotion_results, dict): + emotion_results = { + "emotion_analysis": {"primary_emotion": "Unknown"}, + "theme_analysis": {"primary_theme": "Unknown"}, + "rhythm_analysis": {"tempo": 0}, + "tonal_analysis": {"key": "Unknown", "mode": ""}, + "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} + } + + # Ensure song_structure is properly structured + if song_structure is not None and not isinstance(song_structure, dict): + print(f"WARNING: song_structure is not a dict, it's {type(song_structure)}") + song_structure = None + + print(f"DEBUG: Starting generate_lyrics with genre={genre}, duration={duration}") + print(f"DEBUG: Type of song_structure={type(song_structure)}") + print(f"DEBUG: Type of emotion_results={type(emotion_results)}") + + # Helper function to safely access dictionary with string keys + def safe_dict_get(d, key, default=None): + """Safely get a value from a dictionary, handling non-dictionary objects.""" + if not isinstance(d, dict): + print(f"WARNING: Attempted to access key '{key}' in non-dictionary object of type {type(d)}") + return default + return d.get(key, default) + + # Extract emotion and theme data with safe defaults + primary_emotion = safe_dict_get(safe_dict_get(emotion_results, "emotion_analysis", {}), "primary_emotion", "Unknown") + primary_theme = safe_dict_get(safe_dict_get(emotion_results, "theme_analysis", {}), "primary_theme", "Unknown") + + # Extract numeric values safely with fallbacks + try: + tempo = float(safe_dict_get(safe_dict_get(emotion_results, "rhythm_analysis", {}), "tempo", 0.0)) + except (ValueError, TypeError): + tempo = 0.0 + + key = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "key", "Unknown") + mode = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "mode", "") + + # Format syllable templates for the prompt + syllable_guidance = "" + templates_for_verification = [] + + # Create a structure visualization to help with lyrics-music matching + structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" + structure_visualization += f"Song Duration: {duration:.1f} seconds\n" + structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" + + # Add second-level template guidance if available + if song_structure and is_safe_dict_access(song_structure, "second_level") and is_safe_dict_access(song_structure.get("second_level", {}), "templates"): + print(f"DEBUG: Using second-level templates") + second_level_templates = song_structure.get("second_level", {}).get("templates", []) + + # Create second-level guidance + second_level_guidance = "\nSECOND-BY-SECOND RHYTHM 
INSTRUCTIONS:\n" + second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n" + + # Format each second's template + formatted_second_templates = [] + for i, template in enumerate(second_level_templates): + if i < min(60, len(second_level_templates)): # Limit to 60 seconds to avoid overwhelming the LLM + formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0) + formatted_second_templates.append(f"Second {i+1}: {formatted_template}") + + second_level_guidance += "\n".join(formatted_second_templates) + + # Add critical instructions for second-level alignment + second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern." + second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics." + second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on." + + # Add to syllable guidance + syllable_guidance = second_level_guidance + + # Store templates for verification + templates_for_verification = second_level_templates + + elif song_structure: + print(f"DEBUG: Checking flexible structure") + # Try to use flexible structure if available + if is_safe_dict_access(song_structure, "flexible_structure"): + print(f"DEBUG: Using flexible structure") + flexible = song_structure.get("flexible_structure", {}) + if is_safe_dict_access(flexible, "segments") and len(flexible.get("segments", [])) > 0: + print(f"DEBUG: Found segments in flexible structure") + # Get the segments + segments = flexible.get("segments", []) + + # Add structure visualization + structure_visualization += f"Total segments: {len(segments)}\n" + structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" + + # Process each segment to create enhanced rhythmic templates + enhanced_templates = [] + + for i, segment in enumerate(segments): + if i < 30: # Extend limit to 30 lines to handle longer songs + # Get the beat information for this segment + segment_start = segment["start"] + segment_end = segment["end"] + + # Add segment info to visualization + structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" + + # Find beats within this segment + segment_beats = [] + + # Add type checking for beat_times access + print(f"DEBUG: Checking beat_times in flexible structure") + if is_safe_dict_access(flexible, "beats") and is_safe_dict_access(flexible.get("beats", {}), "beat_times"): + beat_times = flexible.get("beats", {}).get("beat_times", []) + if isinstance(beat_times, list): + beat_strengths = flexible.get("beats", {}).get("beat_strengths", []) + + for j, beat_time in enumerate(beat_times): + if segment_start <= beat_time < segment_end: + # Add this beat to the segment + segment_beats.append(j) + + # Create segment-specific beat info + segment_beats_info = { + "beat_times": [beat_times[j] for j in segment_beats if j < len(beat_times)], + "tempo": flexible.get("beats", {}).get("tempo", 120) + } + + if beat_strengths and isinstance(beat_strengths, list): + segment_beats_info["beat_strengths"] = [ + beat_strengths[j] for j in segment_beats + if j < len(beat_strengths) + ] + + # Create a phrase structure for this segment + segment_beats_info["phrases"] = [segment_beats] + + # Generate enhanced template with genre awareness and auto phrasing + print(f"DEBUG: 
Creating flexible syllable template for segment {i+1}")
+ enhanced_template = create_flexible_syllable_templates(
+ segment_beats_info,
+ genre=genre,
+ phrase_mode='auto' if i == 0 else 'default'
+ )
+ enhanced_templates.append(enhanced_template)
+ templates_for_verification.append(enhanced_template)
+
+ # Add template to visualization
+ structure_visualization += f" Template: {enhanced_template}\n"
+ else:
+ print(f"DEBUG: beat_times is not a list, it's {type(beat_times)}")
+ else:
+ print(f"DEBUG: beats or beat_times not found in flexible structure")
+ # Skip segment if we don't have beat information
+ continue
+
+ # Use these templates to determine rhythm patterns, without classifying as verse/chorus
+ pattern_groups = {}
+
+ for i, template in enumerate(enhanced_templates):
+ # Create simplified version for pattern matching
+ simple_pattern = template.replace("(", "").replace(")", "").replace(":", "")
+
+ # Check if this pattern is similar to any we've seen
+ found_match = False
+ for group, patterns in pattern_groups.items():
+ if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns):
+ pattern_groups[group].append(template)
+ found_match = True
+ break
+
+ if not found_match:
+ # New pattern type
+ group_name = f"Group_{len(pattern_groups) + 1}"
+ pattern_groups[group_name] = [template]
+
+ # Format templates with improved formatting for the prompt
+ syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n"
+ syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n"
+ syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n"
+
+ # Add formatted templates without section labels
+ formatted_templates = []
+ for i, template in enumerate(enhanced_templates):
+ formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8))
+
+ syllable_guidance += "\n".join(formatted_templates)
+
+ # Store info for later use in traditional sections approach
+ use_sections = True
+
+ # Derive default line counts from the segment templates; these counters
+ # are not set earlier on this code path, so initialize them before
+ # applying the limits below
+ total_lines = max(1, len(enhanced_templates))
+ verse_lines = 0
+ chorus_lines = 0
+ bridge_lines = 0
+
+ # Use the detected section structure for traditional approach
+ if verse_lines > 0:
+ verse_lines = min(verse_lines, total_lines // 2) # Ensure reasonable limits
+ else:
+ verse_lines = total_lines // 2
+
+ if chorus_lines > 0:
+ chorus_lines = min(chorus_lines, total_lines // 3)
+ else:
+ chorus_lines = total_lines // 3
+
+ if bridge_lines > 0:
+ bridge_lines = min(bridge_lines, total_lines // 6)
+ else:
+ bridge_lines = 0
+
+ # Fallback to traditional sections if needed
+ elif song_structure and is_safe_dict_access(song_structure, "syllables") and song_structure.get("syllables"):
+ syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n"
+ syllable_guidance += "Follow these syllable patterns for each section.
Each line should match ONE phrase:\n\n"
+
+ # Count sections for visualization
+ section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0}
+
+ for section in song_structure.get("syllables", []):
+ if not isinstance(section, dict):
+ continue
+
+ section_type = section.get("type", "verse")
+ section_counts[section_type] = section_counts.get(section_type, 0) + 1
+
+ if is_safe_dict_access(section, "syllable_template"):
+ # Process to create enhanced template
+ if is_safe_dict_access(song_structure, "beats") and is_safe_dict_access(song_structure.get("beats", {}), "beat_times"):
+ section_beats_info = {
+ "beat_times": [beat for beat in song_structure.get("beats", {}).get("beat_times", [])
+ if section.get("start", 0) <= beat < section.get("end", 0)],
+ "tempo": song_structure.get("beats", {}).get("tempo", 120)
+ }
+
+ if is_safe_dict_access(song_structure.get("beats", {}), "beat_strengths"):
+ section_beats_info["beat_strengths"] = [
+ strength for i, strength in enumerate(song_structure.get("beats", {}).get("beat_strengths", []))
+ if i < len(song_structure.get("beats", {}).get("beat_times", [])) and
+ section.get("start", 0) <= song_structure.get("beats", {}).get("beat_times", [])[i] < section.get("end", 0)
+ ]
+
+ # Create a phrase structure for this section
+ section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))]
+
+ # Generate enhanced template with genre awareness
+ enhanced_template = create_flexible_syllable_templates(
+ section_beats_info,
+ genre=genre,
+ phrase_mode='auto' if section_type == 'verse' else 'default'
+ )
+
+ syllable_guidance += f"[{section_type.capitalize()}]:\n"
+ syllable_guidance += format_syllable_templates_for_prompt(
+ enhanced_template,
+ arrow="→",
+ line_wrap=6
+ ) + "\n\n"
+ templates_for_verification.append(section)
+ elif "syllable_count" in section:
+ syllable_guidance += f"[{section_type.capitalize()}]: ~{section['syllable_count']} syllables total\n"
+
+ # Create structure visualization
+ structure_visualization += "Using traditional section-based structure:\n"
+ for section_type, count in section_counts.items():
+ if count > 0:
+ structure_visualization += f"{section_type.capitalize()}: {count} sections\n"
+
+ # Set traditional section counts
+ verse_lines = max(2, section_counts.get("verse", 0) * 4)
+ chorus_lines = max(2, section_counts.get("chorus", 0) * 4)
+ bridge_lines = max(0, section_counts.get("bridge", 0) * 2)
+
+ # Use sections approach
+ use_sections = True
+
+ # If we couldn't get specific templates, use general guidance
+ if not syllable_guidance:
+ syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n"
+ syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n"
+ syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n"
+ syllable_guidance += "3.
Use appropriate syllable counts based on tempo:\n"
+ syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n"
+ syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n"
+ syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n"
+
+ # Create basic structure visualization
+ structure_visualization += "Using estimated structure (no detailed analysis available):\n"
+
+ # Calculate rough section counts based on duration
+ estimated_lines = max(8, int(duration / 10))
+ structure_visualization += f"Estimated total lines: {estimated_lines}\n"
+
+ # Set traditional section counts based on duration
+ verse_lines = estimated_lines // 2
+ chorus_lines = estimated_lines // 3
+ bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0
+
+ # Use sections approach
+ use_sections = True
+
+ # Add examples of syllable-beat alignment with enhanced format
+ # (each example's syllable count matches its pattern exactly)
+ syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n"
+ syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n"
+ syllable_guidance += "Lyric: 'HEAR the MU-sic'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S w m w <- BEAT TYPE\n\n"
+
+ syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n"
+ syllable_guidance += "Lyric: 'DANC-ing all NIGHT with you'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S S w S w w <- BEAT TYPE\n\n"
+
+ syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n"
+ syllable_guidance += "Lyric: 'TIME slow-ly HEALS me'\n"
+ syllable_guidance += " ↑ ↑ ↑ ↑ ↑\n"
+ syllable_guidance += " S m m S w <- BEAT TYPE\n\n"
+
+ # Add genre-specific guidance based on the detected genre
+ genre_guidance = ""
+ if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n"
+ genre_guidance += "- Use more syllables per beat for rapid-fire sections\n"
+ genre_guidance += "- Create internal rhymes within lines, not just at line endings\n"
+ genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n"
+ elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n"
+ genre_guidance += "- Use repetitive phrases that build and release tension\n"
+ genre_guidance += "- Match syllables precisely to the beat grid\n"
+ genre_guidance += "- Use short, percussive words on strong beats\n"
+ elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n"
+ genre_guidance += "- Use powerful, emotive words on downbeats\n"
+ genre_guidance += "- Create contrast between verse and chorus energy levels\n"
+ genre_guidance += "- Emphasize hooks with simple, memorable phrases\n"
+ elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]):
+ genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n"
+ genre_guidance += "- Focus on storytelling with clear narrative flow\n"
+ genre_guidance += "- Use natural speech patterns that flow conversationally\n"
+ genre_guidance += "- Place important words at the start of phrases\n"
+
+ # Add genre guidance to the main guidance
+ syllable_guidance += genre_guidance
+
+ # Store the syllable guidance for later use
+ syllable_guidance_text = syllable_guidance
+
+ # Determine if we should use traditional sections or
second-level alignment + use_sections = True + use_second_level = False + + if song_structure and "second_level" in song_structure and song_structure["second_level"]: + use_second_level = True + # If we have second-level templates, prioritize those over traditional sections + if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: + templates = song_structure["second_level"]["templates"] + if isinstance(templates, list) and len(templates) > 0: + use_sections = False + elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: + # If we have more than 4 segments, it's likely not a traditional song structure + if "segments" in song_structure["flexible_structure"]: + segments = song_structure["flexible_structure"]["segments"] + if len(segments) > 4: + use_sections = False + + # Create enhanced prompt with better rhythm alignment instructions + if use_second_level: + # Second-level approach with per-second alignment + content = f""" +You are a talented songwriter who specializes in {genre} music. +Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. + +IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. + +Music analysis has detected the following qualities: +- Tempo: {tempo:.1f} BPM +- Key: {key} {mode} +- Primary emotion: {primary_emotion} +- Primary theme: {primary_theme} + +{syllable_guidance} + +CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: +1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) +2. Natural word stress patterns must match the beat strength (strong words on strong beats) +3. Line breaks should occur at phrase endings for natural breathing +4. Consonant clusters should be avoided on fast notes and strong beats +5. Open vowels (a, e, o) work better for sustained notes and syllables +6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) +7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels + +The lyrics should: +- Perfectly capture the essence and style of {genre} music +- Express the {primary_emotion} emotion and {primary_theme} theme +- Be completely original +- Maintain a consistent theme throughout +- Match the audio segment duration of {duration:.1f} seconds + +Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. + +IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. + +IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" +where you analyze how well the lyrics align with the musical rhythm. This section MUST appear +even if there are no rhythm issues. Include the following in your analysis: +1. Syllable counts for each line and how they match the rhythm pattern +2. Where stressed syllables align with strong beats +3. Any potential misalignments or improvements + +Your lyrics: +""" + elif use_sections: + # Traditional approach with sections + content = f""" +You are a talented songwriter who specializes in {genre} music. +Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. + +IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. 
+ +Music analysis has detected the following qualities in the music: +- Tempo: {tempo:.1f} BPM +- Key: {key} {mode} +- Primary emotion: {primary_emotion} +- Primary theme: {primary_theme} + +{syllable_guidance} + +CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: +1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) +2. Natural word stress patterns must match the beat strength (strong words on strong beats) +3. Line breaks should occur at phrase endings for natural breathing +4. Consonant clusters should be avoided on fast notes and strong beats +5. Open vowels (a, e, o) work better for sustained notes and syllables +6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) +7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels + +The lyrics should: +- Perfectly capture the essence and style of {genre} music +- Express the {primary_emotion} emotion and {primary_theme} theme +- Follow the structure patterns provided above +- Be completely original +- Match the song duration of {duration:.1f} seconds + +IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. + +IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" +where you analyze how well the lyrics align with the musical rhythm. This section MUST appear +even if there are no rhythm issues. Include the following in your analysis: +1. Syllable counts for each line and how they match the rhythm pattern +2. Where stressed syllables align with strong beats +3. Any potential misalignments or improvements + +Your lyrics: +""" + else: + # Flexible approach without traditional sections + content = f""" +You are a talented songwriter who specializes in {genre} music. +Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. + +IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. + +Music analysis has detected the following qualities: +- Tempo: {tempo:.1f} BPM +- Key: {key} {mode} +- Primary emotion: {primary_emotion} +- Primary theme: {primary_theme} + +{syllable_guidance} + +CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: +1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) +2. Natural word stress patterns must match the beat strength (strong words on strong beats) +3. Line breaks should occur at phrase endings for natural breathing +4. Consonant clusters should be avoided on fast notes and strong beats +5. Open vowels (a, e, o) work better for sustained notes and syllables +6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) +7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels + +The lyrics should: +- Perfectly capture the essence and style of {genre} music +- Express the {primary_emotion} emotion and {primary_theme} theme +- Be completely original +- Maintain a consistent theme throughout +- Match the audio segment duration of {duration:.1f} seconds + +Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. +Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. + +IMPORTANT: Start immediately with the lyrics. 
DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
+
+IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
+where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
+even if there are no rhythm issues. Include the following in your analysis:
+1. Syllable counts for each line and how they match the rhythm pattern
+2. Where stressed syllables align with strong beats
+3. Any potential misalignments or improvements
+
+Your lyrics:
+"""
+
+ # Format as a chat message for the LLM
+ messages = [
+ {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
+ {"role": "user", "content": content}
+ ]
+
+ # Apply standard chat template without thinking enabled
+ text = llm_tokenizer.apply_chat_template(
+ messages,
+ tokenize=False,
+ add_generation_prompt=True
+ )
+
+ # Generate lyrics using the LLM
+ model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
+
+ # Configure generation parameters based on model capability
+ generation_params = {
+ "do_sample": True,
+ "temperature": 0.5, # Lower for more consistent and direct output
+ "top_p": 0.85, # Slightly lower for more predictable responses
+ "top_k": 50,
+ "repetition_penalty": 1.2,
+ "max_new_tokens": 2048,
+ "num_return_sequences": 1
+ }
+
+ # Best-effort: register plain-text stop markers to curb excessive explanation.
+ # A standard GenerationConfig has no stopping_criteria attribute, so this block
+ # is a no-op unless a custom config provides one.
+ if hasattr(llm_model.generation_config, "stopping_criteria"):
+ thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"]
+ for stop in thinking_stops:
+ if stop not in llm_model.generation_config.stopping_criteria:
+ llm_model.generation_config.stopping_criteria.append(stop)
+
+ # Generate output
+ generated_ids = llm_model.generate(
+ **model_inputs,
+ **generation_params
+ )
+
+ # Extract output tokens
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+ # Get the raw output and strip any thinking process
+ lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+
+ # Enhanced thinking process removal - handle multiple formats
+ # First check for standard <think>...</think> tags (Qwen3-style reasoning output)
+ if "<think>" in lyrics and "</think>" in lyrics:
+ lyrics = lyrics.split("</think>")[1].strip()
+
+ # Check for alternative thinking indicators with improved detection
+ thinking_markers = [
+ "<think>", "</think>",
+ "[thinking]", "[/thinking]",
+ "I'll think step by step:",
+ "First, I need to understand",
+ "Let me think about",
+ "Let's tackle this query",
+ "Okay, let's tackle this query",
+ "First, I need to understand the requirements",
+ "Looking at the rhythm patterns"
+ ]
+
+ # First try to find clear section breaks
+ for marker in thinking_markers:
+ if marker in lyrics:
+ parts = lyrics.split(marker)
+ if len(parts) > 1:
+ lyrics = parts[-1].strip() # Take the last part after any thinking marker
+
+ # Look for long analytical sections followed by clear lyrics
+ analytical_patterns = [
+ "Let me analyze",
+ "I need to understand",
+ "The tempo is",
+ "First, let's look at",
+ "Wait, maybe",
+ "Considering the emotional tone",
+ "Starting with the first line",
+ "Let me check the examples"
+ ]
+
+ # Check if lyrics begin with any analytical patterns
+ for pattern in analytical_patterns:
+ if lyrics.startswith(pattern):
+ # Try to find where the actual lyrics start - look for common lyrics
markers + lyrics_markers = [ + "\n\n[Verse", + "\n\n[Chorus", + "\n\nVerse", + "\n\nChorus", + "\n\n[Verse 1]", + "\n\n[Intro]" + ] + + for marker in lyrics_markers: + if marker in lyrics: + lyrics = lyrics[lyrics.index(marker):].strip() + break + + # One last effort to clean up - if the text is very long and contains obvious thinking + # before getting to actual lyrics, try to find a clear starting point + if len(lyrics.split()) > 100 and "\n\n" in lyrics: + paragraphs = lyrics.split("\n\n") + for i, paragraph in enumerate(paragraphs): + # Look for typical song structure indicators in a paragraph + if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]): + lyrics = "\n\n".join(paragraphs[i:]) + break + + # Clean up any remaining thinking artifacts at the beginning + lines = lyrics.split('\n') + clean_lines = [] + lyrics_started = False + + for line in lines: + # Skip initial commentary/thinking lines until we hit what looks like lyrics + if not lyrics_started: + if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]): + lyrics_started = True + + if lyrics_started: + clean_lines.append(line) + + # Only use the cleaning logic if we found some actual lyrics + if clean_lines: + lyrics = '\n'.join(clean_lines) + + # Special handling for second-level templates + second_level_verification = None + if song_structure and "second_level" in song_structure and song_structure["second_level"]: + if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: + second_level_verification = song_structure["second_level"]["templates"] + if not isinstance(second_level_verification, list): + second_level_verification = None + + # Verify syllable counts with enhanced verification - pass second-level templates if available + if templates_for_verification: + # Convert any NumPy values to native types before verification - directly handle conversions + # Simple conversion for basic templates (non-recursive) + if isinstance(templates_for_verification, list): + safe_templates = [] + for template in templates_for_verification: + if isinstance(template, dict): + processed_template = {} + for k, v in template.items(): + if isinstance(v, np.ndarray): + if v.size == 1: + processed_template[k] = float(v.item()) + else: + processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v] + elif isinstance(v, np.number): + processed_template[k] = float(v) + else: + processed_template[k] = v + safe_templates.append(processed_template) + else: + safe_templates.append(template) + else: + safe_templates = templates_for_verification + + # Wrap verification in try-except to handle any potential string indices errors + try: + print(f"DEBUG: Calling verify_flexible_syllable_counts") + print(f"DEBUG: Type of lyrics: {type(lyrics)}") + print(f"DEBUG: Type of safe_templates: {type(safe_templates)}") + print(f"DEBUG: Type of second_level_verification: {type(second_level_verification)}") + + verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification) + print(f"DEBUG: Type of verified_lyrics: {type(verified_lyrics)}") + + except Exception as e: + print(f"ERROR in verify_flexible_syllable_counts: {str(e)}") + # Return the original lyrics if verification fails + return { + "lyrics": lyrics if isinstance(lyrics, str) else str(lyrics), + "rhythm_analysis": f"Error in rhythm analysis: {str(e)}", + 
"syllable_analysis": "Error performing syllable analysis", + "prompt_template": "Error generating prompt template" + } + + if isinstance(verified_lyrics, str) and "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: + # Extract the original lyrics (before the notes section) + original_lyrics = lyrics.split("[Note:")[0].strip() if isinstance(lyrics, str) else str(lyrics) + + # Extract the analysis + analysis = verified_lyrics.split("[Note:")[1] if "[Note:" in verified_lyrics else "" + + # If we have serious alignment issues, consider a refinement step + if "stress misalignments" in analysis and len(templates_for_verification) > 0: + # Add a refinement prompt with the specific analysis + refinement_prompt = f""" +You need to fix rhythm issues in these lyrics. Here's the analysis of the problems: + +{analysis} + +Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. +Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. + +Original lyrics: +{original_lyrics} + +Improved lyrics with fixed rhythm: +""" + # Format as a chat message for refinement + refinement_messages = [ + {"role": "user", "content": refinement_prompt} + ] + + # Use standard template for refinement (no thinking mode needed) + refinement_text = llm_tokenizer.apply_chat_template( + refinement_messages, + tokenize=False, + add_generation_prompt=True + ) + + try: + # Generate refined lyrics with more focus on rhythm alignment + refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) + + # Use stricter parameters for refinement + refinement_params = { + "do_sample": True, + "temperature": 0.4, # Lower temperature for more precise refinement + "top_p": 0.9, + "repetition_penalty": 1.3, + "max_new_tokens": 1024 + } + + refined_ids = llm_model.generate( + **refinement_inputs, + **refinement_params + ) + + # Extract refined lyrics + refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() + refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() + + # Verify the refined lyrics + try: + refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification) + + # Only use refined lyrics if they're better (fewer notes) + if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: + lyrics = refined_lyrics + elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): + lyrics = refined_verified_lyrics + else: + lyrics = verified_lyrics + except Exception as e: + print(f"Error in refined lyrics verification: {str(e)}") + lyrics = verified_lyrics + except Exception as e: + print(f"Error in lyrics refinement: {str(e)}") + lyrics = verified_lyrics + else: + # Minor issues, just use the verification notes + lyrics = verified_lyrics + else: + # No significant issues detected + lyrics = verified_lyrics + + # Check if we have the [RHYTHM_ANALYSIS_SECTION] tag + if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: + # Split at our custom marker + parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") + clean_lyrics = parts[0].strip() + rhythm_analysis = parts[1].strip() + + # Add our standard marker for compatibility with existing code + lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis + + # For backwards compatibility - if we have the old format, still handle it + elif "[Note: Potential rhythm mismatches" in lyrics: + # Keep it as 
is, the existing parsing code can handle this format + pass + else: + # No analysis found, add a minimal one + lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." + + # Before returning, add syllable analysis and prompt template + if isinstance(lyrics, str): + # Extract clean lyrics and analysis + if "[Note: Rhythm Analysis]" in lyrics: + clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() + rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] + elif "[Note: Potential rhythm mismatches" in lyrics: + clean_lyrics = lyrics.split("[Note:")[0].strip() + rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] + else: + clean_lyrics = lyrics + rhythm_analysis = "No rhythm analysis available" + + # Create syllable analysis + syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" + if templates_for_verification: + syllable_analysis += "Template Analysis:\n" + for i, template in enumerate(templates_for_verification): + if i < min(len(templates_for_verification), 30): # Limit to 30 to avoid overwhelming output + syllable_analysis += f"Line {i+1}:\n" + if isinstance(template, dict): + if "syllable_template" in template: + syllable_analysis += f" Template: {template['syllable_template']}\n" + if "syllable_count" in template: + syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" + elif isinstance(template, str): + syllable_analysis += f" Template: {template}\n" + syllable_analysis += "\n" + + if len(templates_for_verification) > 30: + syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" + + # Add second-level analysis if available + if second_level_verification: + syllable_analysis += "\nSecond-Level Template Analysis:\n" + for i, template in enumerate(second_level_verification): + if i < min(len(second_level_verification), 30): # Limit to 30 seconds + syllable_analysis += f"Second {i+1}: {template}\n" + + if len(second_level_verification) > 30: + syllable_analysis += f"... 
and {len(second_level_verification) - 30} more seconds\n"
+
+ # Add structure visualization to syllable analysis
+ syllable_analysis += "\n" + structure_visualization
+
+ # Create prompt template
+ prompt_template = "=== PROMPT TEMPLATE ===\n\n"
+ prompt_template += "Genre: " + genre + "\n"
+ prompt_template += f"Duration: {duration:.1f} seconds\n"
+ prompt_template += f"Tempo: {tempo:.1f} BPM\n"
+ prompt_template += f"Key: {key} {mode}\n"
+ prompt_template += f"Primary Emotion: {primary_emotion}\n"
+ prompt_template += f"Primary Theme: {primary_theme}\n\n"
+ prompt_template += "Syllable Guidance:\n" + syllable_guidance_text
+
+ # Return all components
+ return {
+ "lyrics": clean_lyrics,
+ "rhythm_analysis": rhythm_analysis,
+ "syllable_analysis": syllable_analysis,
+ "prompt_template": prompt_template
+ }
+
+ return {
+ "lyrics": lyrics,
+ "rhythm_analysis": "No rhythm analysis available",
+ "syllable_analysis": "No syllable analysis available",
+ "prompt_template": "No prompt template available"
+ }
+
+def process_audio(audio_file):
+ """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
+ if audio_file is None:
+ return "Please upload an audio file.", None, None
+
+ try:
+ print("Step 1/5: Extracting audio features...")
+ # Extract audio features
+ audio_data = extract_audio_features(audio_file)
+
+ print("Step 2/5: Verifying audio contains music...")
+ # First check if it's music
+ try:
+ is_music, ast_results = detect_music(audio_data)
+ except Exception as e:
+ print(f"Error in music detection: {str(e)}")
+ # ast_results is not bound when detect_music raises, so return an empty list
+ return f"Error in music detection: {str(e)}", None, []
+
+ if not is_music:
+ return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
+
+ print("Step 3/5: Classifying music genre...")
+ # Classify genre
+ try:
+ top_genres = classify_genre(audio_data)
+ # Format genre results using utility function
+ genre_results = format_genre_results(top_genres)
+ if not isinstance(top_genres, list) or len(top_genres) == 0:
+ # Fallback if we don't have valid top_genres
+ top_genres = [("rock", 1.0)]
+ except Exception as e:
+ print(f"Error in genre classification: {str(e)}")
+ return f"Error in genre classification: {str(e)}", None, ast_results
+
+ # Initialize default values
+ ast_results = ast_results if ast_results else []
+ song_structure = None
+ emotion_results = {
+ "emotion_analysis": {"primary_emotion": "Unknown"},
+ "theme_analysis": {"primary_theme": "Unknown"},
+ "rhythm_analysis": {"tempo": 0},
+ "tonal_analysis": {"key": "Unknown", "mode": ""},
+ "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
+ }
+
+ print("Step 4/5: Analyzing music emotions, themes, and structure...")
+ # Analyze music emotions and themes
+ try:
+ emotion_results = music_analyzer.analyze_music(audio_file)
+ except Exception as e:
+ print(f"Error in emotion analysis: {str(e)}")
+ # Continue with default emotion_results
+
+ # Calculate detailed song structure for better lyrics alignment
+ try:
+ # Load audio data
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
+
+ # Analyze beats and phrases for music-aligned lyrics
+ beats_info = detect_beats(y, sr)
+ sections_info = detect_sections(y, sr)
+
+ # Create structured segments for precise line-by-line matching
+ segments = []
+
+ # Try to break audio into meaningful segments based on sections
+ # Each segment
will correspond to one line of lyrics + if sections_info and len(sections_info) > 1: + min_segment_duration = 1.5 # Minimum 1.5 seconds per segment + + for section in sections_info: + section_start = section["start"] + section_end = section["end"] + section_duration = section["duration"] + + # For very short sections, add as a single segment + if section_duration < min_segment_duration * 1.5: + segments.append({ + "start": section_start, + "end": section_end + }) + else: + # Calculate ideal number of segments for this section + # based on its duration - aiming for 2-4 second segments + ideal_segment_duration = 3.0 # Target 3 seconds per segment + segment_count = max(1, int(section_duration / ideal_segment_duration)) + + # Create evenly-spaced segments within this section + segment_duration = section_duration / segment_count + for i in range(segment_count): + segment_start = section_start + i * segment_duration + segment_end = segment_start + segment_duration + segments.append({ + "start": segment_start, + "end": segment_end + }) + # If no good sections found, create segments based on beats + elif beats_info and len(beats_info["beat_times"]) > 4: + beats = beats_info["beat_times"] + time_signature = beats_info.get("time_signature", 4) + + # Target one segment per musical measure (typically 4 beats) + measure_size = time_signature + for i in range(0, len(beats), measure_size): + if i + 1 < len(beats): # Need at least 2 beats for a meaningful segment + measure_start = beats[i] + # If we have enough beats for the full measure + if i + measure_size < len(beats): + measure_end = beats[i + measure_size] + else: + # Use available beats and extrapolate for the last measure + if i > 0: + beat_interval = beats[i] - beats[i-1] + measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) + else: + measure_end = audio_data["duration"] + + segments.append({ + "start": measure_start, + "end": measure_end + }) + # Last resort: simple time-based segments + else: + # Create segments of approximately 3 seconds each + segment_duration = 3.0 + total_segments = max(4, int(audio_data["duration"] / segment_duration)) + segment_duration = audio_data["duration"] / total_segments + + for i in range(total_segments): + segment_start = i * segment_duration + segment_end = segment_start + segment_duration + segments.append({ + "start": segment_start, + "end": segment_end + }) + + # Create flexible structure with the segments + flexible_structure = { + "beats": beats_info, + "segments": segments + } + + # Create song structure object + song_structure = { + "beats": beats_info, + "sections": sections_info, + "flexible_structure": flexible_structure, + "syllables": [] + } + + # Add syllable counts to each section + for section in sections_info: + # Create syllable templates for sections + section_beats_info = { + "beat_times": [beat for beat in beats_info["beat_times"] + if section["start"] <= beat < section["end"]], + "tempo": beats_info.get("tempo", 120) + } + if "beat_strengths" in beats_info: + section_beats_info["beat_strengths"] = [ + strength for i, strength in enumerate(beats_info["beat_strengths"]) + if i < len(beats_info["beat_times"]) and + section["start"] <= beats_info["beat_times"][i] < section["end"] + ] + + # Get a syllable count based on section duration and tempo + syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) + + section_info = { + "type": section["type"], + "start": section["start"], + "end": section["end"], + "duration": section["duration"], + 
"syllable_count": syllable_count, + "beat_count": len(section_beats_info["beat_times"]) + } + + # Try to create a more detailed syllable template + if len(section_beats_info["beat_times"]) >= 2: + # Ensure top_genres is a list with at least one element + if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): + genre_name = top_genres[0][0] + else: + genre_name = "unknown" # Default genre if top_genres is invalid + + section_info["syllable_template"] = create_flexible_syllable_templates( + section_beats_info, + genre=genre_name + ) + + song_structure["syllables"].append(section_info) + + # Add second-level beat analysis + try: + # Get enhanced beat information with subbeats + subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) + + # Map beats to second-level windows + sec_map = map_beats_to_seconds( + subbeat_info["subbeat_times"], + audio_data["duration"] + ) + + # Create second-level templates + # Ensure top_genres is a list with at least one element + genre_name = "unknown" + if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): + genre_name = top_genres[0][0] + + second_level_templates = create_second_level_templates( + sec_map, + subbeat_info["tempo"], + genre_name # Use top genre with safety check + ) + + # Add to song structure + song_structure["second_level"] = { + "sec_map": sec_map, + "templates": second_level_templates + } + + except Exception as e: + print(f"Error in second-level beat analysis: {str(e)}") + # Continue without second-level data + + except Exception as e: + print(f"Error analyzing song structure: {str(e)}") + # Continue without song structure + + print("Step 5/5: Generating rhythmically aligned lyrics...") + # Generate lyrics based on top genre, emotion analysis, and song structure + try: + # Ensure top_genres is a list with at least one element before accessing + primary_genre = "unknown" + if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): + primary_genre, _ = top_genres[0] + + # CRITICAL FIX: Create a sanitized version of song_structure to prevent string indices error + sanitized_song_structure = None + if song_structure: + sanitized_song_structure = {} + + # Safely copy beats data + if "beats" in song_structure and isinstance(song_structure["beats"], dict): + sanitized_song_structure["beats"] = song_structure["beats"] + + # Safely copy sections data + if "sections" in song_structure and isinstance(song_structure["sections"], list): + sanitized_song_structure["sections"] = song_structure["sections"] + + # Safely handle flexible structure + if "flexible_structure" in song_structure and isinstance(song_structure["flexible_structure"], dict): + flex_struct = song_structure["flexible_structure"] + sanitized_flex = {} + + # Safely handle segments + if "segments" in flex_struct and isinstance(flex_struct["segments"], list): + sanitized_flex["segments"] = flex_struct["segments"] + + # Safely handle beats + if "beats" in flex_struct and isinstance(flex_struct["beats"], dict): + sanitized_flex["beats"] = flex_struct["beats"] + + sanitized_song_structure["flexible_structure"] = sanitized_flex + + # Safely handle syllables + if "syllables" in song_structure and isinstance(song_structure["syllables"], list): + sanitized_song_structure["syllables"] = song_structure["syllables"] + + # Safely handle second-level + if "second_level" in song_structure and isinstance(song_structure["second_level"], dict): + second_level = 
song_structure["second_level"] + sanitized_second = {} + + if "templates" in second_level and isinstance(second_level["templates"], list): + sanitized_second["templates"] = second_level["templates"] + + if "sec_map" in second_level and isinstance(second_level["sec_map"], list): + sanitized_second["sec_map"] = second_level["sec_map"] + + sanitized_song_structure["second_level"] = sanitized_second + + try: + print("Calling generate_lyrics function...") + lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, sanitized_song_structure) + print(f"Type of lyrics_result: {type(lyrics_result)}") + + # Handle both old and new return formats with robust type checking + if isinstance(lyrics_result, dict) and all(k in lyrics_result for k in ["lyrics"]): + lyrics = lyrics_result.get("lyrics", "No lyrics generated") + rhythm_analysis = lyrics_result.get("rhythm_analysis", "No rhythm analysis available") + syllable_analysis = lyrics_result.get("syllable_analysis", "No syllable analysis available") + prompt_template = lyrics_result.get("prompt_template", "No prompt template available") + else: + # Convert to string regardless of the type + lyrics = str(lyrics_result) if lyrics_result is not None else "No lyrics generated" + rhythm_analysis = "No detailed rhythm analysis available" + syllable_analysis = "No syllable analysis available" + prompt_template = "No prompt template available" + except Exception as inner_e: + print(f"Inner error in lyrics generation: {str(inner_e)}") + # Create a simplified fallback result with just the error message + lyrics = f"Error generating lyrics: {str(inner_e)}" + rhythm_analysis = "Error in rhythm analysis" + syllable_analysis = "Error in syllable analysis" + prompt_template = "Error in prompt template generation" + + except Exception as e: + print(f"Outer error in lyrics generation: {str(e)}") + lyrics = f"Error generating lyrics: {str(e)}" + rhythm_analysis = "No rhythm analysis available" + syllable_analysis = "No syllable analysis available" + prompt_template = "No prompt template available" + # Prepare results dictionary with additional rhythm analysis + results = { + "genre_results": genre_results, + "lyrics": lyrics, + "rhythm_analysis": rhythm_analysis, + "syllable_analysis": syllable_analysis, + "prompt_template": prompt_template, + "ast_results": ast_results + } + + return results + + except Exception as e: + error_msg = f"Error processing audio: {str(e)}" + print(error_msg) + return error_msg, None, [] + +def format_complete_beat_timeline(audio_file, lyrics=None): + """Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation""" + if audio_file is None: + return "Please upload an audio file to see beat timeline." 
+
+    try:
+        # Extract audio data
+        y, sr = load_audio(audio_file, SAMPLE_RATE)
+
+        # Get beat information
+        beats_info = detect_beats(y, sr)
+
+        # Helper function to convert numpy values to plain Python floats
+        def ensure_float(value):
+            if isinstance(value, np.ndarray) or isinstance(value, np.number):
+                return float(value)
+            return value
+
+        # Format the timeline with enhanced scientific headers
+        timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n"
+
+        tempo = ensure_float(beats_info['tempo'])
+        tempo_confidence = ensure_float(beats_info.get('tempo_confidence', 90.0))
+        time_sig_confidence = ensure_float(beats_info.get('time_sig_confidence', 85.0))
+        beat_periodicity = ensure_float(beats_info.get('beat_periodicity', 60 / tempo))
+
+        timeline += f"Tempo: {tempo:.1f} BPM (confidence: {tempo_confidence:.1f}%)\n"
+        timeline += f"Time Signature: {beats_info['time_signature']}/4 (confidence: {time_sig_confidence:.1f}%)\n"
+        timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
+        timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
+        timeline += f"Total Beats: {beats_info['beat_count']}\n"
+
+        # Add musicological context based on tempo classification
+        if tempo < 60:
+            tempo_class = "Largo (very slow)"
+        elif tempo < 76:
+            tempo_class = "Adagio (slow)"
+        elif tempo < 108:
+            tempo_class = "Andante (walking pace)"
+        elif tempo < 132:
+            tempo_class = "Moderato (moderate)"
+        elif tempo < 168:
+            tempo_class = "Allegro (fast)"
+        else:
+            tempo_class = "Presto (very fast)"
+
+        timeline += f"Tempo Classification: {tempo_class}\n\n"
+
+        # Create the table header; the third column holds the beat type
+        # (STRONG/MEDIUM/WEAK), not a raw strength value
+        timeline += "| Beat # | Time (s) | Beat Type | Syllable Pattern |\n"
+        timeline += "|--------|----------|-----------|------------------|\n"
+
+        # Add beat-by-beat information with improved classification
+        for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])):
+            # Convert numpy values to Python float if needed
+            time = ensure_float(time)
+            strength = ensure_float(strength)
+
+            # Determine the beat type from both strength and metrical position
+            metrical_position = i % beats_info['time_signature']
+
+            if metrical_position == 0:  # Downbeat (first beat of measure)
+                beat_type = "STRONG"
+                syllable_value = 1.5
+            elif metrical_position == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2:
+                # Secondary strong beat (e.g., beat 3 in 4/4 time)
+                beat_type = "MEDIUM" if strength < 0.8 else "STRONG"
+                syllable_value = 1.0 if strength < 0.8 else 1.5
+            else:
+                # Other beats - classified by actual strength value
+                if strength >= 0.8:
+                    beat_type = "STRONG"
+                    syllable_value = 1.5
+                elif strength >= 0.5:
+                    beat_type = "MEDIUM"
+                    syllable_value = 1.0
+                else:
+                    beat_type = "WEAK"
+                    syllable_value = 1.0
+
+            # Determine pattern letter based on beat type for consistency
+            if beat_type == "STRONG":
+                pattern = "S"
+            elif beat_type == "MEDIUM":
+                pattern = "m"
+            else:
+                pattern = "w"
+
+            # Add row to table with the correct beat classification
+            timeline += f"| {i+1:<6} | {time:<8.2f} | {beat_type:<9} | {pattern}:{syllable_value} |\n"
+
+        # No truncation - show all beats
+
+        # Add a visual timeline of beats
+        timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n"
+        timeline += "Each character represents 0.5 seconds. Beats are marked as:\n"
+        timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n"
+
+        # Calculate total duration and create time markers
+        if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
+            # Get the max value safely
+            max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']])
+            total_duration = max_beat_time + 2  # Add 2 seconds of padding
+        else:
+            total_duration = 30  # Default duration if no beats found
+
+        # Time markers and ruler must use the same scale as the beat line
+        # below: 2 characters per second, so 5 seconds = 10 characters
+        time_markers = ""
+        for i in range(0, int(total_duration) + 1, 5):
+            time_markers += f"{i:<10}"
+        timeline += time_markers + " (seconds)\n"
+
+        # Create a ruler for easier time tracking
+        ruler = ""
+        for i in range(0, int(total_duration) + 1):
+            ruler += "+-" if i % 5 == 0 else "--"  # Each second is 2 characters wide
+        timeline += ruler + "\n"
+
+        # Create a visualization of beats with symbols
+        beat_line = ["·"] * int(total_duration * 2)  # 2 characters per second
+
+        for i, time in enumerate(beats_info['beat_times']):
+            if i >= len(beats_info['beat_strengths']):
+                break
+
+            # Convert to float if it's a numpy array
+            time_val = ensure_float(time)
+
+            # Determine position in the timeline
+            pos = int(time_val * 2)  # Convert to position in the beat_line
+            if pos >= len(beat_line):
+                continue
+
+            # Determine beat type based on strength and position
+            strength = beats_info['beat_strengths'][i]
+            # Convert to float if it's a numpy array
+            strength = ensure_float(strength)
+
+            if i % beats_info['time_signature'] == 0:
+                beat_line[pos] = "S"  # Strong beat at start of measure
+            elif strength >= 0.8:
+                beat_line[pos] = "S"  # Strong beat
+            elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2:
+                beat_line[pos] = "m"  # Medium beat (e.g., 3rd beat in 4/4), matching the table logic
+            elif strength >= 0.5:
+                beat_line[pos] = "m"  # Medium beat
+            else:
+                beat_line[pos] = "w"  # Weak beat
+
+        # Emit the beat line unbroken so it stays aligned with the ruler above
+        timeline += "".join(beat_line) + "\n\n"
+
+        # Add measure markers
+        timeline += "=== MEASURE MARKERS ===\n\n"
+
+        # Create a list to track measure start times
+        measure_starts = []
+        for i, time in enumerate(beats_info['beat_times']):
+            if i % beats_info['time_signature'] == 0:  # Start of measure
+                # Convert to float if it's a numpy array
+                time_val = ensure_float(time)
+                measure_starts.append((i // beats_info['time_signature'] + 1, time_val))
+
+        # Format measure information
+        if measure_starts:
+            timeline += "| Measure # | Start Time | Duration |\n"
+            timeline += "|-----------|------------|----------|\n"
+
+            for i in range(len(measure_starts)):
+                measure_num, start_time = measure_starts[i]
+
+                # Calculate end time (start of next measure or end of song)
+                if i < len(measure_starts) - 1:
+                    end_time = measure_starts[i+1][1]
+                elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0:
+                    # Get the last beat time and convert to float if needed
+                    last_beat = beats_info['beat_times'][-1]
+                    end_time = ensure_float(last_beat)
+                else:
+                    end_time = start_time + 2.0  # Default 2 seconds if no next measure
+
+                duration = end_time - start_time
+
+                timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n"
+
+            # No truncation - show all measures
+
+        # Add phrase information
+        if 'phrases' in beats_info and beats_info['phrases']:
+            timeline += "\n=== 
MUSICAL PHRASES ===\n\n" + for i, phrase in enumerate(beats_info['phrases']): + # Show all phrases, not just the first 10 + if not phrase: + continue + + # Safely check phrase indices + if not (len(phrase) > 0 and len(beats_info['beat_times']) > 0): + continue + + start_beat = min(phrase[0], len(beats_info['beat_times'])-1) + end_beat = min(phrase[-1], len(beats_info['beat_times'])-1) + + # Convert to float if needed + phrase_start = ensure_float(beats_info['beat_times'][start_beat]) + phrase_end = ensure_float(beats_info['beat_times'][end_beat]) + + timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n" + + # Create syllable template for this phrase with simplified numpy handling + phrase_beats = { + "beat_times": [ensure_float(beats_info['beat_times'][j]) + for j in phrase if j < len(beats_info['beat_times'])], + "beat_strengths": [ensure_float(beats_info['beat_strengths'][j]) + for j in phrase if j < len(beats_info['beat_strengths'])], + "tempo": ensure_float(beats_info['tempo']), + "time_signature": beats_info['time_signature'], + "phrases": [list(range(len(phrase)))] + } + + template = create_flexible_syllable_templates(phrase_beats) + timeline += f" Syllable Template: {template}\n" + + # Create a visual representation of this phrase + if phrase_start < total_duration and phrase_end < total_duration: + # Create a timeline for this phrase + phrase_visualization = ["·"] * int(total_duration * 2) + + # Mark the phrase boundaries + start_pos = int(phrase_start * 2) + end_pos = int(phrase_end * 2) + + if start_pos < len(phrase_visualization): + phrase_visualization[start_pos] = "[" + + if end_pos < len(phrase_visualization): + phrase_visualization[end_pos] = "]" + + # Mark the beats in this phrase + for j in phrase: + if j < len(beats_info['beat_times']): + beat_time = ensure_float(beats_info['beat_times'][j]) + beat_pos = int(beat_time * 2) + + if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos: + # Determine beat type + if j % beats_info['time_signature'] == 0: + phrase_visualization[beat_pos] = "S" + elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: + phrase_visualization[beat_pos] = "m" + else: + phrase_visualization[beat_pos] = "w" + + # Format and add visualization + phrase_visual = "" + for k in range(0, len(phrase_visualization), 10): + phrase_visual += "".join(phrase_visualization[k:k+10]) + if k + 10 < len(phrase_visualization): + phrase_visual += " " + + timeline += f" Timeline: {phrase_visual}\n\n" + + # Add second-level script display + try: + # Get second-level beat information + subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) + duration = librosa.get_duration(y=y, sr=sr) + + # Map to seconds + sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration) + + # Create templates + templates = create_second_level_templates(sec_map, subbeat_info["tempo"]) + + # Add to timeline + timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n" + timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n" + timeline += "| Second | Beat Pattern | Lyric Content |\n" + timeline += "|--------|-------------|---------------|\n" + + # Get clean lyrics (without analysis notes) + clean_lyrics = lyrics + if isinstance(lyrics, str): + if "[Note: Rhythm Analysis]" in lyrics: + clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() + elif "[Note: Potential rhythm mismatches" in lyrics: + clean_lyrics = 
lyrics.split("[Note:")[0].strip() + + # Get lyric lines + lines = clean_lyrics.strip().split('\n') if clean_lyrics else [] + + for i, template in enumerate(templates): + # Get corresponding lyric line if available + lyric = lines[i] if i < len(lines) else "" + if lyric.startswith('[') and ']' in lyric: + lyric = "" # Skip section headers + + # Format nicely for display + timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n" + + # Add ASCII visualization of second-level beats + timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n" + timeline += "Each row represents ONE SECOND. Beat types:\n" + timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" + + for i, window in enumerate(sec_map): + beats = window["beats"] + + # Create ASCII visualization + beat_viz = ["·"] * 20 # 20 columns for visualization + + for beat in beats: + # Calculate position in visualization + pos = int(beat["relative_pos"] * 19) # Map 0-1 to 0-19 + if 0 <= pos < len(beat_viz): + # Set marker based on beat type + if beat["type"] == "main": + beat_viz[pos] = "S" + elif beat["strength"] >= 0.7: + beat_viz[pos] = "m" + else: + beat_viz[pos] = "w" + + # Get corresponding lyric + lyric = lines[i] if i < len(lines) else "" + if lyric.startswith('[') and ']' in lyric: + lyric = "" + + # Format visualization line + viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]" + if lyric: + viz_line += f" → {lyric[:40]}" + + timeline += viz_line + "\n" + + except Exception as e: + timeline += f"\n[Error generating second-level analysis: {str(e)}]" + + # Add a section showing alignment if lyrics were generated + if lyrics and isinstance(lyrics, str): + timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n" + # Remove rhythm analysis notes from lyrics if present + if "[Note:" in lyrics: + clean_lyrics = lyrics.split("[Note:")[0].strip() + else: + clean_lyrics = lyrics + + lines = clean_lyrics.strip().split('\n') + + # Show alignment for ALL lines, not just the first 10 + for i, line in enumerate(lines): + if not line.strip() or line.startswith('['): + continue + + timeline += f"Line: \"{line}\"\n" + + # Count syllables + syllable_count = count_syllables(line) + timeline += f" Syllables: {syllable_count}\n" + + # Create adaptive phrase matching - if we don't have a direct phrase match, + # try to find the closest matching phrase by time or measure + matching_phrase = None + if 'phrases' in beats_info and beats_info['phrases']: + # First try direct index matching + if i < len(beats_info['phrases']) and beats_info['phrases'][i]: + matching_phrase = beats_info['phrases'][i] + else: + # If no direct match, try to find a phrase by musical position + # Calculate which section of the song we're in + if len(beats_info['phrases']) > 0: + section_size = max(1, len(beats_info['phrases']) // 4) + section_index = min(i // section_size, 3) # Limit to 4 sections + section_start = section_index * section_size + section_end = min(section_start + section_size, len(beats_info['phrases'])) + + # Try to find a phrase within this section + candidate_phrases = [phrase for j, phrase in enumerate(beats_info['phrases']) + if section_start <= j < section_end and phrase] + + if candidate_phrases: + matching_phrase = candidate_phrases[min(i % section_size, len(candidate_phrases)-1)] + elif beats_info['phrases']: + # Fallback to cycling through available phrases + phrase_index = i % len(beats_info['phrases']) + if beats_info['phrases'][phrase_index]: + matching_phrase = beats_info['phrases'][phrase_index] + + # Show timing and 
detailed alignment if we found a matching phrase
+            if matching_phrase and beats_info['beat_times']:
+                start_beat = min(matching_phrase[0], len(beats_info['beat_times'])-1)
+                end_beat = min(matching_phrase[-1], len(beats_info['beat_times'])-1)
+
+                start_time = ensure_float(beats_info['beat_times'][start_beat])
+                end_time = ensure_float(beats_info['beat_times'][end_beat])
+
+                timeline += f"  Timing: {start_time:.2f}s - {end_time:.2f}s\n"
+
+                # Create an enhanced visualization of syllable alignment
+                timeline += "  Alignment: "
+
+                # Create a timeline focused on just this phrase
+                phrase_duration = end_time - start_time
+                syllable_viz = []
+
+                # Initialize with beat markers for this phrase
+                for j, beat_idx in enumerate(matching_phrase):
+                    if beat_idx < len(beats_info['beat_times']):
+                        beat_time = ensure_float(beats_info['beat_times'][beat_idx])
+
+                        # Handle edge case where phrase_duration is very small
+                        if phrase_duration > 0.001:  # Avoid division by very small numbers
+                            # Use non-linear mapping for more musical alignment;
+                            # natural speech rhythms are not strictly linear
+                            normalized_pos = (beat_time - start_time) / phrase_duration
+                            # Apply slight curve to map syllable positions more naturally
+                            curved_pos = min(1.0, normalized_pos * (1.0 + 0.1 * (normalized_pos - 0.5)))
+                            relative_pos = int(curved_pos * syllable_count)
+                        else:
+                            relative_pos = j  # Default to sequential if duration is too small
+
+                        # Ensure we have enough space
+                        while len(syllable_viz) <= relative_pos:
+                            syllable_viz.append("·")
+
+                        # Determine beat type with metrical context
+                        metrical_pos = beat_idx % beats_info['time_signature']
+                        beat_strength = beats_info['beat_strengths'][beat_idx] if beat_idx < len(beats_info['beat_strengths']) else 0
+
+                        if metrical_pos == 0 or beat_strength >= 0.8:
+                            syllable_viz[relative_pos] = "S"  # Strong beat
+                        elif metrical_pos == beats_info['time_signature'] // 2 or beat_strength >= 0.5:
+                            syllable_viz[relative_pos] = "m"  # Medium beat
+                        else:
+                            syllable_viz[relative_pos] = "w"  # Weak beat
+
+                # Fill in any gaps
+                while len(syllable_viz) < syllable_count:
+                    syllable_viz.append("·")
+
+                # Trim if too long
+                syllable_viz = syllable_viz[:syllable_count]
+
+                # Add alignment visualization with word stress analysis
+                timeline += "".join(syllable_viz) + "\n"
+
+                # Add word stress analysis
+                words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
+                if words:
+                    word_stresses = []
+                    cumulative_syllables = 0
+
+                    for word in words:
+                        syllable_count_word = count_syllables_for_word(word)
+                        stress_pattern = get_word_stress(word)
+
+                        # Ensure stress pattern is as long as syllable count
+                        while len(stress_pattern) < syllable_count_word:
+                            stress_pattern += "0"
+
+                        for j in range(syllable_count_word):
+                            stress_char = "S" if j < len(stress_pattern) and stress_pattern[j] == "1" else "_"
+                            word_stresses.append(stress_char)
+
+                        cumulative_syllables += syllable_count_word
+
+                    # Add word stress information
+                    timeline += "  Word stress: " + "".join(word_stresses) + "\n"
+
+                    # Check if stressed syllables align with strong beats
+                    alignment_score = 0
+                    alignment_issues = []
+
+                    for j, (stress, beat) in enumerate(zip(word_stresses, syllable_viz)):
+                        if (stress == "S" and beat == "S") or (stress != "S" and beat != "S"):
+                            alignment_score += 1
+                        elif stress == "S" and beat != "S":
+                            alignment_issues.append(f"Syllable {j+1} has stress but weak beat")
+                        elif stress != "S" and beat == "S":
+                            alignment_issues.append(f"Syllable {j+1} has no stress but strong beat")
+
+                    if word_stresses:
+                        alignment_percent = (alignment_score / len(word_stresses)) * 100
+                        timeline += f"  Stress alignment: {alignment_percent:.1f}% match\n"
+
+                        if alignment_issues and len(alignment_issues) <= 3:
+                            timeline += "  Issues: " + "; ".join(alignment_issues) + "\n"
+            else:
+                timeline += "  No matching phrase found for alignment\n"
+
+            timeline += "\n"
+
+        return timeline
+
+    except Exception as e:
+        print(f"Error generating complete beat timeline: {str(e)}")
+        return f"Error generating complete beat timeline: {str(e)}"
+
+def display_results(audio_file):
+    """Process an audio file and return formatted results for display in the UI."""
+    # Default error response
+    error_response = ("Please upload an audio file.",
+                      "No emotion analysis available.",
+                      "No audio classification available.",
+                      "No lyrics generated.",
+                      "No beat timeline available.")
+
+    if audio_file is None:
+        return error_response
+
+    try:
+        # Process audio and get results
+        results = process_audio(audio_file)
+
+        # Check if we got an error message
+        if isinstance(results, str) and "Error" in results:
+            return results, *error_response[1:]
+        elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]:
+            return results[0], *error_response[1:]
+
+        # Extract results
+        if isinstance(results, dict):
+            # New format
+            genre_results = results.get("genre_results", "Genre classification failed")
+            lyrics = results.get("lyrics", "Lyrics generation failed")
+            ast_results = results.get("ast_results", [])
+        else:
+            # Old tuple format
+            genre_results, lyrics, ast_results = results
+
+        # Get clean lyrics (without analysis notes)
+        clean_lyrics = lyrics
+        if isinstance(lyrics, str):
+            if "[Note: Rhythm Analysis]" in lyrics:
+                clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip()
+            elif "[Note: Potential rhythm mismatches" in lyrics:
+                clean_lyrics = lyrics.split("[Note:")[0].strip()
+
+        # Generate beat timeline - use the complete timeline function that shows all beats
+        beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics)
+
+        # Format emotion analysis results
+        emotion_text = "No emotion analysis available."
+        try:
+            emotion_results = music_analyzer.analyze_music(audio_file)
+            emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
+                            f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
+                            f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
+                            f"Primary Theme: {emotion_results['summary']['primary_theme']}")
+
+            # Keep basic beat analysis without section information
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            beats_info = detect_beats(y, sr)
+
+            # Add beat analysis info
+            emotion_text += "\n\nBeat Analysis:\n"
+            emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n"
+            emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
+            emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
+
+        except Exception as e:
+            print(f"Error in emotion analysis: {str(e)}")
+
+        # Format audio classification results
+        ast_text = "No valid audio classification results available."
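+        # The loop below assumes ast_results follows the standard Hugging Face
+        # audio-classification pipeline output, i.e. a list of dicts such as
+        #   [{"label": "Music", "score": 0.97}, {"label": "Speech", "score": 0.02}, ...]
+        # which is why each entry is indexed with result['label'] and result['score'].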
+ if ast_results and isinstance(ast_results, list): + ast_text = "Audio Classification Results:\n" + for result in ast_results[:5]: # Show top 5 results + ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" + + # Return all results + return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline + + except Exception as e: + error_msg = f"Error: {str(e)}" + print(error_msg) + return error_msg, *error_response[1:] + +# Create enhanced Gradio interface with tabs for better organization +with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: + gr.Markdown("# Music Genre Classifier & Lyrics Generator") + gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") + + with gr.Row(): + with gr.Column(scale=1): + audio_input = gr.Audio(label="Upload Music", type="filepath") + submit_btn = gr.Button("Analyze & Generate", variant="primary") + + # Add genre info box + with gr.Accordion("About Music Genres", open=False): + gr.Markdown(""" + The system recognizes various music genres including: + - Pop, Rock, Hip-Hop, R&B + - Electronic, Dance, Techno, House + - Jazz, Blues, Classical + - Folk, Country, Acoustic + - Metal, Punk, Alternative + - And many others! + + For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. + """) + + with gr.Column(scale=2): + # Use tabs for better organization of outputs + with gr.Tabs(): + with gr.TabItem("Analysis Results"): + genre_output = gr.Textbox(label="Detected Genres", lines=4) + + # Create 2 columns for emotion and audio classification + with gr.Row(): + with gr.Column(): + emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) + with gr.Column(): + ast_output = gr.Textbox(label="Audio Classification", lines=8) + + with gr.TabItem("Generated Lyrics"): + lyrics_output = gr.Textbox(label="Lyrics", lines=18) + + with gr.TabItem("Beat & Syllable Timeline"): + beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40) + + # Connect the button to the display function with updated outputs + submit_btn.click( + fn=display_results, + inputs=[audio_input], + outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output] + ) + + # Enhanced explanation of how the system works + with gr.Accordion("How it works", open=False): + gr.Markdown(""" + ## Advanced Lyrics Generation Process + + 1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. + + 2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. + + 3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. + + 4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: + - Strong and weak beats + - Natural phrase boundaries + - Time signature and tempo variations + - Beat subdivisions (half and quarter beats) + + 5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating precise templates for perfect alignment. + + 6. 
**Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect: + - Beat stress patterns (strong, medium, weak) + - Appropriate syllable counts based on tempo + - Genre-specific rhythmic qualities + - Half-beat and quarter-beat subdivisions + + 7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that: + - Match the emotional quality of the music + - Follow the precise syllable templates for each second + - Align stressed syllables with strong beats + - Maintain genre-appropriate style and themes + + 8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: + - Syllable count accuracy + - Stress alignment with strong beats + - Word stress patterns + - Second-by-second alignment precision + + 9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. + + This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. + """) + +# Launch the app +demo.launch() \ No newline at end of file