import gradio as gr
import librosa
import numpy as np
import requests

from video_generator import generate_video


def extract_lyrics(api_response):
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    lyrics_with_timing = []
    CHUNK_DURATION = 10  # seconds of lyrics per chunk, matching the audio chunk length
    current_chunk = ""
    current_chunk_start_time = 0
    previous_word_end = 0

    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]

        # Once a word starts past the current 10-second window, close the chunk
        # at the end of the previous word and start a new one.
        if start_time >= current_chunk_start_time + CHUNK_DURATION:
            lyrics_with_timing.append((current_chunk_start_time, previous_word_end, current_chunk.strip()))
            current_chunk = ""
            current_chunk_start_time += CHUNK_DURATION

        current_chunk += " " + word
        previous_word_end = word_info["end"]

    # Flush whatever remains after the last word.
    lyrics_with_timing.append((current_chunk_start_time, words_timing[-1]["end"], current_chunk.strip()))
    return lyrics_with_timing


def send_to_deepgram(audio_file_path):
    # Update with your Deepgram API endpoint and key
    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {
        "Authorization": "Token 2114fe20a6bdccf930f9a7fd1931958f063745d7",
        # Depending on the audio format, a Content-Type header (e.g. "audio/wav") may also be required.
    }

    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()

    response = requests.post(endpoint, headers=headers, data=audio_data)
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json


def analyze_audio(audio_file_path):
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds of samples per chunk
    moods = []

    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)

    # First pass: classify the mood of every chunk so each chunk knows its neighbours.
    for start in range(0, len(y), chunk_length):
        chunk = y[start:start + chunk_length]
        mood, _, _, _, _, _ = analyze_chunk(chunk)
        moods.append(mood)

    # Second pass: build a prompt and a video for each chunk, streaming results as they are ready.
    for i, start in enumerate(range(0, len(y), chunk_length)):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        chunk = y[start:start + chunk_length]
        # Use the text portion of this chunk's (start, end, text) lyrics tuple.
        lyrics_summary = lyrics_chunks[i][2] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None

        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyze_chunk(chunk)
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean,
                                       spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = (
            f"Chunk starting at {start / sr} seconds:\n"
            f"Mood: {current_mood}\n"
            f"Video Prompt: {prompt}\n\n"
        )

        print(f"Generating video for chunk {i + 1}...")
        video = generate_video(lyrics_chunks, last_frame)  # reuse the timed lyrics extracted above
        # last_frame = extract_last_frame(video)

        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")

        # Yield the result for this chunk so Gradio can update the outputs incrementally.
        yield (description, video)


def analyze_chunk(chunk):
    tempo, _ = librosa.beat.beat_track(y=chunk)
    tempo = float(np.atleast_1d(tempo)[0])  # newer librosa versions may return a 1-element array
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=chunk))
    spectral_contrast_mean = np.mean(librosa.feature.spectral_contrast(y=chunk))
    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(chunk))
    mfcc_mean = np.mean(librosa.feature.mfcc(y=chunk))
    mood = analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean)
    return mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean


def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    # The first matching rule wins, so branch order matters.
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Aggressive Mood
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Neutral Mood (catch-all)
    else:
        return 'Neutral'


def describe_tempo(tempo):
    if tempo < 60:
        return "a very slow"
    elif tempo < 90:
        return "a slow"
    elif tempo < 120:
        return "a moderate"
    elif tempo < 150:
        return "a lively"
    else:
        return "a fast"


def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean,
                          zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    rhythm_description = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tonal_quality = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    spectral_description = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    tempo_description = describe_tempo(tempo)

    transition_description = ""
    if previous_mood:
        transition_description += f"Transition from a {previous_mood.lower()} mood. "
    if next_mood:
        transition_description += f"Prepare to transition to a {next_mood.lower()} mood. "

    prompt = (
        f"Essence of a {current_mood.lower()} mood. "
        f"{transition_description}"
        f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        f"Visualize {tempo_description} tempo. "
        f"Narrative based on the lyrics: '{lyrics_summary}'. "
        f"Emphasize the themes and emotions conveyed in the song."
    )
    return prompt


# Define the Gradio interface: streams an HTML description and the generated video per chunk.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()
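

# --- Illustrative sketch (not part of the app above) ----------------------------
# A minimal, hypothetical example of the Deepgram-style payload that
# `extract_lyrics` expects (results -> channels -> alternatives -> words, each
# word carrying "word", "start" and "end"). The timings are made up and the
# helper name is invented for illustration; it is a smoke test for the
# 10-second chunking logic that does not call the API. Run it manually from a
# Python shell if you want to inspect the chunking behaviour.
def _example_extract_lyrics_usage():
    mock_response = {
        "results": {
            "channels": [{
                "alternatives": [{
                    "words": [
                        {"word": "hello", "start": 0.5, "end": 0.9},
                        {"word": "world", "start": 1.2, "end": 1.6},
                        {"word": "again", "start": 11.0, "end": 11.4},
                    ]
                }]
            }]
        }
    }
    # Expected: one chunk for the 0-10 s window ("hello world") and one for 10-20 s ("again").
    for chunk_start, chunk_end, text in extract_lyrics(mock_response):
        print(f"{chunk_start:5.1f}s - {chunk_end:5.1f}s: {text}")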