import os

import gradio as gr
import librosa
import numpy as np
import requests

from video_generator import generate_video
def extract_lyrics(api_response):
    """Group Deepgram word timings into lyric chunks of CHUNK_DURATION seconds."""
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    lyrics_with_timing = []
    CHUNK_DURATION = 10  # seconds, matching the audio analysis chunk length
    current_chunk = ""
    current_chunk_start_time = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        # Close the window once a word starts past it; looping also pads silent
        # windows with empty entries, so chunk indices stay aligned with the
        # 10-second audio chunks downstream.
        while start_time >= current_chunk_start_time + CHUNK_DURATION:
            lyrics_with_timing.append(
                (current_chunk_start_time,
                 current_chunk_start_time + CHUNK_DURATION,
                 current_chunk.strip())
            )
            current_chunk = ""
            current_chunk_start_time += CHUNK_DURATION
        current_chunk += " " + word
    # Flush the final (possibly partial) chunk.
    lyrics_with_timing.append(
        (current_chunk_start_time, words_timing[-1]["end"], current_chunk.strip())
    )
    return lyrics_with_timing
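# For reference, extract_lyrics assumes the Deepgram response is shaped like
# the minimal sketch below (keys taken from the lookups above; a real payload
# carries extra fields such as "confidence"):
#
#   {
#     "results": {
#       "channels": [{
#         "alternatives": [{
#           "words": [
#             {"word": "hello", "start": 0.48, "end": 0.96},
#             ...
#           ]
#         }]
#       }]
#     }
#   }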
def send_to_deepgram(audio_file_path):
    # Update with your Deepgram API endpoint if needed; the key is read from
    # the environment instead of being hardcoded in the source.
    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {
        "Authorization": f"Token {os.environ['DEEPGRAM_API_KEY']}"
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    response = requests.post(endpoint, headers=headers, data=audio_data)
    response.raise_for_status()  # Surface HTTP errors early
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json
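# The environment variable name above is an assumption; set it before
# launching the app, e.g.:
#   export DEEPGRAM_API_KEY="your-deepgram-key"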
def analyze_audio(audio_file_path):
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds of samples
    moods = []
    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)
    # First pass: classify every chunk's mood so each prompt can reference
    # the previous and next moods.
    for start in range(0, len(y), chunk_length):
        chunk = y[start:start + chunk_length]
        mood, _, _, _, _, _ = analyze_chunk(chunk, sr)
        moods.append(mood)
    # Second pass: build a prompt and a video for each chunk.
    for i, start in enumerate(range(0, len(y), chunk_length)):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        chunk = y[start:start + chunk_length]
        # Use only the text of the (start, end, text) tuple.
        lyrics_summary = lyrics_chunks[i][2] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyze_chunk(chunk, sr)
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        video = generate_video(lyrics_chunks, last_frame)
        # last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")
        # Yield the result for this chunk so the UI updates incrementally
        yield (description, video)
def analyze_chunk(chunk, sr):
    # Pass the real sample rate so features aren't silently computed
    # against librosa's default of 22050 Hz.
    tempo, _ = librosa.beat.beat_track(y=chunk, sr=sr)
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=chunk, sr=sr))
    spectral_contrast_mean = np.mean(librosa.feature.spectral_contrast(y=chunk, sr=sr))
    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(chunk))
    mfcc_mean = np.mean(librosa.feature.mfcc(y=chunk, sr=sr))
    mood = analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean)
    return mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean
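# Hypothetical usage, assuming y and sr come from librosa.load as above:
#   mood, tempo, *_ = analyze_chunk(y[:10 * sr], sr)
#   print(mood, tempo)  # e.g. 'Energetic', 132.5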
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Aggressive Mood
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Neutral Mood (catch-all)
    else:
        return 'Neutral'
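# Note: the branches are checked in order, so overlapping conditions resolve
# to the first match; e.g. tempo=135, chroma_mean=0.45,
# zero_crossing_rate_mean=0.06 (hypothetical values) returns 'Happy',
# not 'Energetic'.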
def describe_tempo(tempo):
    if tempo < 60:
        return "a very slow"
    elif tempo < 90:
        return "a slow"
    elif tempo < 120:
        return "a moderate"
    elif tempo < 150:
        return "a lively"
    else:
        return "a fast"
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    rhythm_description = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tonal_quality = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    spectral_description = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    tempo_description = describe_tempo(tempo)
    transition_description = ""
    if previous_mood:
        transition_description += f"Transition from a {previous_mood.lower()} mood. "
    if next_mood:
        transition_description += f"Prepare to transition to a {next_mood.lower()} mood. "
    prompt = (
        f"Essence of a {current_mood.lower()} mood. "
        f"{transition_description}"
        f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        f"Visualize {tempo_description} tempo. "
        f"Narrative based on the lyrics: '{lyrics_summary}'. "
        f"Emphasize the themes and emotions conveyed in the song."
    )
    return prompt
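# Hypothetical output for a 'Happy' chunk preceded by a 'Relaxed' one
# (tempo 125, chroma 0.6, spectral contrast 18, zero-crossing rate 0.06):
#   "Essence of a happy mood. Transition from a relaxed mood. Showcase a scene
#    with energetic rhythm, bright tones, and soft contrasts. Visualize a
#    lively tempo. Narrative based on the lyrics: '...'. Emphasize the themes
#    and emotions conveyed in the song."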
# Define the Gradio interface; analyze_audio is a generator, so the outputs
# stream as each chunk is processed.
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()