import gradio as gr
import librosa
import numpy as np
import requests
from video_generator import generate_video
def extract_lyrics(api_response):
    """Split the Deepgram word timings into roughly 10-second lyric chunks."""
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    lyrics_with_timing = []
    CHUNK_DURATION = 10  # seconds of lyrics per chunk
    current_chunk = ""
    current_chunk_start_time = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        # Close the current chunk once a word starts past its 10-second window.
        if start_time >= current_chunk_start_time + CHUNK_DURATION:
            end_time = word_info["end"]
            lyrics_with_timing.append((current_chunk_start_time, end_time, current_chunk.strip()))
            current_chunk = ""
            current_chunk_start_time += CHUNK_DURATION
        current_chunk += " " + word
    # Append the final (possibly shorter) chunk.
    lyrics_with_timing.append((current_chunk_start_time, words_timing[-1]["end"], current_chunk.strip()))
    return lyrics_with_timing
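# A minimal sketch of the Deepgram response shape that extract_lyrics expects
# (field names taken from the lookups above; the values are illustrative only):
#
#   {"results": {"channels": [{"alternatives": [{"words": [
#       {"word": "hello", "start": 0.0, "end": 0.4},
#       {"word": "world", "start": 0.5, "end": 0.9},
#   ]}]}]}}
#
# which would produce chunks like [(0, 0.9, "hello world")].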
def send_to_deepgram(audio_file_path):
    # Update with your Deepgram API endpoint and key
    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {
        "Authorization": "Token 2114fe20a6bdccf930f9a7fd1931958f063745d7"
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    response = requests.post(endpoint, headers=headers, data=audio_data)
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json
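# Hard-coding the key works for a quick demo, but it is safer to read it from
# the environment, e.g. (assuming a DEEPGRAM_API_KEY variable is set):
#   import os
#   headers = {"Authorization": f"Token {os.environ['DEEPGRAM_API_KEY']}"}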
def analyze_audio(audio_file_path):
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds of samples per chunk
    moods = []
    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)
    # First pass: classify the mood of every chunk so each prompt can refer to
    # the previous and next moods.
    for start in range(0, len(y), chunk_length):
        chunk = y[start:start + chunk_length]
        mood, _, _, _, _, _ = analyze_chunk(chunk)
        moods.append(mood)
    # Second pass: build a prompt and generate a video for each chunk.
    for i, start in enumerate(range(0, len(y), chunk_length)):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        chunk = y[start:start + chunk_length]
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyze_chunk(chunk)
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        # lyrics_chunks already holds the (start, end, text) tuples from extract_lyrics.
        video = generate_video(lyrics_chunks, last_frame)
        # last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")
        # Yield the result for this chunk
        yield (description, video)
def analyze_chunk(chunk):
    # All librosa calls use the default sample rate (22050 Hz), which matches librosa.load above.
    tempo, _ = librosa.beat.beat_track(y=chunk)          # estimated tempo in BPM
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=chunk))            # average pitch-class energy
    spectral_contrast_mean = np.mean(librosa.feature.spectral_contrast(y=chunk))  # average peak/valley contrast
    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(chunk))  # average zero-crossing rate
    mfcc_mean = np.mean(librosa.feature.mfcc(y=chunk))                     # average MFCC value
    mood = analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean)
    return mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Aggressive Mood
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Neutral Mood (Catch-all)
    else:
        return 'Neutral'
def describe_tempo(tempo):
    if tempo < 60:
        return "a very slow"
    elif tempo < 90:
        return "a slow"
    elif tempo < 120:
        return "a moderate"
    elif tempo < 150:
        return "a lively"
    else:
        return "a fast"
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    rhythm_description = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tonal_quality = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    spectral_description = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    tempo_description = describe_tempo(tempo)
    transition_description = ""
    if previous_mood:
        transition_description += f"Transition from a {previous_mood.lower()} mood. "
    if next_mood:
        transition_description += f"Prepare to transition to a {next_mood.lower()} mood. "
    prompt = (
        f"Essence of a {current_mood.lower()} mood. "
        f"{transition_description}"
        f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        f"Visualize {tempo_description} tempo. "
        f"Narrative based on the lyrics: '{lyrics_summary}'. "
        f"Emphasize the themes and emotions conveyed in the song."
    )
    return prompt
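# Example of the kind of prompt this builds (illustrative values, assuming
# current_mood='Happy', previous_mood='Sad', next_mood=None, tempo=125,
# chroma_mean=0.6, spectral_contrast_mean=18, zero_crossing_rate_mean=0.06):
#
#   "Essence of a happy mood. Transition from a sad mood. Showcase a scene with
#    energetic rhythm, bright tones, and soft contrasts. Visualize a lively tempo.
#    Narrative based on the lyrics: '...'. Emphasize the themes and emotions
#    conveyed in the song."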
# Define Gradio interface
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()