import gradio as gr
import librosa
import numpy as np
import requests
from video_generator import generate_video
def extract_lyrics(api_response):
    """Split the Deepgram word timings into roughly 10-second lyric chunks."""
    words_timing = api_response["results"]["channels"][0]["alternatives"][0]["words"]
    lyrics_with_timing = []
    CHUNK_DURATION = 10  # seconds of lyrics per chunk
    current_chunk = ""
    current_chunk_start_time = 0
    for word_info in words_timing:
        word = word_info["word"]
        start_time = word_info["start"]
        # Close the current chunk once a word starts past its 10-second window.
        if start_time >= current_chunk_start_time + CHUNK_DURATION:
            end_time = word_info["end"]
            lyrics_with_timing.append((current_chunk_start_time, end_time, current_chunk.strip()))
            current_chunk = ""
            current_chunk_start_time += CHUNK_DURATION
        current_chunk += " " + word
    # Append the final (possibly shorter) chunk.
    lyrics_with_timing.append((current_chunk_start_time, words_timing[-1]["end"], current_chunk.strip()))
    return lyrics_with_timing
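# A minimal sketch of the Deepgram response shape that extract_lyrics expects
# (field names taken from the lookups above; the values are illustrative only):
#
#   {"results": {"channels": [{"alternatives": [{"words": [
#       {"word": "hello", "start": 0.0, "end": 0.4},
#       {"word": "world", "start": 0.5, "end": 0.9},
#   ]}]}]}}
#
# which would produce chunks like [(0, 0.9, "hello world")].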
def send_to_deepgram(audio_file_path):
    # Update with your Deepgram API endpoint and key
    endpoint = "https://api.deepgram.com/v1/listen"
    headers = {
        "Authorization": "Token 2114fe20a6bdccf930f9a7fd1931958f063745d7"
    }
    with open(audio_file_path, 'rb') as audio_file:
        audio_data = audio_file.read()
    response = requests.post(endpoint, headers=headers, data=audio_data)
    response_json = response.json()
    print("Deepgram API Response:", response_json)  # Log the response
    return response_json
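# Hard-coding the key works for a quick demo, but it is safer to read it from
# the environment, e.g. (assuming a DEEPGRAM_API_KEY variable is set):
#   import os
#   headers = {"Authorization": f"Token {os.environ['DEEPGRAM_API_KEY']}"}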
def analyze_audio(audio_file_path):
    print("Analyzing audio...")  # Log start of analysis
    last_frame = None
    y, sr = librosa.load(audio_file_path)
    chunk_length = 10 * sr  # 10 seconds of samples per chunk
    moods = []
    deepgram_response = send_to_deepgram(audio_file_path)
    lyrics_chunks = extract_lyrics(deepgram_response)
    # First pass: classify the mood of every chunk so each prompt can refer to
    # the previous and next moods.
    for start in range(0, len(y), chunk_length):
        chunk = y[start:start + chunk_length]
        mood, _, _, _, _, _ = analyze_chunk(chunk)
        moods.append(mood)
    # Second pass: build a prompt and generate a video for each chunk.
    for i, start in enumerate(range(0, len(y), chunk_length)):
        print(f"Analyzing chunk {i + 1}...")  # Log chunk analysis
        chunk = y[start:start + chunk_length]
        lyrics_summary = lyrics_chunks[i] if i < len(lyrics_chunks) else 'Instrumental or silence'
        previous_mood = moods[i - 1] if i > 0 else None
        current_mood = moods[i]
        next_mood = moods[i + 1] if i < len(moods) - 1 else None
        _, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean = analyze_chunk(chunk)
        prompt = generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary)
        description = f"Chunk starting at {start / sr} seconds:<br>Mood: {current_mood}<br>Video Prompt: {prompt}<br><br>"
        print(f"Generating video for chunk {i + 1}...")
        # lyrics_chunks already holds the (start, end, text) tuples from extract_lyrics.
        video = generate_video(lyrics_chunks, last_frame)
        # last_frame = extract_last_frame(video)
        print(f"Description for chunk {i + 1}: {description}")
        print(f"Video for chunk {i + 1}: {video}")
        # Yield the result for this chunk
        yield (description, video)
def analyze_chunk(chunk):
    # All librosa calls use the default sample rate (22050 Hz), which matches librosa.load above.
    tempo, _ = librosa.beat.beat_track(y=chunk)          # estimated tempo in BPM
    chroma_mean = np.mean(librosa.feature.chroma_stft(y=chunk))            # average pitch-class energy
    spectral_contrast_mean = np.mean(librosa.feature.spectral_contrast(y=chunk))  # average peak/valley contrast
    zero_crossing_rate_mean = np.mean(librosa.feature.zero_crossing_rate(chunk))  # average zero-crossing rate
    mfcc_mean = np.mean(librosa.feature.mfcc(y=chunk))                     # average MFCC value
    mood = analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean)
    return mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean
def analyze_mood(tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean):
    # Happy Mood
    if tempo > 110 and chroma_mean > 0.4:
        return 'Happy'
    # Sad Mood
    elif tempo < 90 and chroma_mean < 0.5 and mfcc_mean < 0:
        return 'Sad'
    # Energetic Mood
    elif tempo > 130 and zero_crossing_rate_mean > 0.05:
        return 'Energetic'
    # Relaxed Mood
    elif tempo < 100 and chroma_mean > 0.3 and spectral_contrast_mean > 15:
        return 'Relaxed'
    # Romantic Mood
    elif tempo < 100 and chroma_mean > 0.5:
        return 'Romantic'
    # Nostalgic Mood
    elif tempo < 100 and chroma_mean < 0.5 and spectral_contrast_mean < 25:
        return 'Nostalgic'
    # Tense Mood
    elif 100 <= tempo <= 130 and chroma_mean < 0.5 and spectral_contrast_mean > 20:
        return 'Tense'
    # Dreamy Mood
    elif tempo < 80 and chroma_mean > 0.4:
        return 'Dreamy'
    # Aggressive Mood
    elif tempo > 140 and zero_crossing_rate_mean > 0.08:
        return 'Aggressive'
    # Neutral Mood (Catch-all)
    else:
        return 'Neutral'
def describe_tempo(tempo):
    if tempo < 60:
        return "a very slow"
    elif tempo < 90:
        return "a slow"
    elif tempo < 120:
        return "a moderate"
    elif tempo < 150:
        return "a lively"
    else:
        return "a fast"
def generate_video_prompt(previous_mood, current_mood, next_mood, tempo, chroma_mean, spectral_contrast_mean, zero_crossing_rate_mean, mfcc_mean, lyrics_summary):
    rhythm_description = "energetic rhythm" if zero_crossing_rate_mean > 0.05 else "smooth rhythm"
    tonal_quality = "bright tones" if chroma_mean > 0.5 else "mellow tones"
    spectral_description = "sharp contrasts" if spectral_contrast_mean > 20 else "soft contrasts"
    tempo_description = describe_tempo(tempo)
    transition_description = ""
    if previous_mood:
        transition_description += f"Transition from a {previous_mood.lower()} mood. "
    if next_mood:
        transition_description += f"Prepare to transition to a {next_mood.lower()} mood. "
    prompt = (
        f"Essence of a {current_mood.lower()} mood. "
        f"{transition_description}"
        f"Showcase a scene with {rhythm_description}, {tonal_quality}, and {spectral_description}. "
        f"Visualize {tempo_description} tempo. "
        f"Narrative based on the lyrics: '{lyrics_summary}'. "
        f"Emphasize the themes and emotions conveyed in the song."
    )
    return prompt
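# Example of the kind of prompt this builds (illustrative values, assuming
# current_mood='Happy', previous_mood='Sad', next_mood=None, tempo=125,
# chroma_mean=0.6, spectral_contrast_mean=18, zero_crossing_rate_mean=0.06):
#
#   "Essence of a happy mood. Transition from a sad mood. Showcase a scene with
#    energetic rhythm, bright tones, and soft contrasts. Visualize a lively tempo.
#    Narrative based on the lyrics: '...'. Emphasize the themes and emotions
#    conveyed in the song."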
# Define Gradio interface
gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.HTML(), gr.Video()],
).launch()