import gradio as gr
import spaces
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import tempfile
import os
import torch
from gradio_client import Client, handle_file
import random
import time
import io
from pydub import AudioSegment

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MelodyFlow variation mapping - map semantic variation names to text prompts
VARIATION_PROMPTS = {
    'accordion_folk': 'folk accordion melody with traditional folk instruments',
    'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
    'piano_classical': 'classical piano with orchestral arrangement',
    'celtic': 'celtic harp and flute with traditional irish instruments',
    'strings_quartet': 'string quartet with violin, viola, cello arrangement',
    'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
    'synth_modern': 'modern synthesizer with contemporary electronic production',
    'synth_edm': 'edm synthesizer with dance electronic beats',
    'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
    'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
    'rock_band': 'rock band with electric guitar, bass, and drums',
    'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
    'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
    'chiptune': '8-bit chiptune with retro video game sounds',
    'steel_drums': 'steel drums with caribbean tropical percussion',
    'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
    'music_box': 'music box with delicate mechanical melody',
    'trap_808': 'trap beats with heavy 808 drums and hi-hats',
    'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
    'boom_bap': 'boom bap hip-hop with classic drum breaks',
    'percussion_ensemble': 'percussion ensemble with varied drum instruments',
    'future_bass': 'future bass with melodic drops and vocal chops',
    'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
    'melodic_techno': 'melodic techno with driving beats and emotional melodies',
    'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
    'glitch_hop': 'glitch hop with broken beats and digital artifacts',
    'digital_disruption': 'digital disruption with glitchy electronic effects',
    'circuit_bent': 'circuit bent with broken electronic hardware sounds',
    'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
    'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
    'industrial_textures': 'industrial textures with harsh mechanical sounds',
    'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
}

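# Example lookup (illustrates how transform_with_melodyflow_api below resolves a style,
# falling back to a generic prompt for unknown keys):
#   VARIATION_PROMPTS.get('celtic', 'transform this audio to celtic style')
#   -> 'celtic harp and flute with traditional irish instruments'
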
def preprocess_audio(waveform):
    # Move to CPU, drop any singleton dimensions, then re-add a leading
    # batch/channel dimension and return the tensor on the target device.
    waveform_np = waveform.cpu().squeeze().numpy()
    return torch.from_numpy(waveform_np).unsqueeze(0).to(device)

# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========

@spaces.GPU  # assumed ZeroGPU decorator: the Space runs on Zero and imports `spaces`
def generate_drum_sample():
    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=10)
    wav = model.generate_unconditional(1).squeeze(0)

    filename_without_extension = 'jungle'
    filename_with_extension = f'{filename_without_extension}.wav'
    # audio_write appends the ".wav" suffix to the stem it is given
    audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
    return filename_with_extension

@spaces.GPU  # assumed ZeroGPU decorator (see note above)
def continue_drum_sample(existing_audio_path):
    if existing_audio_path is None:
        return None

    existing_audio, sr = torchaudio.load(existing_audio_path)
    existing_audio = existing_audio.to(device)

    prompt_duration = 2
    output_duration = 10
    num_samples = int(prompt_duration * sr)
    if existing_audio.shape[1] < num_samples:
        raise ValueError("The existing audio is too short for the specified prompt duration.")

    # Use the last `prompt_duration` seconds as the continuation prompt
    start_sample = existing_audio.shape[1] - num_samples
    prompt_waveform = existing_audio[..., start_sample:]

    model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
    model.set_generation_params(duration=output_duration)
    output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.to(device)

    if output.dim() == 3:
        output = output.squeeze(0)
    if output.dim() == 1:
        output = output.unsqueeze(0)

    combined_audio = torch.cat((existing_audio, output), dim=1)
    combined_audio = combined_audio.cpu()
    combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
    torchaudio.save(combined_file_path, combined_audio, sr)
    return combined_file_path

@spaces.GPU  # assumed ZeroGPU decorator (see note above)
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
    """Generate music using the BEGINNING of the audio as prompt."""
    if wav_filename is None:
        return None

    song, sr = torchaudio.load(wav_filename)
    song = song.to(device)

    # Dropdown entries look like "repo/name (size)"; the repo id is the first token
    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    # Use the first `prompt_duration` seconds as the prompt
    prompt_waveform = song[..., :int(prompt_duration * sr)]
    prompt_waveform = preprocess_audio(prompt_waveform)

    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.cpu()
    if len(output.size()) > 2:
        output = output.squeeze()

    filename_without_extension = 'continued_music'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
    return filename_with_extension

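# The two continuation paths differ only in which slice of the source audio becomes the
# MusicGen prompt (sketch, assuming a [channels, samples] tensor `song` at sample rate `sr`):
#   from the beginning (generate_music above):  song[..., :int(prompt_duration * sr)]
#   from the end (continue_music below):        song[..., song.shape[1] - int(prompt_duration * sr):]
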
@spaces.GPU  # assumed ZeroGPU decorator (see note above)
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
    """Continue music using the END of the audio as prompt - extends the audio."""
    if input_audio_path is None:
        return None

    song, sr = torchaudio.load(input_audio_path)
    song = song.to(device)

    model_name = musicgen_model.split(" ")[0]
    model_continue = MusicGen.get_pretrained(model_name)
    model_continue.set_generation_params(
        use_sampling=True,
        top_k=250,
        top_p=0.0,
        temperature=1.0,
        duration=output_duration,
        cfg_coef=3
    )

    # Load the original audio as an AudioSegment for easier manipulation
    original_audio = AudioSegment.from_wav(input_audio_path)
    file_paths_for_cleanup = []

    # Get the last `prompt_duration` seconds as the prompt
    num_samples = int(prompt_duration * sr)
    if song.shape[1] < num_samples:
        raise ValueError("The prompt_duration is longer than the current audio length.")

    # Extract the end portion for prompting
    start_sample = song.shape[1] - num_samples
    prompt_waveform = song[..., start_sample:]
    prompt_waveform = preprocess_audio(prompt_waveform)

    # Generate the continuation
    output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
    output = output.cpu()
    if len(output.size()) > 2:
        output = output.squeeze()

    # Save the generated audio WITHOUT aggressive loudness processing
    filename_without_extension = f'continue_extension_{random.randint(1000, 9999)}'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, output, model_continue.sample_rate,
                strategy="clip")  # just prevent clipping, no loudness changes

    # Handle the double .wav extension issue (audio_write appends ".wav" to the stem)
    correct_filename = f'{filename_without_extension}.wav.wav'
    if os.path.exists(correct_filename):
        generated_audio_segment = AudioSegment.from_wav(correct_filename)
        file_paths_for_cleanup.append(correct_filename)
    else:
        generated_audio_segment = AudioSegment.from_wav(filename_with_extension)
        file_paths_for_cleanup.append(filename_with_extension)

    # SMART VOLUME MATCHING: only match against the prompt portion
    # 1. Remove the prompt duration from the original (no overlap with the generated audio)
    prompt_duration_ms = int(prompt_duration * 1000)
    original_minus_prompt = original_audio[:-prompt_duration_ms]

    # 2. Extract JUST the prompt portion from the generated audio for RMS analysis
    generated_prompt_portion = generated_audio_segment[:prompt_duration_ms]

    # 3. Calculate RMS at the transition points
    original_rms = original_minus_prompt.rms
    prompt_portion_rms = generated_prompt_portion.rms

    print("🔍 Smart volume analysis:")
    print(f"  Original ending RMS: {original_rms}")
    print(f"  Generated prompt RMS: {prompt_portion_rms}")
    print(f"  Generated full RMS: {generated_audio_segment.rms}")

    # 4. Match the prompt portion to the original level
    if prompt_portion_rms > 0:
        from pydub.utils import ratio_to_db
        volume_adjustment = ratio_to_db(original_rms / prompt_portion_rms)
        print(f"  Applying {volume_adjustment:.1f} dB to the entire generated segment")
        # Apply to the entire segment (preserves the buildup)
        generated_matched = generated_audio_segment + volume_adjustment
    else:
        generated_matched = generated_audio_segment

    # 5. Combine seamlessly
    combined_audio = original_minus_prompt + generated_matched

    # Save the final result
    combined_audio_filename = f"extended_audio_{random.randint(1000, 9999)}.wav"
    combined_audio.export(combined_audio_filename, format="wav")

    # Clean up temporary files
    for file_path in file_paths_for_cleanup:
        if os.path.exists(file_path):
            os.remove(file_path)

    return combined_audio_filename

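# Worked example of the RMS matching above: pydub's ratio_to_db(x) returns 20 * log10(x), so if
# the original ending has RMS 1200 and the generated prompt portion has RMS 600, the adjustment
# is ratio_to_db(1200 / 600) ≈ +6.0 dB, and the whole generated segment is raised by about 6 dB
# before being appended. (Illustrative numbers only.)
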
# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========

def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", flowstep=0.12):
    """Transform audio using the facebook/MelodyFlow space API."""
    if audio_path is None:
        return None, "❌ No audio file provided"

    try:
        # Initialize the client for the Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")

        # Set steps based on the solver. Facebook's space automatically reduces steps for editing:
        # EULER divides the requested steps by 5, MIDPOINT divides them by 2.
        if solver == "midpoint":
            base_steps = 128
            effective_steps = base_steps // 2  # 64 effective steps
        else:  # euler
            base_steps = 125
            effective_steps = base_steps // 5  # 25 effective steps

        # Determine the prompt to use (computed after the step counts so the status message is accurate)
        if custom_prompt.strip():
            prompt_text = custom_prompt.strip()
            status_msg = f"✅ Transformed with custom prompt: '{prompt_text}' (flowstep: {flowstep}, {effective_steps} steps)"
        else:
            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
            status_msg = f"✅ Transformed with {variation} style (flowstep: {flowstep}, {effective_steps} steps)"

        # Call the MelodyFlow API with the base steps (the space will auto-reduce them)
        result = client.predict(
            model="facebook/melodyflow-t24-30secs",
            text=prompt_text,
            solver=solver,
            steps=base_steps,              # will be auto-reduced to effective_steps by the space
            target_flowstep=flowstep,      # this is the key parameter!
            regularize=(solver == "euler"),  # regularize for euler, not for midpoint
            regularization_strength=0.2,
            duration=30,                   # max duration
            melody=handle_file(audio_path),
            api_name="/predict"
        )

        # The result is a tuple of three audio files (variations); use the first one
        if result and len(result) > 0 and result[0]:
            # Save the result locally
            output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
            import shutil
            shutil.copy2(result[0], output_filename)
            return output_filename, status_msg
        else:
            return None, "❌ MelodyFlow API returned no results"

    except Exception as e:
        return None, f"❌ MelodyFlow API error: {str(e)}"

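# Minimal standalone sketch of the same Space call, for reference outside this app. It assumes
# the facebook/MelodyFlow Space exposes "/predict" with the parameter names used above:
#
#   from gradio_client import Client, handle_file
#   client = Client("facebook/MelodyFlow")
#   result = client.predict(
#       model="facebook/melodyflow-t24-30secs",
#       text="lo-fi chill with relaxed jazz hip-hop elements",
#       solver="euler",
#       steps=125,                      # auto-reduced to 25 effective steps for editing
#       target_flowstep=0.12,
#       regularize=True,
#       regularization_strength=0.2,
#       duration=30,
#       melody=handle_file("input.wav"),
#       api_name="/predict",
#   )
#   print(result[0])  # local path to the first returned variation
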
# ========== GRADIO INTERFACE ==========

# Create the interface
with gr.Blocks() as iface:
    gr.Markdown("# 🎰 The Mega Slot Machine")
    gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen → MelodyFlow (via API) → MusicGen Fine-tunes")
    gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")

    with gr.Accordion("How This Works", open=False):
        gr.Markdown("""
This demo shows how multiple AI models can work together:

1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)
2. **Transform** it using MelodyFlow (via Facebook's space API)
3. **Continue** with MusicGen fine-tunes (trained on specific genres)
4. **Repeat** the cycle to create infinite musical journeys!

MelodyFlow needs a different PyTorch version than the local models, so we call Facebook's MelodyFlow space through its API instead of loading it here.

**Performance Note**: for audio transformation, MelodyFlow automatically uses fewer steps than for generation:
- EULER solver: 25 effective steps (fast, good quality)
- MIDPOINT solver: 64 effective steps (slower, potentially higher quality)
""")

    # ========== STEP 1: GENERATE ==========
    gr.Markdown("## 🎵 Step 1: Generate Initial Audio")

    with gr.Row():
        with gr.Column():
            generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
            continue_drum_button = gr.Button("Continue Drums", size="sm")

    main_audio = gr.Audio(
        label="🎵 Current Audio (flows through pipeline)",
        type="filepath",
        interactive=True,
        show_download_button=True
    )

    # ========== STEP 2: TRANSFORM ==========
    gr.Markdown("## 🎛️ Step 2: Transform with MelodyFlow")

    with gr.Row():
        with gr.Column(scale=2):
            transform_variation = gr.Dropdown(
                label="Transform Style",
                choices=list(VARIATION_PROMPTS.keys()),
                value="synth_modern",
                interactive=True
            )
        with gr.Column(scale=3):
            transform_prompt = gr.Textbox(
                label="Custom Prompt (optional)",
                placeholder="Leave empty to use style above, or enter custom transformation prompt",
                lines=2
            )

    with gr.Row():
        transform_solver = gr.Dropdown(
            label="Solver",
            choices=["euler", "midpoint"],
            value="euler",
            info="EULER: faster (25 steps), MIDPOINT: slower but potentially higher quality (64 steps)"
        )
        transform_flowstep = gr.Slider(
            label="Transform Intensity (Flowstep)",
            minimum=0.0,
            maximum=0.15,
            step=0.01,
            value=0.12,
            info="Lower values = more dramatic transformation"
        )

    transform_button = gr.Button("🎛️ Transform Audio", variant="secondary", size="lg")
    transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)

    # ========== STEP 3: CONTINUE ==========
    gr.Markdown("## 🎼 Step 3: Continue with MusicGen")

    with gr.Row():
        with gr.Column():
            prompt_duration = gr.Dropdown(
                label="Prompt Duration (seconds)",
                choices=list(range(1, 11)),
                value=5
            )
            output_duration = gr.Slider(
                label="Output Duration (seconds)",
                minimum=10,
                maximum=30,
                step=1,
                value=20
            )
        with gr.Column():
            musicgen_model = gr.Dropdown(
                label="MusicGen Model",
                choices=[
                    "thepatch/vanya_ai_dnb_0.1 (small)",
                    "thepatch/budots_remix (small)",
                    "thepatch/PhonkV2 (small)",
                    "thepatch/bleeps-medium (medium)",
                    "thepatch/hoenn_lofi (large)",
                    "foureyednymph/musicgen-sza-sos-small (small)"
                ],
                value="thepatch/vanya_ai_dnb_0.1 (small)"
            )

    # Two different continuation options with clear explanations
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔄 Continue from Beginning")
            gr.Markdown("*Uses the **first** X seconds as prompt. Good for reimagining/reworking from a starting point.*")
            generate_music_button = gr.Button("🔄 Continue from Beginning", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### ➡️ Extend from End")
            gr.Markdown("*Uses the **last** X seconds as prompt. Extends your audio by adding new content to the end.*")
            continue_music_button = gr.Button("➡️ Extend from End", variant="secondary", size="lg")

    # ========== EVENT HANDLERS ==========

    # Step 1: Generate
    generate_button.click(generate_drum_sample, outputs=[main_audio])
    continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])

    # Step 2: Transform (using the Facebook MelodyFlow API)
    transform_button.click(
        transform_with_melodyflow_api,
        inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_flowstep],
        outputs=[main_audio, transform_status]
    )

    # Step 3: Continue (two different approaches)
    generate_music_button.click(
        generate_music,
        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
        outputs=[main_audio]
    )
    continue_music_button.click(
        continue_music,
        inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
        outputs=[main_audio]
    )

if __name__ == "__main__":
    iface.launch()