import gradio as gr
import spaces
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import os
import torch
import shutil
from gradio_client import Client, handle_file
import random
from pydub import AudioSegment
from pydub.utils import ratio_to_db
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MelodyFlow variation mapping - map your semantic variations to text prompts
VARIATION_PROMPTS = {
'accordion_folk': 'folk accordion melody with traditional folk instruments',
'banjo_bluegrass': 'bluegrass banjo with country folk instruments',
'piano_classical': 'classical piano with orchestral arrangement',
'celtic': 'celtic harp and flute with traditional irish instruments',
'strings_quartet': 'string quartet with violin, viola, cello arrangement',
'synth_retro': 'retro 80s synthesizer with vintage electronic sounds',
'synth_modern': 'modern synthesizer with contemporary electronic production',
'synth_edm': 'edm synthesizer with dance electronic beats',
'lofi_chill': 'lo-fi chill with relaxed jazz hip-hop elements',
'synth_bass': 'heavy bass synthesizer with sub-bass frequencies',
'rock_band': 'rock band with electric guitar, bass, and drums',
'cinematic_epic': 'cinematic epic orchestral with dramatic strings and brass',
'retro_rpg': 'retro rpg chiptune with 8-bit game music elements',
'chiptune': '8-bit chiptune with retro video game sounds',
'steel_drums': 'steel drums with caribbean tropical percussion',
'gamelan_fusion': 'gamelan fusion with indonesian percussion instruments',
'music_box': 'music box with delicate mechanical melody',
'trap_808': 'trap beats with heavy 808 drums and hi-hats',
'lo_fi_drums': 'lo-fi drums with vinyl crackle and jazz samples',
'boom_bap': 'boom bap hip-hop with classic drum breaks',
'percussion_ensemble': 'percussion ensemble with varied drum instruments',
'future_bass': 'future bass with melodic drops and vocal chops',
'synthwave_retro': 'synthwave retro with neon 80s aesthetic',
'melodic_techno': 'melodic techno with driving beats and emotional melodies',
'dubstep_wobble': 'dubstep with heavy wobble bass and electronic drops',
'glitch_hop': 'glitch hop with broken beats and digital artifacts',
'digital_disruption': 'digital disruption with glitchy electronic effects',
'circuit_bent': 'circuit bent with broken electronic hardware sounds',
'orchestral_glitch': 'orchestral glitch with classical instruments and digital errors',
'vapor_drums': 'vaporwave drums with slowed down nostalgic beats',
'industrial_textures': 'industrial textures with harsh mechanical sounds',
'jungle_breaks': 'jungle breaks with fast drum and bass rhythms'
}
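# The Transform Style dropdown in the UI below is populated directly from these keys.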
def preprocess_audio(waveform):
    """Round-trip the prompt waveform through NumPy and add a leading batch dim before moving it to the device."""
    waveform_np = waveform.cpu().squeeze().numpy()
    return torch.from_numpy(waveform_np).unsqueeze(0).to(device)
# ========== MUSICGEN FUNCTIONS (Local ZeroGPU) ==========
@spaces.GPU(duration=10)
def generate_drum_sample():
model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
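    # The checkpoint is loaded on every call; after the first download the weights come from the local Hugging Face cache.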
model.set_generation_params(duration=10)
wav = model.generate_unconditional(1).squeeze(0)
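    # generate_unconditional returns a [batch, channels, samples] tensor; squeeze(0) drops the batch dim for audio_write.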
    # audio_write appends the ".wav" suffix to the stem, so the file on disk matches the returned path
    filename_without_extension = 'jungle'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)
    return filename_with_extension
@spaces.GPU
def continue_drum_sample(existing_audio_path):
if existing_audio_path is None:
return None
existing_audio, sr = torchaudio.load(existing_audio_path)
existing_audio = existing_audio.to(device)
prompt_duration = 2
output_duration = 10
num_samples = int(prompt_duration * sr)
if existing_audio.shape[1] < num_samples:
raise ValueError("The existing audio is too short for the specified prompt duration.")
start_sample = existing_audio.shape[1] - num_samples
prompt_waveform = existing_audio[..., start_sample:]
model = MusicGen.get_pretrained('pharoAIsanders420/micro-musicgen-jungle')
model.set_generation_params(duration=output_duration)
output = model.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
output = output.to(device)
if output.dim() == 3:
output = output.squeeze(0)
if output.dim() == 1:
output = output.unsqueeze(0)
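    # Assumes the existing audio shares the model's sample rate (as it does when produced by the jungle model above),
    # so the prompt and the generated continuation can be concatenated and written out at `sr` directly.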
combined_audio = torch.cat((existing_audio, output), dim=1)
combined_audio = combined_audio.cpu()
combined_file_path = f'./continued_jungle_{random.randint(1000, 9999)}.wav'
torchaudio.save(combined_file_path, combined_audio, sr)
return combined_file_path
@spaces.GPU(duration=30)
def generate_music(wav_filename, prompt_duration, musicgen_model, output_duration):
"""Generate music using the BEGINNING of the audio as prompt"""
if wav_filename is None:
return None
song, sr = torchaudio.load(wav_filename)
song = song.to(device)
model_name = musicgen_model.split(" ")[0]
model_continue = MusicGen.get_pretrained(model_name)
model_continue.set_generation_params(
use_sampling=True,
top_k=250,
top_p=0.0,
temperature=1.0,
duration=output_duration,
cfg_coef=3
)
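    # top_p=0.0 disables nucleus sampling so top_k alone governs sampling; cfg_coef sets the classifier-free guidance strength.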
prompt_waveform = song[..., :int(prompt_duration * sr)]
prompt_waveform = preprocess_audio(prompt_waveform)
output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
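    # The returned audio includes the prompt at its start followed by the new continuation,
    # so this path effectively re-imagines the track from its opening seconds.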
output = output.cpu()
if len(output.size()) > 2:
output = output.squeeze()
    filename_without_extension = 'continued_music'
    filename_with_extension = f'{filename_without_extension}.wav'
    audio_write(filename_without_extension, output, model_continue.sample_rate, strategy="loudness", loudness_compressor=True)
    return filename_with_extension
@spaces.GPU(duration=30)
def continue_music(input_audio_path, prompt_duration, musicgen_model, output_duration):
"""Continue music using the END of the audio as prompt - extends the audio"""
if input_audio_path is None:
return None
song, sr = torchaudio.load(input_audio_path)
song = song.to(device)
model_name = musicgen_model.split(" ")[0]
model_continue = MusicGen.get_pretrained(model_name)
model_continue.set_generation_params(
use_sampling=True,
top_k=250,
top_p=0.0,
temperature=1.0,
duration=output_duration,
cfg_coef=3
)
# Load original audio as AudioSegment for easier manipulation
original_audio = AudioSegment.from_wav(input_audio_path)
file_paths_for_cleanup = []
# Get the last `prompt_duration` seconds as the prompt
num_samples = int(prompt_duration * sr)
if song.shape[1] < num_samples:
raise ValueError("The prompt_duration is longer than the current audio length.")
# Extract the end portion for prompting
start_sample = song.shape[1] - num_samples
prompt_waveform = song[..., start_sample:]
prompt_waveform = preprocess_audio(prompt_waveform)
# Generate continuation
output = model_continue.generate_continuation(prompt_waveform, prompt_sample_rate=sr, progress=True)
output = output.cpu()
if len(output.size()) > 2:
output = output.squeeze()
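    # The generated clip begins with MusicGen's re-rendering of the prompt, which is why the
    # prompt-length tail is trimmed from the original before the two are joined below.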
    # Save the generated audio WITHOUT aggressive loudness processing
    filename_without_extension = f'continue_extension_{random.randint(1000, 9999)}'
    # audio_write appends the ".wav" suffix to the stem and returns the path it actually wrote
    output_path = str(audio_write(filename_without_extension, output, model_continue.sample_rate,
                                  strategy="clip"))  # just prevent clipping, no loudness changes
    generated_audio_segment = AudioSegment.from_wav(output_path)
    file_paths_for_cleanup.append(output_path)
# SMART VOLUME MATCHING: Only match the prompt portion
# 1. Remove prompt duration from original (no overlap)
prompt_duration_ms = int(prompt_duration * 1000)
original_minus_prompt = original_audio[:-prompt_duration_ms]
# 2. Extract JUST the prompt portion from generated audio for RMS analysis
generated_prompt_portion = generated_audio_segment[:prompt_duration_ms]
    # 3. Calculate RMS on both sides of the splice: the original (minus prompt) and the prompt portion of the generated audio
    original_rms = original_minus_prompt.rms
    prompt_portion_rms = generated_prompt_portion.rms
    print("πŸ”Š Smart volume analysis:")
    print(f"   Original (minus prompt) RMS: {original_rms}")
    print(f"   Generated prompt RMS: {prompt_portion_rms}")
print(f" Generated full RMS: {generated_audio_segment.rms}")
# 4. Match the prompt portion to original level
    if prompt_portion_rms > 0:
        volume_adjustment = ratio_to_db(original_rms / prompt_portion_rms)
        print(f"   Applying {volume_adjustment:.1f} dB to the entire generated segment")
        # Apply to the entire segment (preserves the generated buildup)
        generated_matched = generated_audio_segment + volume_adjustment
else:
generated_matched = generated_audio_segment
# 5. Combine seamlessly
combined_audio = original_minus_prompt + generated_matched
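    # pydub syncs channels, sample width and frame rate when appending segments, so a rate mismatch
    # between the original file and the MusicGen output should not break this concatenation.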
# Save final result
combined_audio_filename = f"extended_audio_{random.randint(1000, 9999)}.wav"
combined_audio.export(combined_audio_filename, format="wav")
# Cleanup temporary files
for file_path in file_paths_for_cleanup:
if os.path.exists(file_path):
os.remove(file_path)
return combined_audio_filename
# ========== MELODYFLOW FUNCTIONS (Via Facebook Space) ==========
def transform_with_melodyflow_api(audio_path, variation, custom_prompt="", solver="euler", flowstep=0.12):
"""Transform audio using Facebook/MelodyFlow space API"""
if audio_path is None:
return None, "❌ No audio file provided"
    try:
        # Initialize client for the public Facebook MelodyFlow space
        client = Client("facebook/MelodyFlow")
        # Set steps based on solver and the fact that we're doing editing.
        # Facebook's space automatically reduces steps for editing:
        # EULER: divides by 5, MIDPOINT: divides by 2
        if solver == "midpoint":
            base_steps = 128
            effective_steps = base_steps // 2  # 64 effective steps
        else:  # euler
            base_steps = 125
            effective_steps = base_steps // 5  # 25 effective steps
        # Determine the prompt to use (status message is built after effective_steps is known)
        if custom_prompt.strip():
            prompt_text = custom_prompt.strip()
            status_msg = f"βœ… Transformed with custom prompt: '{prompt_text}' (flowstep: {flowstep}, {effective_steps} steps)"
        else:
            prompt_text = VARIATION_PROMPTS.get(variation, f"transform this audio to {variation} style")
            status_msg = f"βœ… Transformed with {variation} style (flowstep: {flowstep}, {effective_steps} steps)"
# Call the MelodyFlow API with the base steps (it will auto-reduce)
result = client.predict(
model="facebook/melodyflow-t24-30secs",
text=prompt_text,
solver=solver,
steps=base_steps, # Will be auto-reduced to effective_steps by the space
target_flowstep=flowstep, # This is the key parameter!
regularize=solver == "euler", # Regularize for euler, not for midpoint
regularization_strength=0.2,
duration=30, # Max duration
melody=handle_file(audio_path),
api_name="/predict"
)
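        # gradio_client downloads the Space's output files into a local temp directory and returns their paths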
# Result is a tuple of 3 audio files (variations)
# We'll use the first variation
if result and len(result) > 0 and result[0]:
# Save the result locally
output_filename = f"melodyflow_{variation}_{random.randint(1000, 9999)}.wav"
            # Copy the result file to our local filename
            shutil.copy2(result[0], output_filename)
return output_filename, status_msg
else:
return None, "❌ MelodyFlow API returned no results"
except Exception as e:
return None, f"❌ MelodyFlow API error: {str(e)}"
# ========== GRADIO INTERFACE ==========
# Create the interface
with gr.Blocks() as iface:
gr.Markdown("# 🎰 The Mega Slot Machine")
gr.Markdown("**Hybrid Multi-Model Pipeline**: MicroMusicGen β†’ MelodyFlow (via API) β†’ MusicGen Fine-tunes")
gr.Markdown("*Demonstrating the workflow from our Ableton device in a web interface!*")
with gr.Accordion("How This Works", open=False):
gr.Markdown("""
This demo shows how multiple AI models can work together:
1. **Generate** initial audio with MicroMusicGen (super fast jungle drums)
2. **Transform** it using MelodyFlow (via Facebook's space API)
3. **Continue** with MusicGen fine-tunes (trained on specific genres)
4. **Repeat** the cycle to create infinite musical journeys!
    The local models and MelodyFlow require different PyTorch versions, so MelodyFlow runs remotely via the Facebook MelodyFlow space API.
**Performance Note**: For audio transformation, MelodyFlow automatically uses fewer steps than generation:
- EULER solver: 25 effective steps (fast, good quality)
- MIDPOINT solver: 64 effective steps (slower, potentially higher quality)
""")
# ========== STEP 1: GENERATE ==========
gr.Markdown("## 🎡 Step 1: Generate Initial Audio")
with gr.Row():
with gr.Column():
generate_button = gr.Button("Generate Jungle Drums", variant="primary", size="lg")
continue_drum_button = gr.Button("Continue Drums", size="sm")
main_audio = gr.Audio(
label="🎡 Current Audio (flows through pipeline)",
type="filepath",
interactive=True,
show_download_button=True
)
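    # This single Audio component is both the output and the input of every stage, which is what chains the pipeline together.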
# ========== STEP 2: TRANSFORM ==========
gr.Markdown("## πŸŽ›οΈ Step 2: Transform with MelodyFlow")
with gr.Row():
with gr.Column(scale=2):
transform_variation = gr.Dropdown(
label="Transform Style",
choices=list(VARIATION_PROMPTS.keys()),
value="synth_modern",
interactive=True
)
with gr.Column(scale=3):
transform_prompt = gr.Textbox(
label="Custom Prompt (optional)",
placeholder="Leave empty to use style above, or enter custom transformation prompt",
lines=2
)
with gr.Row():
transform_solver = gr.Dropdown(
label="Solver",
choices=["euler", "midpoint"],
value="euler",
info="EULER: faster (25 steps), MIDPOINT: slower but potentially higher quality (64 steps)"
)
transform_flowstep = gr.Slider(
label="Transform Intensity (Flowstep)",
minimum=0.0,
maximum=0.15,
step=0.01,
value=0.12,
info="Lower values = more dramatic transformation"
)
transform_button = gr.Button("πŸŽ›οΈ Transform Audio", variant="secondary", size="lg")
transform_status = gr.Textbox(label="Transform Status", value="Ready to transform", interactive=False)
# ========== STEP 3: CONTINUE ==========
gr.Markdown("## 🎼 Step 3: Continue with MusicGen")
with gr.Row():
with gr.Column():
prompt_duration = gr.Dropdown(
label="Prompt Duration (seconds)",
choices=list(range(1, 11)),
value=5
)
output_duration = gr.Slider(
label="Output Duration (seconds)",
minimum=10,
maximum=30,
step=1,
value=20
)
with gr.Column():
musicgen_model = gr.Dropdown(
label="MusicGen Model",
choices=[
"thepatch/vanya_ai_dnb_0.1 (small)",
"thepatch/budots_remix (small)",
"thepatch/PhonkV2 (small)",
"thepatch/bleeps-medium (medium)",
"thepatch/hoenn_lofi (large)",
"foureyednymph/musicgen-sza-sos-small (small)"
],
value="thepatch/vanya_ai_dnb_0.1 (small)"
)
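            # The "(small)/(medium)/(large)" suffix is display-only; handlers use the text before the first space as the checkpoint id.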
# Two different continuation options with clear explanations
with gr.Row():
with gr.Column():
gr.Markdown("### πŸ”„ Continue from Beginning")
gr.Markdown("*Uses the **first** X seconds as prompt. Good for reimagining/reworking from a starting point.*")
generate_music_button = gr.Button("πŸ”„ Continue from Beginning", variant="primary", size="lg")
with gr.Column():
gr.Markdown("### ➑️ Extend from End")
gr.Markdown("*Uses the **last** X seconds as prompt. Extends your audio by adding new content to the end.*")
continue_music_button = gr.Button("➑️ Extend from End", variant="secondary", size="lg")
# ========== EVENT HANDLERS ==========
# Step 1: Generate
generate_button.click(generate_drum_sample, outputs=[main_audio])
continue_drum_button.click(continue_drum_sample, inputs=[main_audio], outputs=[main_audio])
# Step 2: Transform (using Facebook MelodyFlow API)
transform_button.click(
transform_with_melodyflow_api,
inputs=[main_audio, transform_variation, transform_prompt, transform_solver, transform_flowstep],
outputs=[main_audio, transform_status]
)
# Step 3: Continue (two different approaches)
generate_music_button.click(
generate_music,
inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
outputs=[main_audio]
)
continue_music_button.click(
continue_music,
inputs=[main_audio, prompt_duration, musicgen_model, output_duration],
outputs=[main_audio]
)
if __name__ == "__main__":
iface.launch()