Spaces:

flosstradamus
/

FluxMusicGUI

Running

App Files Files Community

flosstradamus committed on Sep 5, 2024

Commit

f8cd83e

verified ·

1 Parent(s): 772add9

Upload app.py

Browse files

Files changed (1) hide show

app.py +119 -11

app.py CHANGED Viewed

@@ -15,9 +15,6 @@ from utils import load_t5, load_clap
 from train import RF
 from constants import build_model
-# Disable flash attention if not available
-torch.backends.cuda.enable_flash_sdp(False)
 # Global variables to store loaded models and resources
 global_model = None
 global_t5 = None
@@ -31,8 +28,39 @@ MODELS_DIR = "/content/models"
 GENERATIONS_DIR = "/content/generations"
 def prepare(t5, clip, img, prompt):
-    # ... [The prepare function remains unchanged]
-    pass
 def unload_current_model():
     global global_model
@@ -87,12 +115,92 @@ def load_resources():
     print("Base resources loaded successfully!")
 def generate_music(prompt, seed, cfg_scale, steps, duration, progress=gr.Progress()):
-    # ... [The generate_music function remains largely unchanged]
-    # Update the output directory
-    output_dir = GENERATIONS_DIR
-    os.makedirs(output_dir, exist_ok=True)
-    # ... [Rest of the function remains the same]
-    pass
 # Load base resources at startup
 load_resources()

 from train import RF
 from constants import build_model
 # Global variables to store loaded models and resources
 global_model = None
 global_t5 = None
 GENERATIONS_DIR = "/content/generations"
 def prepare(t5, clip, img, prompt):  # Patchify `img` into a token sequence and build the conditioning dict (img_ids, txt, txt_ids, y) for the prompt(s).
+    bs, c, h, w = img.shape  # img must be 4-D; axes are treated as (batch, channels, height, width) below. `c` is not used afterwards.
+    if bs == 1 and not isinstance(prompt, str):
+        bs = len(prompt)  # a single image with a list of prompts: batch size follows the number of prompts
+    img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)  # fold each 2x2 spatial patch into the feature axis and flatten the patch grid into one sequence axis
+    if img.shape[0] == 1 and bs > 1:
+        img = repeat(img, "1 ... -> bs ...", bs=bs)  # broadcast the single patchified image across the batch
+    img_ids = torch.zeros(h // 2, w // 2, 3)  # one 3-component id per 2x2 patch; component 0 is left at zero
+    img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]  # component 1 = patch row index
+    img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]  # component 2 = patch column index
+    img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)  # flatten the grid to match img's sequence axis and copy it per batch item
+    if isinstance(prompt, str):
+        prompt = [prompt]  # both encoders below are always called with a list
+    # Generate text embeddings
+    txt = t5(prompt)  # presumably (batch, seq_len, dim): only shape[0] and shape[1] are read here — TODO confirm against load_t5
+    if txt.shape[0] == 1 and bs > 1:
+        txt = repeat(txt, "1 ... -> bs ...", bs=bs)
+    txt_ids = torch.zeros(bs, txt.shape[1], 3)  # all-zero ids, one 3-component entry per text position
+    vec = clip(prompt)  # second text encoder; generate_music passes its CLAP model in this slot
+    if vec.shape[0] == 1 and bs > 1:
+        vec = repeat(vec, "1 ... -> bs ...", bs=bs)
+    return img, {  # returns (patchified img, conditioning dict); every dict tensor is moved to img's device
+        "img_ids": img_ids.to(img.device),
+        "txt": txt.to(img.device),
+        "txt_ids": txt_ids.to(img.device),
+        "y": vec.to(img.device),
+    }
 def unload_current_model():
     global global_model
     print("Base resources loaded successfully!")
 def generate_music(prompt, seed, cfg_scale, steps, duration, progress=gr.Progress()):  # Generate audio for `prompt` segment by segment, write it as a WAV under GENERATIONS_DIR, and return (status message, output path).
+    global global_model, global_t5, global_clap, global_vae, global_vocoder, global_diffusion  # these globals are only read in this function, never reassigned
+    if global_model is None:
+        return "Please select a model first.", None  # no file is produced when no model is loaded
+    if seed == 0:
+        seed = random.randint(1, 1000000)  # seed 0 means "pick a random seed"; the chosen value is printed and returned in the status message
+    print(f"Using seed: {seed}")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    torch.manual_seed(seed)
+    torch.set_grad_enabled(False)  # gradient tracking is switched off and not restored by this function
+    # Calculate the number of segments needed for the desired duration
+    segment_duration = 10  # Each segment is 10 seconds
+    num_segments = int(np.ceil(duration / segment_duration))  # NOTE(review): duration <= 0 gives 0 segments, and np.concatenate below raises on an empty list — verify the UI enforces a positive duration
+    all_waveforms = []
+    for i in range(num_segments):
+        progress(i / num_segments, desc=f"Generating segment {i+1}/{num_segments}")
+        # Re-seed per segment with seed + i: each segment gets different noise, while a given seed still reproduces the whole run
+        torch.manual_seed(seed + i)  # Add i to slightly vary each segment while maintaining consistency
+        latent_size = (256, 16)
+        conds_txt = [prompt]
+        unconds_txt = ["low quality, gentle"]  # fixed negative prompt used to build null_cond for guidance
+        L = len(conds_txt)
+        init_noise = torch.randn(L, 8, latent_size[0], latent_size[1]).to(device)  # noise is sampled on CPU, then moved to `device`
+        img, conds = prepare(global_t5, global_clap, init_noise, conds_txt)
+        _, unconds = prepare(global_t5, global_clap, init_noise, unconds_txt)  # only the conditioning dict is kept; the patchified noise is identical to `img`
+        with torch.autocast(device_type='cuda'):  # NOTE(review): device_type is hard-coded to 'cuda' although `device` may be 'cpu' — confirm the intended CPU behaviour
+            images = global_diffusion.sample_with_xps(global_model, img, conds=conds, null_cond=unconds, sample_steps=steps, cfg=cfg_scale)
+        images = rearrange(  # inverse of the 2x2 patchify done in prepare()
+            images[-1],  # presumably the final sampling step of the returned sequence — TODO confirm against RF.sample_with_xps
+            "b (h w) (c ph pw) -> b c (h ph) (w pw)",
+            h=128,  # latent_size[0] // 2
+            w=8,  # latent_size[1] // 2
+            ph=2,
+            pw=2,)
+        latents = 1 / global_vae.config.scaling_factor * images  # divide by the VAE scaling factor before decoding
+        mel_spectrogram = global_vae.decode(latents).sample
+        x_i = mel_spectrogram[0]  # first batch item only (the batch size L is 1 here)
+        if x_i.dim() == 4:
+            x_i = x_i.squeeze(1)
+        waveform = global_vocoder(x_i)
+        waveform = waveform[0].cpu().float().detach().numpy()  # first vocoder output as a float32 NumPy array on the CPU
+        all_waveforms.append(waveform)
+    # Concatenate all waveforms
+    final_waveform = np.concatenate(all_waveforms)
+    # Trim to exact duration
+    sample_rate = 16000  # presumably the vocoder's output rate; it is also the rate written to the WAV header below — TODO confirm
+    final_waveform = final_waveform[:int(duration * sample_rate)]
+    progress(0.9, desc="Saving audio file")
+    # Create 'generations' folder
+    os.makedirs(GENERATIONS_DIR, exist_ok=True)
+    # Generate filename
+    prompt_part = re.sub(r'[^\w\s-]', '', prompt)[:10].strip().replace(' ', '_')  # drop everything except word chars, whitespace and '-', keep the first 10 chars, then turn spaces into '_'; NOTE(review): this can be empty for an all-punctuation prompt
+    model_name = os.path.splitext(os.path.basename(global_model.model_path))[0]  # assumes the loaded model exposes a `model_path` attribute — set outside this view
+    model_suffix = '_mf_b' if model_name == 'musicflow_b' else f'_{model_name}'  # 'musicflow_b' gets the short suffix '_mf_b'; any other model uses its file stem
+    base_filename = f"{prompt_part}_{seed}{model_suffix}"
+    output_path = os.path.join(GENERATIONS_DIR, f"{base_filename}.wav")
+    # Check if file exists and add numerical suffix if needed
+    counter = 1
+    while os.path.exists(output_path):
+        output_path = os.path.join(GENERATIONS_DIR, f"{base_filename}_{counter}.wav")  # tries _1, _2, ... until an unused path is found
+        counter += 1
+    wavfile.write(output_path, sample_rate, final_waveform)  # presumably scipy.io.wavfile — the import is outside this view
+    progress(1.0, desc="Audio generation complete")
+    return f"Generated with seed: {seed}", output_path
 # Load base resources at startup
 load_resources()