Nekochu committed on
Commit ff239f5 · 1 Parent(s): bc97006

major update: PyTorch inference, Gradio 6, session isolation, /understand captioning


- generate_audio(): full PyTorch inference pipeline (no ace-server needed)
- tiled_vae_decode(): memory-bounded VAE decoding
- Gradio 6 migration (gr.update -> component constructors; see the sketch after this list)
- Session isolation (random suffix on LoRA names)
- /understand captioning before training (falls back to librosa)
- Active tab detection pattern from rvc-beatrice
- Gradio API fix (tempfile for adapter download)
- dtype fix for mixed precision inference
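The Gradio 6 migration below follows one mechanical rule: event handlers stop yielding gr.update(...) patches and instead yield fresh component instances carrying only the changed properties. A minimal before/after sketch of that pattern (the handler and button names here are illustrative, not taken from app.py):

import gradio as gr

# Gradio 5.x style: property patches via gr.update()
def run_v5():
    yield gr.update(visible=False), gr.update(visible=True)

# Gradio 6.x style: yield component constructors instead
def run_v6():
    yield gr.Button(visible=False), gr.Button(visible=True)

with gr.Blocks() as demo:
    start_btn = gr.Button("Start")
    cancel_btn = gr.Button("Cancel", visible=False)
    start_btn.click(run_v6, outputs=[start_btn, cancel_btn])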

Files changed (3)
  1. Dockerfile +1 -1
  2. app.py +133 -20
  3. train_engine.py +287 -0
Dockerfile CHANGED
@@ -75,7 +75,7 @@ RUN curl -fL --retry 3 --retry-delay 5 -o /app/models/vae-BF16.gguf \
 
 # Install Python deps for Gradio UI + training
 RUN pip3 install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu \
-    "gradio[mcp]==5.29.0" requests torch safetensors \
+    "gradio[mcp]>=6.0.0,<7.0.0" requests torch safetensors \
     "transformers>=4.51.0,<4.58.0" peft>=0.18.0 \
     loguru "torchaudio==2.4.0" "diffusers==0.30.3" lightning numpy tensorboard soundfile \
     einops vector_quantize_pytorch librosa mutagen
app.py CHANGED
@@ -5,9 +5,12 @@ import sys
 import time
 import json
 import argparse
+import base64
 import tempfile
 import subprocess
 import shutil
+import string
+import random
 import requests
 import logging
 
@@ -97,6 +100,61 @@ def _fetch_result(job_id, timeout=60):
     return r
 
 
+def _caption_via_understand(audio_path, timeout=120):
+    """Call ace-server /understand to get a rich caption for an audio file.
+
+    Returns a dict with caption, bpm, key, signature, lyrics on success,
+    or None on failure (caller should fall back to librosa).
+    """
+    fname = os.path.basename(audio_path)
+    try:
+        with open(audio_path, "rb") as f:
+            audio_b64 = base64.b64encode(f.read()).decode("ascii")
+    except Exception as exc:
+        logger.warning("[Caption] %s: failed to read file: %s", fname, exc)
+        return None
+
+    # Submit
+    try:
+        r = requests.post(
+            f"{ACE_SERVER}/understand",
+            json={"audio": audio_b64},
+            timeout=30,
+        )
+        if r.status_code != 200:
+            logger.warning("[Caption] %s: /understand returned %d", fname, r.status_code)
+            return None
+        job_id = r.json().get("id")
+        if not job_id:
+            logger.warning("[Caption] %s: /understand returned no job id", fname)
+            return None
+    except Exception as exc:
+        logger.warning("[Caption] %s: /understand submit failed: %s", fname, exc)
+        return None
+
+    # Poll until done
+    status, _ = _poll_job(job_id, timeout=timeout)
+    if status != "done":
+        logger.warning("[Caption] %s: /understand job %s -> %s", fname, job_id, status)
+        return None
+
+    # Fetch result
+    try:
+        r = _fetch_result(job_id, timeout=30)
+        if r.status_code != 200:
+            logger.warning("[Caption] %s: /understand result fetch failed: %d", fname, r.status_code)
+            return None
+        data = r.json()
+        # The result should contain caption, bpm, key, signature, lyrics
+        if isinstance(data, dict) and data.get("caption"):
+            return data
+        logger.warning("[Caption] %s: /understand returned no caption field", fname)
+        return None
+    except Exception as exc:
+        logger.warning("[Caption] %s: /understand result parse failed: %s", fname, exc)
+        return None
+
+
 def _run_pipeline(caption, lyrics, bpm, duration, seed, steps, output_format,
                   adapter=None, lm_model=None, progress_cb=None):
     """Run full LM -> synth pipeline. Returns (audio_path, status_msg) or raises."""
@@ -460,17 +518,20 @@ def gradio_main():
         # -- Validation --
         if not audio_files:
             _log("[FAIL] No audio files uploaded.")
-            yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update()
+            yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
             return
 
         if len(audio_files) > MAX_AUDIO_FILES:
            _log(f"[FAIL] Too many files ({len(audio_files)}). Max: {MAX_AUDIO_FILES}")
-            yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update()
+            yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
            return
 
         lora_name = (lora_name or "").strip() or "my-lora"
         # Sanitize: alphanumeric, dash, underscore only
         lora_name = "".join(c if c.isalnum() or c in "-_" else "-" for c in lora_name)
+        # Append random suffix to prevent naming collisions between users
+        suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))
+        lora_name = f"{lora_name}-{suffix}"
 
         epochs = max(1, min(int(epochs), 10))
         lr = float(lr)
@@ -485,7 +546,7 @@ def gradio_main():
 
         # Copy uploaded audio files + check total duration
         _log(f"[INFO] Preparing {len(audio_files)} audio files...")
-        yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+        yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
         import librosa as _lr
         total_dur = 0.0
@@ -530,18 +591,49 @@ def gradio_main():
 
         _log(f"[INFO] LoRA: '{lora_name}' | Files: {len(audio_files)} | "
             f"Epochs: {epochs} | LR: {lr} | Rank: {rank}")
-        yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+        yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
+
+        # Caption each audio file via ace-server /understand BEFORE stopping it
+        if _server_ok():
+            _log("[INFO] Captioning audio via ace-server /understand...")
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
+            for audio_fname in sorted(os.listdir(audio_dir)):
+                full_path = os.path.join(audio_dir, audio_fname)
+                if not os.path.isfile(full_path) or audio_fname.endswith(".json"):
+                    continue
+                caption_json_path = full_path + ".json"
+                caption_data = _caption_via_understand(full_path, timeout=120)
+                if caption_data:
+                    _log(f"[Caption] {audio_fname}: using ace-server /understand")
+                    with open(caption_json_path, "w") as cj:
+                        json.dump(caption_data, cj)
+                else:
+                    # Fallback to librosa for basic metadata
+                    _log(f"[Caption] {audio_fname}: fallback to librosa")
+                    try:
+                        y_cap, sr_cap = _lr.load(full_path, sr=None, mono=True)
+                        tempo, _ = _lr.beat.beat_track(y=y_cap, sr=sr_cap)
+                        bpm_val = float(tempo) if hasattr(tempo, '__float__') else float(tempo[0])
+                        fallback = {"caption": "", "bpm": round(bpm_val), "key": "", "signature": "", "lyrics": ""}
+                        with open(caption_json_path, "w") as cj:
+                            json.dump(fallback, cj)
+                    except Exception as cap_exc:
+                        _log(f"[Caption] {audio_fname}: librosa fallback also failed: {cap_exc}")
+                yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
+        else:
+            _log("[INFO] ace-server not running, skipping /understand captioning")
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
         # Stop ace-server before training (frees memory)
         _log("[INFO] Stopping ace-server for training...")
-        yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+        yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
         _stop_ace_server()
         _gc.collect()
 
         try:
             # -- Phase 1: Preprocessing --
             _log("[Step 1/2] Preprocessing audio...")
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
             preprocessed_dir = os.path.join(work_dir, "preprocessed_tensors")
 
@@ -558,24 +650,24 @@ def gradio_main():
                 progress_callback=preprocess_progress,
                 cancel_check=lambda: False,
             )
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
             processed = result.get("processed", 0)
             failed = result.get("failed", 0)
            total = result.get("total", 0)
             _log(f"[OK] Preprocessed: {processed}/{total} (failed: {failed})")
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
             if processed == 0:
                 _log("[FAIL] No files preprocessed successfully. Cannot train.")
-                yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update()
+                yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
                 return
 
             _gc.collect()
 
             # -- Phase 2: Training --
             _log("[Step 2/2] Training LoRA...")
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
             for msg in train_lora_generator(
                 dataset_dir=preprocessed_dir,
@@ -605,24 +697,24 @@ def gradio_main():
                     break
 
                 _log(msg)
-                yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+                yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
                 if msg.strip() == "[DONE]":
                     break
 
             _log(f"[INFO] Total time: {time.time() - train_start:.0f}s")
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
         except Exception as exc:
             _log(f"[FAIL] Training error: {exc}")
             import traceback
             _log(traceback.format_exc())
-            yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update()
+            yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
 
         finally:
             # Always restart ace-server
             _log("[INFO] Restarting ace-server...")
-            yield _log_text(), gr.update(visible=False), gr.update(visible=True), gr.update()
+            yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
             _gc.collect()
             ok = _start_ace_server()
             if ok:
@@ -631,9 +723,20 @@ def gradio_main():
                 _log("[WARN] ace-server may not have restarted -- check logs")
             adapter_safetensors = os.path.join(adapter_out, "adapter_model.safetensors")
             if os.path.isfile(adapter_safetensors):
-                yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update(value=adapter_safetensors, visible=True)
+                # Copy to a temp file so Gradio doesn't try to validate /app paths
+                # (avoids InvalidPathError: "Cannot move /app to the gradio cache dir
+                # because it was not uploaded by a user")
+                tmp_out = tempfile.NamedTemporaryFile(
+                    suffix=".safetensors",
+                    prefix=f"{lora_name}_",
+                    delete=False,
+                )
+                tmp_out.close()
+                shutil.copy2(adapter_safetensors, tmp_out.name)
+                _log(f"[OK] LoRA saved: {lora_name}")
+                yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File(value=tmp_out.name, visible=True)
             else:
-                yield _log_text(), gr.update(visible=True), gr.update(visible=False), gr.update()
+                yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
 
     # -- Cancel handler --
     def _on_cancel():
@@ -657,7 +760,7 @@ def gradio_main():
     .status-box textarea { font-family: monospace; font-size: 13px; }
     """
 
-    with gr.Blocks(title="ACE-Step 1.5 XL (CPU)", css=CSS) as demo:
+    with gr.Blocks(title="ACE-Step 1.5 XL (CPU)") as demo:
 
        with gr.Tabs():
            # ============================================================
@@ -777,6 +880,14 @@ def gradio_main():
                     elem_classes="status-box",
                 )
 
+                # Button swap on click (separate handler, like rvc-beatrice)
+                # This fires immediately so user sees Cancel even if training
+                # queues behind concurrency_limit=1
+                train_btn.click(
+                    lambda: (gr.Button(visible=False), gr.Button(visible=True)),
+                    outputs=[train_btn, cancel_btn],
+                )
+
                # Training generator -- yields (log, train_btn, cancel_btn, output_file)
                train_event = train_btn.click(
                    train_lora_ui,
@@ -787,11 +898,12 @@ def gradio_main():
                 )
 
                 # After training completes, restore buttons and refresh LoRA dropdown
+                # This ensures cleanup even if the user navigated away
                 def _post_training():
                     return (
-                        gr.update(visible=True),
-                        gr.update(visible=False),
-                        gr.update(choices=_list_lora_choices()),
+                        gr.Button(visible=True),
+                        gr.Button(visible=False),
+                        gr.Dropdown(choices=_list_lora_choices()),
                     )
 
                 train_event.then(
@@ -816,6 +928,7 @@ def gradio_main():
         server_name="0.0.0.0",
         server_port=7860,
         mcp_server=True,
+        css=CSS,
     )
 
 
train_engine.py CHANGED
@@ -15,6 +15,8 @@ Exports:
     train_lora_generator() - Generator-based LoRA training loop
     cancel_training() - Set the cancel flag
     get_trained_loras() - List saved adapters
+    generate_audio() - Standalone inference (text -> WAV, optional LoRA)
+    tiled_vae_decode() - Tiled VAE latent-to-waveform decode
 """
 
 from __future__ import annotations
@@ -2799,3 +2801,288 @@ def get_trained_loras(adapter_dir: str) -> List[str]:
             break
 
     return sorted(set(result))
+
+
+# ============================================================================
+# TILED VAE DECODE (mirror of tiled_vae_encode)
+# ============================================================================
+
+def tiled_vae_decode(
+    vae, latents: torch.Tensor, dtype: torch.dtype,
+    chunk_frames: int = 1024, overlap_frames: int = 64,
+) -> torch.Tensor:
+    """Decode latents [B, T, C] -> waveform [B, 2, samples] using tiled VAE.
+
+    Mirrors tiled_vae_encode but in the reverse direction. Tiles along
+    the time axis of the latent to keep peak memory bounded.
+
+    Args:
+        vae: AutoencoderOobleck decoder.
+        latents: Latent tensor in [B, T, C] layout (C=64).
+        dtype: Target dtype for the output waveform.
+        chunk_frames: Number of latent frames per tile.
+        overlap_frames: Overlap frames per side for crossfade.
+
+    Returns:
+        Waveform tensor [B, 2, total_samples] in *dtype*.
+    """
+    vae_device = next(vae.parameters()).device
+    vae_dtype = vae.dtype
+
+    # Transpose to VAE convention [B, C, T]
+    lat = latents.transpose(1, 2).contiguous()
+    B, C, T = lat.shape
+
+    if T <= chunk_frames:
+        with torch.inference_mode():
+            audio = vae.decode(lat.to(vae_device, dtype=vae_dtype)).sample
+        return audio.to(dtype=dtype, device="cpu")
+
+    # Upsample factor: unknown until first decode, so we discover it.
+    stride = chunk_frames - 2 * overlap_frames
+    if stride <= 0:
+        raise ValueError(f"chunk_frames ({chunk_frames}) must be > 2*overlap ({overlap_frames})")
+
+    num_tiles = math.ceil(T / stride)
+    us_factor: Optional[float] = None
+    write_pos = 0
+    final: Optional[torch.Tensor] = None
+
+    for i in range(num_tiles):
+        core_start = i * stride
+        core_end = min(core_start + stride, T)
+        win_start = max(0, core_start - overlap_frames)
+        win_end = min(T, core_end + overlap_frames)
+
+        chunk = lat[:, :, win_start:win_end].to(vae_device, dtype=vae_dtype)
+        with torch.inference_mode():
+            decoded = vae.decode(chunk).sample  # [B, 2, samples_chunk]
+
+        if us_factor is None:
+            us_factor = decoded.shape[-1] / chunk.shape[-1]
+            total_samples = int(round(T * us_factor))
+            final = torch.zeros(B, decoded.shape[1], total_samples, dtype=decoded.dtype, device="cpu")
+
+        trim_start = int(round((core_start - win_start) * us_factor))
+        trim_end = int(round((win_end - core_end) * us_factor))
+        end_idx = decoded.shape[-1] - trim_end if trim_end > 0 else decoded.shape[-1]
+        core = decoded[:, :, trim_start:end_idx]
+        core_len = core.shape[-1]
+        final[:, :, write_pos:write_pos + core_len] = core.cpu()
+        write_pos += core_len
+        del chunk, decoded, core
+
+    final = final[:, :, :write_pos]
+    return final.to(dtype=dtype)
+
+
+# ============================================================================
+# INFERENCE -- generate_audio()
+# ============================================================================
+
+def generate_audio(
+    caption: str,
+    checkpoint_dir: str,
+    output_path: str,
+    lyrics: str = "[Instrumental]",
+    duration: float = 10.0,
+    bpm: int = 120,
+    steps: int = 8,
+    seed: int = -1,
+    variant: str = "turbo",
+    device: str = "auto",
+    adapter_path: Optional[str] = None,
+    adapter_scale: float = 1.0,
+) -> str:
+    """Generate audio using the ACE-Step DiT pipeline (pure PyTorch, no server).
+
+    Pipeline:
+      1. Text encoder -> text_hidden_states, lyric embeddings
+      2. Load full model (DiT + condition encoder + FSQ)
+      3. Optional: inject LoRA adapter via PEFT
+      4. model.generate_audio() -- runs condition encoder, FSQ detokenizer,
+         and the flow-matching diffusion loop internally
+      5. VAE decode latents -> waveform
+      6. Save waveform as 48 kHz stereo WAV
+      7. Unload all models, free memory
+
+    Args:
+        caption: Text description of the desired music.
+        checkpoint_dir: Root directory that contains model sub-dirs
+            (e.g. ``acestep-v15-turbo/``, ``vae/``, ``Qwen3-Embedding-0.6B/``).
+        output_path: Where to write the output WAV file.
+        lyrics: Lyrics text or ``"[Instrumental]"`` for no vocals.
+        duration: Desired audio length in seconds.
+        bpm: Beats per minute (metadata hint for the model).
+        steps: Number of diffusion steps (8 for turbo, 50 for base/SFT).
+        seed: RNG seed (-1 = random).
+        variant: Model variant name (e.g. ``"turbo"``, ``"base"``).
+        device: ``"auto"``, ``"cpu"``, ``"cuda:0"``, etc.
+        adapter_path: Path to a PEFT LoRA adapter directory (optional).
+        adapter_scale: Scaling factor applied to the adapter.
+
+    Returns:
+        The *output_path* string (for convenience).
+    """
+    import numpy as np
+
+    # ------------------------------------------------------------------
+    # 0. Device / dtype
+    # ------------------------------------------------------------------
+    device = detect_device(device)
+    dtype = select_dtype(device)
+    logger.info(
+        "generate_audio: device=%s, dtype=%s, variant=%s, steps=%d, duration=%.1fs",
+        device, dtype, variant, steps, duration,
+    )
+
+    # Resolve seed
+    if seed < 0:
+        seed = random.randint(0, 2**31 - 1)
+    logger.info("Using seed=%d", seed)
+
+    # ------------------------------------------------------------------
+    # 1. Text encoder -- encode caption and lyrics
+    # ------------------------------------------------------------------
+    logger.info("Loading text encoder...")
+    tokenizer, text_encoder = load_text_encoder(checkpoint_dir, device)
+
+    text_hs, text_mask = encode_text(text_encoder, tokenizer, caption, device, dtype)
+    lyric_hs, lyric_mask = encode_lyrics(text_encoder, tokenizer, lyrics, device, dtype)
+
+    # Free text encoder -- no longer needed
+    unload_models(text_encoder)
+    del text_encoder, tokenizer
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("Text encoder unloaded.")
+
+    # ------------------------------------------------------------------
+    # 2. Load full model (DiT + CondEncoder + FSQ tokenizer/detokenizer)
+    # ------------------------------------------------------------------
+    logger.info("Loading ACE-Step model (%s)...", variant)
+    model = load_model_for_training(checkpoint_dir, variant=variant, device=device)
+    model = model.to(dtype=dtype)
+    model.eval()
+
+    # ------------------------------------------------------------------
+    # 3. Optional: inject LoRA adapter
+    # ------------------------------------------------------------------
+    if adapter_path:
+        logger.info("Loading LoRA adapter from %s (scale=%.2f)...", adapter_path, adapter_scale)
+        from peft import PeftModel
+
+        decoder = model.decoder if hasattr(model, "decoder") else model
+        # Unwrap any wrappers
+        while hasattr(decoder, "_forward_module"):
+            decoder = decoder._forward_module
+        if hasattr(decoder, "base_model"):
+            bm = decoder.base_model
+            decoder = bm.model if hasattr(bm, "model") else bm
+        if hasattr(decoder, "model") and isinstance(decoder.model, nn.Module):
+            decoder = decoder.model
+
+        model.decoder = PeftModel.from_pretrained(
+            decoder, adapter_path, is_trainable=False,
+        )
+        # Apply adapter scale if not 1.0
+        if abs(adapter_scale - 1.0) > 1e-6:
+            for name, module in model.decoder.named_modules():
+                if hasattr(module, "scaling"):
+                    for key in module.scaling:
+                        module.scaling[key] = adapter_scale
+        model.decoder.eval()
+        logger.info("LoRA adapter applied.")
+
+    # ------------------------------------------------------------------
+    # 4. Prepare inputs for model.generate_audio()
+    # ------------------------------------------------------------------
+    # Latent frame rate is 25 Hz
+    LATENT_HZ = 25
+    latent_length = int(duration * LATENT_HZ)
+
+    # Load silence latent for context building
+    silence_latent = load_silence_latent(checkpoint_dir, device, variant)
+    # Ensure silence latent covers the required length
+    if silence_latent.shape[1] < latent_length:
+        repeats = math.ceil(latent_length / silence_latent.shape[1])
+        silence_latent = silence_latent.repeat(1, repeats, 1)
+    silence_latent = silence_latent[:, :latent_length, :].to(device=device, dtype=dtype)
+
+    # Build source latents and masks for text2music mode (all silence, all-ones mask)
+    src_latents = silence_latent[:1, :latent_length, :]
+    chunk_masks = torch.ones(1, latent_length, 64, device=device, dtype=dtype)
+    is_covers = torch.zeros(1, device=device, dtype=torch.long)
+
+    # Dummy timbre reference (single silence frame -> no timbre conditioning)
+    refer_audio = torch.zeros(1, 1, 64, device=device, dtype=dtype)
+    refer_order = torch.zeros(1, device=device, dtype=torch.long)
+
+    # Shift schedule: turbo uses 3.0, base/sft uses 1.0
+    shift = 3.0 if "turbo" in variant else 1.0
+
+    # ------------------------------------------------------------------
+    # 5. Run diffusion (model.generate_audio handles everything internally)
+    # ------------------------------------------------------------------
+    logger.info("Running diffusion (%d steps, shift=%.1f)...", steps, shift)
+    with torch.no_grad():
+        result = model.generate_audio(
+            text_hidden_states=text_hs.to(device=device, dtype=dtype),
+            text_attention_mask=text_mask.to(device=device, dtype=dtype),
+            lyric_hidden_states=lyric_hs.to(device=device, dtype=dtype),
+            lyric_attention_mask=lyric_mask.to(device=device, dtype=dtype),
+            refer_audio_acoustic_hidden_states_packed=refer_audio,
+            refer_audio_order_mask=refer_order,
+            src_latents=src_latents,
+            chunk_masks=chunk_masks,
+            is_covers=is_covers,
+            silence_latent=silence_latent,
+            seed=seed,
+            fix_nfe=steps,
+            shift=shift,
+        )
+
+    target_latents = result["target_latents"]  # [1, T, 64]
+    time_costs = result.get("time_costs", {})
+    logger.info("Diffusion done. Time costs: %s", time_costs)
+
+    # Free model weights -- keep latents on CPU
+    target_latents = target_latents.cpu().to(dtype)
+    unload_models(model)
+    del model, silence_latent, src_latents, chunk_masks
+    del text_hs, text_mask, lyric_hs, lyric_mask
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("DiT model unloaded.")
+
+    # ------------------------------------------------------------------
+    # 6. VAE decode latents -> waveform
+    # ------------------------------------------------------------------
+    logger.info("Loading VAE decoder...")
+    vae = load_vae(checkpoint_dir, device)
+
+    logger.info("Decoding latents -> waveform (tiled)...")
+    waveform = tiled_vae_decode(vae, target_latents.to(device), dtype)  # [1, 2, samples]
+
+    unload_models(vae)
+    del vae, target_latents
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("VAE unloaded.")
+
+    # ------------------------------------------------------------------
+    # 7. Save as WAV (48 kHz stereo)
+    # ------------------------------------------------------------------
+    audio_np = waveform[0].float().clamp(-1.0, 1.0).cpu().numpy()  # [2, samples]
+
+    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+    try:
+        import soundfile as sf
+        # soundfile expects [samples, channels]
+        sf.write(output_path, audio_np.T, TARGET_SR, subtype="PCM_16")
+    except ImportError:
+        import torchaudio
+        torchaudio.save(output_path, torch.from_numpy(audio_np), TARGET_SR)
+
+    logger.info("Audio saved to %s (%.1fs @ %d Hz)", output_path, duration, TARGET_SR)
+    return output_path
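For reference, the new train_engine.generate_audio() entry point can be driven from a short standalone script; a minimal usage sketch following the signature above (the checkpoint root, adapter directory, and output path are placeholder values, not paths shipped with this repo):

from train_engine import generate_audio

# Placeholder paths -- point these at your own checkpoint root and LoRA adapter.
wav_path = generate_audio(
    caption="lo-fi hip hop, mellow piano, vinyl crackle",
    checkpoint_dir="/path/to/checkpoints",   # root containing the model sub-dirs named in the docstring
    output_path="/tmp/ace_step_demo.wav",
    lyrics="[Instrumental]",
    duration=20.0,
    steps=8,                                 # 8 for the turbo variant, 50 for base/SFT
    seed=42,
    variant="turbo",
    device="cpu",
    adapter_path="/path/to/my-lora",         # optional PEFT adapter directory
    adapter_scale=1.0,
)
print("wrote", wav_path)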