fix inference: add LM generation step, detokenize codes before DiT, full pipeline working

train_engine.py (+302 -21)
@@ -2899,18 +2899,25 @@ def generate_audio(
     device: str = "auto",
     adapter_path: Optional[str] = None,
     adapter_scale: float = 1.0,
+    use_lm: bool = True,
+    lm_temperature: float = 0.85,
+    lm_top_p: float = 0.9,
+    lm_top_k: int = 0,
 ) -> str:
-    """Generate audio using the ACE-Step
+    """Generate audio using the full ACE-Step pipeline (LM + DiT).
 
     Pipeline:
-    1.
-
-
-
-
-    5.
-
-
+    1. LM (Qwen3 1.7B) generates CoT metadata + audio codes from
+       caption and lyrics
+    2. Text encoder -> text_hidden_states, lyric embeddings
+    3. Load full model (DiT + condition encoder + FSQ)
+    4. Optional: inject LoRA adapter via PEFT
+    5. model.generate_audio() -- uses LM audio codes as context
+       conditioning via the FSQ detokenizer, then runs flow-matching
+       diffusion
+    6. VAE decode latents -> waveform
+    7. Save waveform as 48 kHz stereo WAV
+    8. Unload all models, free memory
 
     Args:
         caption: Text description of the desired music.

@@ -2926,6 +2933,11 @@ def generate_audio(
         device: ``"auto"``, ``"cpu"``, ``"cuda:0"``, etc.
         adapter_path: Path to a PEFT LoRA adapter directory (optional).
         adapter_scale: Scaling factor applied to the adapter.
+        use_lm: Run the LM to generate audio codes (True) or skip
+            and use silence context like before (False).
+        lm_temperature: LM sampling temperature.
+        lm_top_p: LM nucleus sampling cutoff.
+        lm_top_k: LM top-K sampling (0 = disabled).
 
     Returns:
         The *output_path* string (for convenience).

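For reference, a minimal call exercising the new keyword arguments might look like the sketch below. This is illustrative only, not part of the patch; the values are examples, and the pre-existing required arguments (checkpoint_dir, output_path, variant, steps, and so on) are elided.

    # Illustrative invocation of the patched entry point (values are examples only)
    generate_audio(
        caption="dreamy lo-fi hip hop with warm Rhodes chords",
        lyrics="[Instrumental]",
        duration=60.0,        # advisory: overridden by the LM's code count
        use_lm=True,          # False restores the old silence-context path
        lm_temperature=0.85,
        lm_top_p=0.9,
        lm_top_k=0,           # 0 disables top-k
    )
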
@@ -2938,8 +2950,8 @@ def generate_audio(
     device = detect_device(device)
     dtype = select_dtype(device)
     logger.info(
-        "generate_audio: device=%s, dtype=%s, variant=%s, steps=%d, duration=%.1fs",
-        device, dtype, variant, steps, duration,
+        "generate_audio: device=%s, dtype=%s, variant=%s, steps=%d, duration=%.1fs, use_lm=%s",
+        device, dtype, variant, steps, duration, use_lm,
     )
 
     # Resolve seed

@@ -2948,7 +2960,34 @@ def generate_audio(
     logger.info("Using seed=%d", seed)
 
     # ------------------------------------------------------------------
-    # 1.
+    # 1. LM generation -- produce audio codes from caption + lyrics
+    # ------------------------------------------------------------------
+    audio_codes_list: Optional[List[int]] = None
+    if use_lm:
+        logger.info("Running LM to generate audio codes...")
+        audio_codes_list = _generate_codes_with_lm(
+            checkpoint_dir=checkpoint_dir,
+            caption=caption,
+            lyrics=lyrics,
+            duration=duration,
+            device=device,
+            temperature=lm_temperature,
+            top_p=lm_top_p,
+            top_k=lm_top_k,
+        )
+        if audio_codes_list:
+            # The LM determines the actual duration via its code count
+            lm_duration = len(audio_codes_list) / 5.0
+            logger.info(
+                "LM generated %d codes (%.1fs). Overriding duration %.1f -> %.1f",
+                len(audio_codes_list), lm_duration, duration, lm_duration,
+            )
+            duration = lm_duration
+        else:
+            logger.warning("LM produced no codes, falling back to silence context.")
+
+    # ------------------------------------------------------------------
+    # 2. Text encoder -- encode caption and lyrics
     # ------------------------------------------------------------------
     logger.info("Loading text encoder...")
     tokenizer, text_encoder = load_text_encoder(checkpoint_dir, device)

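The duration override above is pure arithmetic on the LM's output: codes are emitted at 5 Hz (one code per 200 ms), while DiT latents run at 25 Hz. A sketch of the bookkeeping, with an assumed code count:

    CODE_HZ = 5.0                                # LM audio-code rate
    LATENT_HZ = 25                               # DiT latent frame rate (step 5 below)
    n_codes = 900                                # assumed LM output length
    duration_s = n_codes / CODE_HZ               # 180.0 s
    latent_frames = int(duration_s * LATENT_HZ)  # 4500 latent frames
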
@@ -2964,7 +3003,7 @@ def generate_audio(
     logger.info("Text encoder unloaded.")
 
     # ------------------------------------------------------------------
-    #
+    # 3. Load full model (DiT + CondEncoder + FSQ tokenizer/detokenizer)
     # ------------------------------------------------------------------
     logger.info("Loading ACE-Step model (%s)...", variant)
     model = load_model_for_training(checkpoint_dir, variant=variant, device=device)

@@ -2972,7 +3011,7 @@ def generate_audio(
     model.eval()
 
     # ------------------------------------------------------------------
-    #
+    # 4. Optional: inject LoRA adapter
     # ------------------------------------------------------------------
     if adapter_path:
         logger.info("Loading LoRA adapter from %s (scale=%.2f)...", adapter_path, adapter_scale)

@@ -3001,7 +3040,7 @@ def generate_audio(
     logger.info("LoRA adapter applied.")
 
     # ------------------------------------------------------------------
-    #
+    # 5. Prepare inputs for model.generate_audio()
     # ------------------------------------------------------------------
     # Latent frame rate is 25 Hz
     LATENT_HZ = 25

@@ -3015,10 +3054,31 @@ def generate_audio(
     silence_latent = silence_latent.repeat(1, repeats, 1)
     silence_latent = silence_latent[:, :latent_length, :].to(device=device, dtype=dtype)
 
-    # Build source latents and masks
+    # Build source latents and masks
     src_latents = silence_latent[:1, :latent_length, :]
     chunk_masks = torch.ones(1, latent_length, 64, device=device, dtype=dtype)
-
+
+    # Detokenize LM audio codes into context latents for the DiT
+    if audio_codes_list:
+        indices_tensor = torch.tensor(
+            audio_codes_list, dtype=torch.long, device=device,
+        ).unsqueeze(0).unsqueeze(-1)  # [1, T_5Hz, 1]
+        with torch.no_grad():
+            lm_latents = model.tokenizer.quantizer.get_output_from_indices(indices_tensor)
+        # lm_latents: [1, T_5Hz, codebook_dim] -> detokenize to [1, T_25Hz, 64]
+        lm_latents = model.detokenize(lm_latents)
+        T_lm = lm_latents.shape[1]
+        # Use LM latents as src_latents context
+        if T_lm < latent_length:
+            pad = silence_latent[:, :latent_length - T_lm, :]
+            src_latents = torch.cat([lm_latents, pad], dim=1)
+        else:
+            src_latents = lm_latents[:, :latent_length, :]
+        chunk_masks = torch.ones(1, latent_length, 64, device=device, dtype=dtype)
+        is_covers = torch.ones(1, device=device, dtype=torch.long)
+        logger.info("LM codes detokenized: %d codes -> %d latent frames, used as DiT context", len(audio_codes_list), T_lm)
+    else:
+        is_covers = torch.zeros(1, device=device, dtype=torch.long)
 
     # Dummy timbre reference (single silence frame -> no timbre conditioning)
     refer_audio = torch.zeros(1, 1, 64, device=device, dtype=dtype)

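The detokenize branch is easiest to follow as a shape walk-through. Below is a self-contained sketch with dummy tensors; the 5x upsampling factor is an assumption implied by the T_5Hz -> T_25Hz comment (5 Hz codes to 25 Hz latents), and torch.randn/zeros stand in for the real quantizer and silence latents.

    import torch

    latent_length = 750                      # e.g. 30 s * 25 Hz
    codes = torch.randint(0, 64000, (140,))  # 28 s of LM codes at 5 Hz
    idx = codes.view(1, -1, 1)               # [1, 140, 1], like unsqueeze(0).unsqueeze(-1)
    lm_latents = torch.randn(1, idx.shape[1] * 5, 64)  # stand-in for detokenize(): [1, 700, 64]
    T_lm = lm_latents.shape[1]
    if T_lm < latent_length:                 # pad the tail with silence latents
        pad = torch.zeros(1, latent_length - T_lm, 64)
        src_latents = torch.cat([lm_latents, pad], dim=1)
    else:                                    # or truncate to the requested length
        src_latents = lm_latents[:, :latent_length, :]
    assert src_latents.shape == (1, latent_length, 64)
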
@@ -3028,7 +3088,7 @@ def generate_audio(
     shift = 3.0 if "turbo" in variant else 1.0
 
     # ------------------------------------------------------------------
-    #
+    # 6. Run diffusion (model.generate_audio handles everything internally)
     # ------------------------------------------------------------------
     logger.info("Running diffusion (%d steps, shift=%.1f)...", steps, shift)
     with torch.no_grad():

@@ -3062,7 +3122,7 @@ def generate_audio(
     logger.info("DiT model unloaded.")
 
     # ------------------------------------------------------------------
-    #
+    # 7. VAE decode latents -> waveform
     # ------------------------------------------------------------------
     logger.info("Loading VAE decoder...")
     vae = load_vae(checkpoint_dir, device)

@@ -3077,7 +3137,7 @@ def generate_audio(
     logger.info("VAE unloaded.")
 
     # ------------------------------------------------------------------
-    #
+    # 8. Save as WAV (48 kHz stereo)
     # ------------------------------------------------------------------
     audio_np = waveform[0].float().clamp(-1.0, 1.0).cpu().numpy()  # [2, samples]
 

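The actual WAV write falls outside the shown hunks. For reference, one way to persist a [2, samples] float array as 48 kHz stereo is sketched below; the choice of soundfile is an assumption, not necessarily what train_engine.py uses.

    import soundfile as sf

    # audio_np has shape [2, samples]; soundfile expects [samples, channels]
    sf.write(output_path, audio_np.T, 48000, subtype="PCM_16")
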
@@ -3105,7 +3165,10 @@ _TOKEN_THINK = 151667
 _TOKEN_THINK_END = 151668
 _AUDIO_CODE_BASE = 151669
 
-#
+# LM system instructions (matches acestep.cpp task-types.h)
+_LM_GENERATE_INSTRUCTION = (
+    "Generate audio semantic tokens based on the given conditions:"
+)
 _LM_UNDERSTAND_INSTRUCTION = (
     "Understand the given musical conditions and describe the audio semantics accordingly:"
 )

@@ -3154,6 +3217,224 @@ def _build_understand_prompt(
     return ids
 
 
+def _build_generate_prompt(
+    bpe_tokenizer, caption: str, lyrics: str,
+) -> List[int]:
+    """Build the Qwen3 chat prompt for audio code generation.
+
+    Format (matching C++ build_lm_prompt in prompt.h):
+        <|im_start|>system
+        # Instruction
+        {LM_GENERATE_INSTRUCTION}
+
+        <|im_end|>
+        <|im_start|>user
+        # Caption
+        {caption}
+
+        # Lyric
+        {lyrics}
+        <|im_end|>
+        <|im_start|>assistant
+    """
+    ids: List[int] = []
+
+    def append_text(text: str):
+        encoded = bpe_tokenizer.encode(text, add_special_tokens=False)
+        ids.extend(encoded)
+
+    ids.append(_TOKEN_IM_START)
+    append_text(
+        "system\n# Instruction\n"
+        + _LM_GENERATE_INSTRUCTION
+        + "\n\n"
+    )
+    ids.append(_TOKEN_IM_END)
+    append_text("\n")
+    ids.append(_TOKEN_IM_START)
+    append_text(
+        "user\n# Caption\n" + caption + "\n\n"
+        "# Lyric\n" + lyrics + "\n"
+    )
+    ids.append(_TOKEN_IM_END)
+    append_text("\n")
+    ids.append(_TOKEN_IM_START)
+    append_text("assistant\n")
+    return ids
+
+
+def _generate_codes_with_lm(
+    checkpoint_dir: str,
+    caption: str,
+    lyrics: str,
+    duration: float,
+    device: str,
+    temperature: float = 0.85,
+    top_p: float = 0.9,
+    top_k: int = 0,
+    max_new_tokens: int = 8192,
+) -> List[int]:
+    """Run the ACE-Step LM (Qwen3 1.7B) to generate audio codes from text.
+
+    The LM generates in two phases within a single autoregressive pass:
+      Phase 1 (CoT): <think> metadata YAML (bpm, duration, key, etc.) </think>
+      Phase 2 (codes): audio code tokens (token_id >= AUDIO_CODE_BASE)
+
+    Args:
+        checkpoint_dir: Root directory containing acestep-5Hz-lm-1.7B/.
+        caption: Text description of the music.
+        lyrics: Lyrics text or "[Instrumental]".
+        duration: Target duration in seconds (the LM may override via CoT).
+        device: Torch device string.
+        temperature: Sampling temperature.
+        top_p: Nucleus sampling cutoff (0.0 = disabled).
+        top_k: Top-K sampling (0 = disabled).
+        max_new_tokens: Maximum tokens to generate.
+
+    Returns:
+        List of FSQ code indices (0-63999 range, NOT offset by AUDIO_CODE_BASE).
+        Length is approximately duration * 5 (5 Hz token rate).
+    """
+    ckpt = Path(checkpoint_dir).resolve()
+    lm_path = ckpt / "acestep-5Hz-lm-1.7B"
+    if not lm_path.is_dir():
+        raise FileNotFoundError(f"LM checkpoint not found: {lm_path}")
+
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    # Load BPE tokenizer
+    bpe_tokenizer = AutoTokenizer.from_pretrained(str(lm_path))
+
+    # Build generation prompt
+    prompt_ids = _build_generate_prompt(bpe_tokenizer, caption, lyrics)
+    logger.info(
+        "[LM Generate] Prompt: %d tokens, caption=%r, lyrics=%r",
+        len(prompt_ids), caption[:80], lyrics[:80],
+    )
+
+    # Load the LM (Qwen3Model with tied embeddings -> CausalLM)
+    from transformers import Qwen3Config
+    lm_config = Qwen3Config.from_pretrained(str(lm_path))
+    lm_config.architectures = ["Qwen3ForCausalLM"]
+
+    lm_dtype = select_dtype(device)
+    lm_model = AutoModelForCausalLM.from_pretrained(
+        str(lm_path),
+        config=lm_config,
+        torch_dtype=lm_dtype,
+        low_cpu_mem_usage=True,
+    )
+    lm_model = lm_model.to(device=device)
+    lm_model.eval()
+    logger.info("[LM Generate] LM loaded on %s (dtype=%s)", device, lm_dtype)
+
+    # Autoregressive decode: single sequence, no CFG.
+    prompt_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
+
+    with torch.inference_mode():
+        outputs = lm_model(input_ids=prompt_tensor, use_cache=True)
+    logits = outputs.logits[:, -1, :]  # [1, vocab_size]
+    past_kv = outputs.past_key_values
+
+    gen_tokens: List[int] = []
+    audio_codes: List[int] = []
+    past_think = False
+    in_think = False
+
+    for step in range(max_new_tokens):
+        logits = logits.clone()
+
+        # Phase 1 (inside <think>): block audio codes so only text is generated
+        if in_think:
+            logits[0, _AUDIO_CODE_BASE:] = float("-inf")
+
+        # Phase 2 (after </think>): only allow audio codes + im_end
+        if past_think:
+            # Zero out all non-audio-code logits except im_end (stop token)
+            mask = torch.full_like(logits[0], float("-inf"))
+            mask[_AUDIO_CODE_BASE:] = 0.0
+            mask[_TOKEN_IM_END] = 0.0
+            logits[0] = logits[0] + mask
+
+        # Sample
+        if temperature <= 0:
+            next_id = int(logits[0].argmax().item())
+        else:
+            scaled = logits[0].clone() / temperature
+            if top_k > 0:
+                topk_vals, _ = torch.topk(scaled, min(top_k, scaled.shape[0]))
+                scaled[scaled < topk_vals[-1]] = float("-inf")
+            if top_p > 0 and top_p < 1.0:
+                sorted_logits, sorted_idx = torch.sort(scaled, descending=True)
+                probs = torch.softmax(sorted_logits, dim=-1)
+                cumsum = torch.cumsum(probs, dim=-1)
+                nucleus_mask = cumsum - probs > top_p
+                sorted_logits[nucleus_mask] = float("-inf")
+                scaled = torch.zeros_like(scaled).scatter(0, sorted_idx, sorted_logits)
+            probs = torch.softmax(scaled, dim=-1)
+            next_id = int(torch.multinomial(probs, 1).item())
+
+        # Stop on im_end
+        if next_id == _TOKEN_IM_END:
+            break
+
+        # Track think state transitions
+        if next_id == _TOKEN_THINK:
+            in_think = True
+        elif next_id == _TOKEN_THINK_END:
+            in_think = False
+            past_think = True
+
+        gen_tokens.append(next_id)
+
+        # Collect audio codes (Phase 2 tokens)
+        if next_id >= _AUDIO_CODE_BASE:
+            audio_codes.append(next_id - _AUDIO_CODE_BASE)
+
+        # Next step
+        next_input = torch.tensor([[next_id]], dtype=torch.long, device=device)
+        with torch.inference_mode():
+            outputs = lm_model(
+                input_ids=next_input,
+                past_key_values=past_kv,
+                use_cache=True,
+            )
+        logits = outputs.logits[:, -1, :]
+        past_kv = outputs.past_key_values
+
+    # Log what the LM generated
+    cot_tokens = [
+        t for t in gen_tokens
+        if t < _AUDIO_CODE_BASE and t not in (
+            _TOKEN_IM_START, _TOKEN_IM_END, _TOKEN_THINK, _TOKEN_THINK_END,
+        )
+    ]
+    if cot_tokens:
+        cot_text = bpe_tokenizer.decode(cot_tokens, skip_special_tokens=False)
+        logger.info("[LM Generate] CoT output:\n%s", cot_text[:500])
+
+    logger.info(
+        "[LM Generate] Generated %d total tokens, %d audio codes (%.1fs @ 5Hz)",
+        len(gen_tokens), len(audio_codes), len(audio_codes) / 5.0,
+    )
+
+    # Unload LM
+    del outputs, logits, past_kv, prompt_tensor
+    unload_models(lm_model)
+    del lm_model, bpe_tokenizer
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("[LM Generate] LM unloaded")
+
+    if not audio_codes:
+        logger.warning(
+            "[LM Generate] No audio codes generated! The DiT will fall back to "
+            "silence context. Check that the LM checkpoint is correct."
+        )
+
+    return audio_codes
+
+
 def _parse_understand_output(text: str) -> Dict[str, str]:
     """Parse CoT metadata + lyrics from understand LM output.
 

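Taken together, the two new helpers compose as sketched below. The paths, caption, and lyrics are illustrative, and the decoded output shown in comments is an assumption about how the tokenizer renders the special tokens, though the chat layout follows the _build_generate_prompt docstring.

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("/ckpts/acestep-5Hz-lm-1.7B")
    ids = _build_generate_prompt(tok, caption="upbeat synthwave", lyrics="[Instrumental]")
    print(tok.decode(ids))
    # <|im_start|>system
    # # Instruction
    # Generate audio semantic tokens based on the given conditions:
    #
    # <|im_end|>
    # <|im_start|>user
    # # Caption
    # upbeat synthwave
    #
    # # Lyric
    # [Instrumental]
    # <|im_end|>
    # <|im_start|>assistant

    codes = _generate_codes_with_lm(
        checkpoint_dir="/ckpts",
        caption="upbeat synthwave",
        lyrics="[Instrumental]",
        duration=30.0,
        device="cuda:0",
    )
    assert all(0 <= c < 64000 for c in codes)  # FSQ indices, AUDIO_CODE_BASE offset removed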