Nekochu committed
Commit 6bfdc38 · 1 Parent(s): ff239f5

add understand_audio (LM reverse), demucs-infer fix, commit refs, dtype fixes

Files changed (1): train_engine.py (+411 -0)
train_engine.py CHANGED
@@ -4,6 +4,11 @@ Standalone ACE-Step LoRA Training Engine (CPU + GPU).
 Ported from Side-Step (koda-dernet/Side-Step) into a single self-contained
 module. No external Side-Step dependency required.
 
+Source commits:
+    Side-Step: koda-dernet/Side-Step @ ecd13bd (2026-04-19)
+    acestep.cpp: ServeurpersoCom/acestep.cpp @ 36e4db1 (prompt/understand format)
+    ACE-Step: ace-step/ACE-Step-1.5 (model architecture, training_v2)
+
 Auto-detects GPU (CUDA > MPS > CPU) and uses it when available,
 falling back to CPU. bfloat16 is used on GPU; float32 is forced
 on CPU (bfloat16 deadlocks on CPU -- known PyTorch bug).
@@ -17,6 +22,7 @@ Exports:
     get_trained_loras() - List saved adapters
     generate_audio() - Standalone inference (text -> WAV, optional LoRA)
     tiled_vae_decode() - Tiled VAE latent-to-waveform decode
+    understand_audio() - Reverse pipeline (audio -> caption + lyrics)
 """
 
 from __future__ import annotations
@@ -3086,3 +3092,408 @@ def generate_audio(
 
     logger.info("Audio saved to %s (%.1fs @ %d Hz)", output_path, duration, TARGET_SR)
     return output_path
+
+
+# ============================================================================
+# UNDERSTAND MODE (reverse pipeline: audio -> caption + lyrics)
+# ============================================================================
+
+# Qwen3 special token IDs (ACE-Step LM vocabulary)
+_TOKEN_IM_START = 151644
+_TOKEN_IM_END = 151645
+_TOKEN_THINK = 151667
+_TOKEN_THINK_END = 151668
+_AUDIO_CODE_BASE = 151669
+
+# Understand system instruction (matches acestep.cpp task-types.h)
+_LM_UNDERSTAND_INSTRUCTION = (
+    "Understand the given musical conditions and describe the audio semantics accordingly:"
+)
+
+
+def _build_understand_prompt(
+    bpe_tokenizer, codes: List[int],
+) -> List[int]:
+    """Build the Qwen3 chat prompt for understand mode.
+
+    Format (matching C++ build_understand_prompt in prompt.h):
+        <|im_start|>system
+        # Instruction
+        {LM_UNDERSTAND_INSTRUCTION}
+
+        <|im_end|>
+        <|im_start|>user
+        {audio_code_tokens}
+        <|im_end|>
+        <|im_start|>assistant
+    """
+    ids: List[int] = []
+
+    def append_text(text: str):
+        encoded = bpe_tokenizer.encode(text, add_special_tokens=False)
+        ids.extend(encoded)
+
+    ids.append(_TOKEN_IM_START)
+    append_text(
+        "system\n# Instruction\n"
+        + _LM_UNDERSTAND_INSTRUCTION
+        + "\n\n"
+    )
+    ids.append(_TOKEN_IM_END)
+    append_text("\n")
+    ids.append(_TOKEN_IM_START)
+    append_text("user\n")
+    # Audio codes as raw token IDs (not BPE text)
+    for code in codes:
+        ids.append(_AUDIO_CODE_BASE + code)
+    append_text("\n")
+    ids.append(_TOKEN_IM_END)
+    append_text("\n")
+    ids.append(_TOKEN_IM_START)
+    append_text("assistant\n")
+    return ids
+
+
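The builder returns a flat list of token IDs in which the audio codes enter as raw IDs at or above _AUDIO_CODE_BASE rather than as BPE-encoded text. A minimal sanity-check sketch of that layout, using a hypothetical stub in place of the real Qwen3 BPE tokenizer (the stub and the code values are illustrative, not part of the commit):

    class _StubTokenizer:
        # Stand-in for the Qwen3 BPE tokenizer so the sketch runs standalone:
        # one fake BPE id per character.
        def encode(self, text, add_special_tokens=False):
            return [ord(c) for c in text]

    ids = _build_understand_prompt(_StubTokenizer(), codes=[12, 7, 3051])
    assert ids[0] == _TOKEN_IM_START       # <|im_start|> opens the system turn
    assert _AUDIO_CODE_BASE + 12 in ids    # audio codes are raw token IDs
    assert ids[-1] == ord("\n")            # prompt ends right after "assistant\n"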
+def _parse_understand_output(text: str) -> Dict[str, str]:
+    """Parse CoT metadata + lyrics from understand LM output.
+
+    The LM generates:
+        <think>
+        bpm: 120
+        caption: ...
+        duration: 180
+        keyscale: C major
+        language: en
+        timesignature: 4
+        </think>
+        [Verse 1]
+        ...lyrics...
+
+    Returns dict with: caption, lyrics, bpm, key, signature, duration,
+    language.
+    """
+    result: Dict[str, str] = {}
+
+    # Split at <think> / </think> boundaries
+    cot = ""
+    lyrics_after = ""
+    ts = text.find("<think>")
+    te = text.find("</think>")
+
+    if ts != -1 and te != -1:
+        cot = text[ts + 7:te]
+        lyrics_after = text[te + 8:]
+    elif te != -1:
+        cot = text[:te]
+        lyrics_after = text[te + 8:]
+    else:
+        cot = text
+
+    # Parse YAML-like fields from CoT
+    def get_field(key: str) -> str:
+        needle = key + ":"
+        p = cot.find(needle)
+        if p == -1:
+            return ""
+        p += len(needle)
+        # Skip leading whitespace and quotes
+        while p < len(cot) and cot[p] in (" ", "'"):
+            p += 1
+        end = cot.find("\n", p)
+        if end == -1:
+            end = len(cot)
+        val = cot[p:end].rstrip(" '\r")
+        return val
+
+    bpm_s = get_field("bpm")
+    if bpm_s:
+        result["bpm"] = bpm_s
+
+    dur_s = get_field("duration")
+    if dur_s:
+        result["duration"] = dur_s
+
+    ks = get_field("keyscale")
+    if ks:
+        result["key"] = ks
+
+    ts_s = get_field("timesignature")
+    if ts_s:
+        result["signature"] = ts_s
+
+    lang = get_field("language")
+    if lang:
+        result["language"] = lang
+
+    # Caption may span multiple lines (YAML word-wrap)
+    cap_needle = "caption:"
+    cp = cot.find(cap_needle)
+    if cp != -1:
+        cp += len(cap_needle)
+        # Read until next known field or end of CoT
+        end = len(cot)
+        for next_field in ("duration:", "keyscale:", "language:", "timesignature:", "bpm:"):
+            nf = cot.find("\n" + next_field, cp)
+            if nf != -1 and nf < end:
+                end = nf
+        full_cap = cot[cp:end]
+        # Collapse whitespace
+        cleaned = " ".join(full_cap.split()).strip()
+        if cleaned:
+            result["caption"] = cleaned
+
+    # Lyrics after </think>
+    if lyrics_after:
+        lyrics = lyrics_after.strip()
+        # Strip "# Lyric\n" header the LM may echo back
+        lp = lyrics.find("# Lyric\n")
+        if lp != -1 and lp < 64:
+            lyrics = lyrics[lp + 8:]
+        lyrics = lyrics.strip()
+        if lyrics:
+            result["lyrics"] = lyrics
+
+    return result
+
+
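To make the parser's contract concrete, a round-trip on a hand-written string in the documented CoT format (all values invented for the example):

    sample = (
        "<think>\n"
        "bpm: 120\n"
        "caption: warm lo-fi piano with vinyl crackle\n"
        "duration: 180\n"
        "keyscale: C major\n"
        "language: en\n"
        "timesignature: 4\n"
        "</think>\n"
        "[Verse 1]\nRain on the window, slow and low\n"
    )
    parsed = _parse_understand_output(sample)
    assert parsed["bpm"] == "120"
    assert parsed["key"] == "C major"      # "keyscale" is exposed as "key"
    assert parsed["signature"] == "4"      # "timesignature" -> "signature"
    assert parsed["lyrics"].startswith("[Verse 1]")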
+def understand_audio(
+    audio_path: str,
+    checkpoint_dir: str,
+    device: str = "auto",
+    variant: str = "turbo",
+    temperature: float = 0.3,
+    top_p: float = 0.0,
+    top_k: int = 0,
+    max_new_tokens: int = 4096,
+) -> Dict[str, str]:
+    """Extract caption, lyrics, BPM, key, signature from audio using the LM.
+
+    Pipeline: audio -> VAE encode -> FSQ tokenize -> LM understand -> text
+    Returns dict with: caption, lyrics, bpm, key, signature, duration,
+    language.
+
+    Args:
+        audio_path: Path to input audio file (WAV, MP3, FLAC, etc.)
+        checkpoint_dir: Path to ACE-Step checkpoints root directory
+            (must contain vae/, acestep-v15-turbo/ or variant subdir,
+            and acestep-5Hz-lm-1.7B/).
+        device: Device string ("auto", "cuda:0", "cpu", etc.)
+        variant: DiT variant to load for FSQ tokenizer ("turbo", "sft",
+            "base", etc.)
+        temperature: LM sampling temperature (default 0.3, lower = more
+            deterministic).
+        top_p: Nucleus sampling cutoff (0.0 = disabled).
+        top_k: Top-K sampling (0 = disabled).
+        max_new_tokens: Maximum tokens to generate.
+
+    Returns:
+        Dict with extracted metadata. Keys may include:
+        caption, lyrics, bpm, key, signature, duration, language.
+    """
+    device = detect_device(device)
+    dtype = select_dtype(device)
+    ckpt = Path(checkpoint_dir).resolve()
+
+    # ------------------------------------------------------------------
+    # Step 1: Load audio -> VAE encode -> latents [1, T_25Hz, 64]
+    # ------------------------------------------------------------------
+    logger.info("[Understand] Step 1: VAE encode")
+    audio, sr = load_audio_stereo(audio_path, TARGET_SR, MAX_AUDIO_DURATION)
+    audio = audio.unsqueeze(0)  # [1, 2, samples]
+    logger.info(
+        "[Understand] Audio loaded: %.1fs, %d samples @ %d Hz",
+        audio.shape[-1] / TARGET_SR, audio.shape[-1], TARGET_SR,
+    )
+
+    vae = load_vae(checkpoint_dir, device)
+    latents = tiled_vae_encode(vae, audio, dtype)  # [1, T_25Hz, 64]
+    T_25Hz = latents.shape[1]
+    logger.info("[Understand] VAE encoded: %d latent frames (%.2fs)", T_25Hz, T_25Hz * 1920.0 / TARGET_SR)
+
+    unload_models(vae)
+    del vae, audio
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("[Understand] VAE unloaded")
+
+    # ------------------------------------------------------------------
+    # Step 2: Load DiT (for FSQ tokenizer) -> tokenize latents -> codes
+    # ------------------------------------------------------------------
+    logger.info("[Understand] Step 2: FSQ tokenize")
+
+    # Load silence_latent for padding
+    silence_latent = load_silence_latent(checkpoint_dir, device="cpu", variant=variant)
+
+    # Load DiT model (only need its tokenizer submodule)
+    model = load_model_for_training(checkpoint_dir, variant=variant, device=device)
+    model = model.to(dtype=dtype)
+    pool_window = model.config.pool_window_size  # 5 (25Hz -> 5Hz)
+
+    # Pad latents to a multiple of pool_window_size
+    lat = latents.to(device=device, dtype=dtype)
+    pad_len = 0
+    if T_25Hz % pool_window != 0:
+        pad_len = pool_window - (T_25Hz % pool_window)
+        # Use silence_latent for padding
+        sl = silence_latent[:1, :pad_len, :].to(device=device, dtype=dtype)
+        lat = torch.cat([lat, sl.expand(lat.shape[0], -1, -1)], dim=1)
+
+    # Tokenize: lat [1, T_padded, 64] -> indices [1, T_5Hz, 1]
+    with torch.inference_mode():
+        _quantized, indices = model.tokenizer.tokenize(lat)
+
+    # indices shape: [1, T_5Hz, num_quantizers=1] -> flatten to [T_5Hz]
+    codes = indices.squeeze(0).squeeze(-1).cpu().tolist()  # List[int]
+    T_5Hz = len(codes)
+    logger.info(
+        "[Understand] FSQ tokenized: %d codes (%.2fs @ 5Hz)",
+        T_5Hz, T_5Hz / 5.0,
+    )
+
+    unload_models(model)
+    del model, lat, latents, _quantized, indices, silence_latent
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("[Understand] DiT unloaded")
+
+    # ------------------------------------------------------------------
+    # Step 3: Load LM -> build understand prompt -> generate text
+    # ------------------------------------------------------------------
+    logger.info("[Understand] Step 3: LM generation")
+
+    lm_path = ckpt / "acestep-5Hz-lm-1.7B"
+    if not lm_path.is_dir():
+        raise FileNotFoundError(f"LM checkpoint not found: {lm_path}")
+
+    from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    # Load BPE tokenizer
+    bpe_tokenizer = AutoTokenizer.from_pretrained(str(lm_path))
+
+    # Build understand prompt
+    prompt_ids = _build_understand_prompt(bpe_tokenizer, codes)
+    logger.info(
+        "[Understand] Prompt: %d tokens (%d codes + framing)",
+        len(prompt_ids), len(codes),
+    )
+
+    # Load the LM (Qwen3Model with tied embeddings). The config says
+    # "Qwen3Model" but we need generation (lm_head). Since
+    # tie_word_embeddings=true, Qwen3ForCausalLM ties the lm_head to
+    # embed_tokens automatically, so we override the architecture to
+    # load as CausalLM.
+    from transformers import Qwen3Config
+    lm_config = Qwen3Config.from_pretrained(str(lm_path))
+    lm_config.architectures = ["Qwen3ForCausalLM"]
+
+    lm_dtype = select_dtype(device)
+    lm_model = AutoModelForCausalLM.from_pretrained(
+        str(lm_path),
+        config=lm_config,
+        torch_dtype=lm_dtype,
+        low_cpu_mem_usage=True,
+    )
+    lm_model = lm_model.to(device=device)
+    lm_model.eval()
+    logger.info("[Understand] LM loaded on %s (dtype=%s)", device, lm_dtype)
+
+    vocab_size = lm_config.vocab_size  # 217204
+
+    # Autoregressive decode: no CFG, no batching, a single sequence.
+    # The FSM is not implemented in Python (it would require the prefix
+    # tree); at low temperature the LM generates structured CoT well
+    # enough without it.
+    prompt_tensor = torch.tensor([prompt_ids], dtype=torch.long, device=device)
+
+    gen_tokens: List[int] = []
+    past_think = False
+
+    # The whole decode loop runs under one inference_mode block: the
+    # logits produced here are inference tensors, and the in-place
+    # masking below would raise if done outside of it.
+    with torch.inference_mode():
+        # Prefill
+        outputs = lm_model(input_ids=prompt_tensor, use_cache=True)
+        logits = outputs.logits[:, -1, :]  # [1, vocab_size]
+        past_kv = outputs.past_key_values
+
+        for step in range(max_new_tokens):
+            # After </think>: block audio codes so the LM only generates text
+            if past_think:
+                logits[0, _AUDIO_CODE_BASE:] = float("-inf")
+
+            # Sample
+            if temperature <= 0:
+                next_id = int(logits[0].argmax().item())
+            else:
+                scaled = logits[0] / temperature
+                if top_k > 0:
+                    # Mask everything outside the top-k
+                    topk_vals, _ = torch.topk(scaled, min(top_k, scaled.shape[0]))
+                    scaled[scaled < topk_vals[-1]] = float("-inf")
+                if 0 < top_p < 1.0:
+                    sorted_logits, sorted_idx = torch.sort(scaled, descending=True)
+                    probs = torch.softmax(sorted_logits, dim=-1)
+                    cumsum = torch.cumsum(probs, dim=-1)
+                    mask = cumsum - probs > top_p
+                    sorted_logits[mask] = float("-inf")
+                    # Scatter masked values back to original positions
+                    scaled = torch.zeros_like(scaled).scatter(0, sorted_idx, sorted_logits)
+                probs = torch.softmax(scaled, dim=-1)
+                next_id = int(torch.multinomial(probs, 1).item())
+
+            if next_id == _TOKEN_IM_END:
+                break
+
+            if next_id == _TOKEN_THINK_END:
+                past_think = True
+
+            gen_tokens.append(next_id)
+
+            # Next step
+            next_input = torch.tensor([[next_id]], dtype=torch.long, device=device)
+            outputs = lm_model(
+                input_ids=next_input,
+                past_key_values=past_kv,
+                use_cache=True,
+            )
+            logits = outputs.logits[:, -1, :]
+            past_kv = outputs.past_key_values
+
+    logger.info("[Understand] Generated %d tokens", len(gen_tokens))
+
+    # Decode tokens to text (skip audio code tokens and special tokens)
+    text_tokens = [
+        t for t in gen_tokens
+        if t < _AUDIO_CODE_BASE and t not in (
+            _TOKEN_IM_START, _TOKEN_IM_END, _TOKEN_THINK, _TOKEN_THINK_END,
+        )
+    ]
+    generated_text = bpe_tokenizer.decode(text_tokens, skip_special_tokens=False)
+
+    # Re-insert <think> / </think> markers for the parser
+    think_text = ""
+    in_think = False
+    for t in gen_tokens:
+        if t == _TOKEN_THINK:
+            think_text += "<think>"
+            in_think = True
+        elif t == _TOKEN_THINK_END:
+            think_text += "</think>"
+            in_think = False
+        elif t < _AUDIO_CODE_BASE and t not in (_TOKEN_IM_START, _TOKEN_IM_END):
+            think_text += bpe_tokenizer.decode([t], skip_special_tokens=False)
+
+    logger.info("[Understand] Raw output:\n%s", think_text[:500])
+
+    # Unload LM
+    del outputs, logits, past_kv, prompt_tensor
+    unload_models(lm_model)
+    del lm_model, bpe_tokenizer
+    gc.collect()
+    _clear_gpu_cache(device)
+    logger.info("[Understand] LM unloaded")
+
+    # ------------------------------------------------------------------
+    # Step 4: Parse generated text into structured fields
+    # ------------------------------------------------------------------
+    result = _parse_understand_output(think_text)
+    logger.info(
+        "[Understand] Parsed result: %s",
+        {k: v[:80] + "..." if len(v) > 80 else v for k, v in result.items()},
+    )
+    return result
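End to end, a usage sketch of the new entry point (the import path and file locations are assumptions based on this file's name and the docstring, not part of the commit):

    from train_engine import understand_audio  # assumes train_engine.py is importable

    meta = understand_audio(
        audio_path="song.wav",           # placeholder input file
        checkpoint_dir="./checkpoints",  # must contain vae/, the DiT variant dir, acestep-5Hz-lm-1.7B/
        device="auto",                   # CUDA > MPS > CPU auto-detection
        temperature=0.3,                 # low temperature keeps the CoT well-formed
    )
    print(meta.get("caption", ""))
    print(meta.get("bpm", "?"), meta.get("key", "?"), meta.get("signature", "?"))
    print(meta.get("lyrics", "")[:200])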