Spaces:
Running
Running
chunk latents into ~30s segments for faster CPU training, energy-aware boundaries
Browse files
- app.py +2 -1
- train_engine.py +88 -43
app.py
CHANGED
|
@@ -749,7 +749,8 @@ def gradio_main():
|
|
| 749 |
processed = result.get("processed", 0)
|
| 750 |
failed = result.get("failed", 0)
|
| 751 |
total = result.get("total", 0)
|
| 752 |
-
|
|
|
|
| 753 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 754 |
|
| 755 |
if processed == 0:
|
|
|
|
| 749 |
processed = result.get("processed", 0)
|
| 750 |
failed = result.get("failed", 0)
|
| 751 |
total = result.get("total", 0)
|
| 752 |
+
chunks = result.get("chunks", processed)
|
| 753 |
+
_log(f"[OK] Preprocessed: {total} files -> {processed} training samples (failed: {failed})")
|
| 754 |
yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
|
| 755 |
|
| 756 |
if processed == 0:
|
train_engine.py
CHANGED
|
@@ -64,6 +64,10 @@ logger = logging.getLogger(__name__)
|
|
| 64 |
MAX_AUDIO_DURATION = 240.0 # seconds, cap per audio file
|
| 65 |
MAX_TRAINING_TIME = 28800 # 8 hours hard timeout
|
| 66 |
TARGET_SR = 48000
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
AUDIO_EXTENSIONS = frozenset({".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".aac"})
|
| 68 |
|
| 69 |
# bfloat16 deadlocks on CPU (known PyTorch bug) -- force float32
|
|
@@ -778,6 +782,46 @@ def encode_lyrics(text_encoder, tokenizer, lyrics: str, device, dtype):
|
|
| 778 |
return hs, mask
|
| 779 |
|
| 780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
# ============================================================================
|
| 782 |
# VAE TILED ENCODING
|
| 783 |
# ============================================================================
|
|
@@ -2148,18 +2192,13 @@ def preprocess_audio(
|
|
| 2148 |
del target_latents
|
| 2149 |
continue
|
| 2150 |
|
| 2151 |
-
|
| 2152 |
-
att_mask = torch.ones(1, lat_len, device=device, dtype=dtype)
|
| 2153 |
-
|
| 2154 |
-
# Auto-caption: read existing sidecar or analyze
|
| 2155 |
sidecar = _read_caption_sidecar(af)
|
| 2156 |
if sidecar is not None:
|
| 2157 |
caption = sidecar.get("caption", "") or af.stem
|
| 2158 |
lyrics = sidecar.get("lyrics", "[Instrumental]")
|
| 2159 |
logger.info("[Caption] %s: using existing sidecar", af.name)
|
| 2160 |
else:
|
| 2161 |
-
# Auto-select analysis mode based on dataset size
|
| 2162 |
-
# mid/sas use Demucs stem separation — GPU only
|
| 2163 |
if device == "cpu":
|
| 2164 |
analysis_mode = "faf"
|
| 2165 |
elif total <= 20:
|
|
@@ -2169,7 +2208,6 @@ def preprocess_audio(
|
|
| 2169 |
else:
|
| 2170 |
analysis_mode = "faf"
|
| 2171 |
|
| 2172 |
-
# Log mode selection with reasoning (first file only)
|
| 2173 |
if i == 0:
|
| 2174 |
_MODE_DESC = {
|
| 2175 |
"faf": "fast, ~3s/file",
|
|
@@ -2177,19 +2215,9 @@ def preprocess_audio(
|
|
| 2177 |
"sas": "best quality, ~30s/file on GPU, slower on CPU",
|
| 2178 |
}
|
| 2179 |
logger.info(
|
| 2180 |
-
"[Analysis] Mode '%s' (%s) "
|
| 2181 |
-
"for %d files (<=20: sas, 21-100: mid, 100+: faf)",
|
| 2182 |
analysis_mode, _MODE_DESC[analysis_mode], total,
|
| 2183 |
)
|
| 2184 |
-
if analysis_mode in ("mid", "sas") and device == "cpu":
|
| 2185 |
-
logger.warning(
|
| 2186 |
-
"[Analysis] Mode '%s' uses Demucs stem separation "
|
| 2187 |
-
"which is SLOW on CPU (~2-5 min/file). "
|
| 2188 |
-
"Total estimated time: ~%d-%d min for %d files. "
|
| 2189 |
-
"Use 'faf' mode or a GPU machine for faster processing.",
|
| 2190 |
-
analysis_mode,
|
| 2191 |
-
total * 2, total * 5, total,
|
| 2192 |
-
)
|
| 2193 |
|
| 2194 |
try:
|
| 2195 |
logger.info("[Caption] %s: analyzing (mode=%s)...", af.name, analysis_mode)
|
|
@@ -2204,10 +2232,9 @@ def preprocess_audio(
|
|
| 2204 |
logger.warning("[Caption] %s: analysis failed (%s), using filename", af.name, exc)
|
| 2205 |
caption = af.stem
|
| 2206 |
lyrics = "[Instrumental]"
|
| 2207 |
-
text_prompt = caption
|
| 2208 |
|
| 2209 |
with torch.no_grad():
|
| 2210 |
-
text_hs, text_mask = encode_text(text_enc, tokenizer, text_prompt, device, dtype)
|
| 2211 |
lyric_hs, lyric_mask = encode_lyrics(text_enc, tokenizer, lyrics, device, dtype)
|
| 2212 |
|
| 2213 |
has_bad = any(
|
|
@@ -2216,32 +2243,49 @@ def preprocess_audio(
|
|
| 2216 |
)
|
| 2217 |
if has_bad:
|
| 2218 |
p1_failed += 1
|
| 2219 |
-
del target_latents
|
| 2220 |
continue
|
| 2221 |
|
| 2222 |
-
|
| 2223 |
-
|
| 2224 |
-
|
| 2225 |
-
|
| 2226 |
-
|
| 2227 |
-
|
| 2228 |
-
|
| 2229 |
-
|
| 2230 |
-
|
| 2231 |
-
|
| 2232 |
-
|
| 2233 |
-
|
| 2234 |
-
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
-
|
| 2238 |
-
}
|
| 2239 |
-
|
| 2240 |
-
|
| 2241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2242 |
|
| 2243 |
if progress_callback:
|
| 2244 |
-
progress_callback(i + 1, total, f"[Pass 1] {af.name}")
|
| 2245 |
|
| 2246 |
except Exception as exc:
|
| 2247 |
p1_failed += 1
|
|
@@ -2329,7 +2373,8 @@ def preprocess_audio(
|
|
| 2329 |
_clear_gpu_cache(device)
|
| 2330 |
|
| 2331 |
failed = p1_failed + p2_failed
|
| 2332 |
-
return {"processed": processed, "failed": failed, "total": total, "output_dir": str(out)}
|
|
|
|
| 2333 |
|
| 2334 |
|
| 2335 |
# ============================================================================
|
|
|
|
| 64 |
MAX_AUDIO_DURATION = 240.0 # seconds, cap per audio file
|
| 65 |
MAX_TRAINING_TIME = 28800 # 8 hours hard timeout
|
| 66 |
TARGET_SR = 48000
|
| 67 |
+
LATENT_HZ = 25 # latent frames per second (48000 / 1920)
|
| 68 |
+
CHUNK_LATENT_MIN = 20 * LATENT_HZ # 500 frames (20s)
|
| 69 |
+
CHUNK_LATENT_TARGET = 30 * LATENT_HZ # 750 frames (30s)
|
| 70 |
+
CHUNK_LATENT_MAX = 40 * LATENT_HZ # 1000 frames (40s)
|
| 71 |
AUDIO_EXTENSIONS = frozenset({".wav", ".mp3", ".flac", ".ogg", ".opus", ".m4a", ".aac"})
|
| 72 |
|
| 73 |
# bfloat16 deadlocks on CPU (known PyTorch bug) -- force float32
|
|
|
|
| 782 |
return hs, mask
|
| 783 |
|
| 784 |
|
| 785 |
+
# ============================================================================
|
| 786 |
+
# LATENT CHUNKING (split long latents into ~30s training samples)
|
| 787 |
+
# ============================================================================
|
| 788 |
+
|
| 789 |
+
def _chunk_latents(latent: torch.Tensor) -> List[torch.Tensor]:
|
| 790 |
+
"""Split a [T, C] latent into ~30s chunks for faster training.
|
| 791 |
+
|
| 792 |
+
Uses energy-based boundary detection: finds the lowest-energy frame
|
| 793 |
+
within the 20-40s window around each cut point, avoiding cuts through
|
| 794 |
+
loud notes. Short files (<=40s) are returned as-is.
|
| 795 |
+
"""
|
| 796 |
+
T = latent.shape[0]
|
| 797 |
+
if T <= CHUNK_LATENT_MAX:
|
| 798 |
+
return [latent]
|
| 799 |
+
|
| 800 |
+
energy = latent.pow(2).mean(dim=-1) # [T] per-frame energy
|
| 801 |
+
|
| 802 |
+
chunks = []
|
| 803 |
+
pos = 0
|
| 804 |
+
while pos < T:
|
| 805 |
+
remaining = T - pos
|
| 806 |
+
if remaining <= CHUNK_LATENT_MAX:
|
| 807 |
+
if chunks and remaining < CHUNK_LATENT_MIN:
|
| 808 |
+
# Merge short tail into the previous chunk
|
| 809 |
+
chunks[-1] = latent[pos - chunks[-1].shape[0]:]
|
| 810 |
+
else:
|
| 811 |
+
chunks.append(latent[pos:])
|
| 812 |
+
break
|
| 813 |
+
|
| 814 |
+
search_start = pos + CHUNK_LATENT_MIN
|
| 815 |
+
search_end = min(pos + CHUNK_LATENT_MAX, T)
|
| 816 |
+
window = energy[search_start:search_end]
|
| 817 |
+
cut = search_start + window.argmin().item()
|
| 818 |
+
|
| 819 |
+
chunks.append(latent[pos:cut])
|
| 820 |
+
pos = cut
|
| 821 |
+
|
| 822 |
+
return chunks
|
| 823 |
+
|
| 824 |
+
|
| 825 |
# ============================================================================
|
| 826 |
# VAE TILED ENCODING
|
| 827 |
# ============================================================================
|
|
|
|
| 2192 |
del target_latents
|
| 2193 |
continue
|
| 2194 |
|
| 2195 |
+
# Auto-caption (once per file, shared across chunks)
|
|
|
|
|
|
|
|
|
|
| 2196 |
sidecar = _read_caption_sidecar(af)
|
| 2197 |
if sidecar is not None:
|
| 2198 |
caption = sidecar.get("caption", "") or af.stem
|
| 2199 |
lyrics = sidecar.get("lyrics", "[Instrumental]")
|
| 2200 |
logger.info("[Caption] %s: using existing sidecar", af.name)
|
| 2201 |
else:
|
|
|
|
|
|
|
| 2202 |
if device == "cpu":
|
| 2203 |
analysis_mode = "faf"
|
| 2204 |
elif total <= 20:
|
|
|
|
| 2208 |
else:
|
| 2209 |
analysis_mode = "faf"
|
| 2210 |
|
|
|
|
| 2211 |
if i == 0:
|
| 2212 |
_MODE_DESC = {
|
| 2213 |
"faf": "fast, ~3s/file",
|
|
|
|
| 2215 |
"sas": "best quality, ~30s/file on GPU, slower on CPU",
|
| 2216 |
}
|
| 2217 |
logger.info(
|
| 2218 |
+
"[Analysis] Mode '%s' (%s) for %d files",
|
|
|
|
| 2219 |
analysis_mode, _MODE_DESC[analysis_mode], total,
|
| 2220 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2221 |
|
| 2222 |
try:
|
| 2223 |
logger.info("[Caption] %s: analyzing (mode=%s)...", af.name, analysis_mode)
|
|
|
|
| 2232 |
logger.warning("[Caption] %s: analysis failed (%s), using filename", af.name, exc)
|
| 2233 |
caption = af.stem
|
| 2234 |
lyrics = "[Instrumental]"
|
|
|
|
| 2235 |
|
| 2236 |
with torch.no_grad():
|
| 2237 |
+
text_hs, text_mask = encode_text(text_enc, tokenizer, caption, device, dtype)
|
| 2238 |
lyric_hs, lyric_mask = encode_lyrics(text_enc, tokenizer, lyrics, device, dtype)
|
| 2239 |
|
| 2240 |
has_bad = any(
|
|
|
|
| 2243 |
)
|
| 2244 |
if has_bad:
|
| 2245 |
p1_failed += 1
|
| 2246 |
+
del target_latents, text_hs, text_mask, lyric_hs, lyric_mask
|
| 2247 |
continue
|
| 2248 |
|
| 2249 |
+
# Chunk latents into ~30s segments for faster training
|
| 2250 |
+
full_lat = target_latents.squeeze(0).cpu() # [T, C]
|
| 2251 |
+
T = full_lat.shape[0]
|
| 2252 |
+
chunks = _chunk_latents(full_lat)
|
| 2253 |
+
logger.info("[Chunk] %s: %d frames -> %d chunks", af.name, T, len(chunks))
|
| 2254 |
+
|
| 2255 |
+
text_hs_cpu = text_hs.cpu()
|
| 2256 |
+
text_mask_cpu = text_mask.cpu()
|
| 2257 |
+
lyric_hs_cpu = lyric_hs.cpu()
|
| 2258 |
+
lyric_mask_cpu = lyric_mask.cpu()
|
| 2259 |
+
silence_cpu = silence_lat.cpu()
|
| 2260 |
+
meta = {
|
| 2261 |
+
"audio_path": str(af),
|
| 2262 |
+
"filename": af.name,
|
| 2263 |
+
"caption": caption,
|
| 2264 |
+
"lyrics": lyrics,
|
| 2265 |
+
}
|
| 2266 |
+
|
| 2267 |
+
for ci, chunk_lat in enumerate(chunks):
|
| 2268 |
+
chunk_len = chunk_lat.shape[0]
|
| 2269 |
+
chunk_mask = torch.ones(chunk_len, dtype=dtype)
|
| 2270 |
+
tag = f"{stem}_chunk{ci}" if len(chunks) > 1 else stem
|
| 2271 |
+
tmp_path = out / f"{tag}.tmp.pt"
|
| 2272 |
+
torch.save({
|
| 2273 |
+
"target_latents": chunk_lat,
|
| 2274 |
+
"attention_mask": chunk_mask,
|
| 2275 |
+
"text_hidden_states": text_hs_cpu,
|
| 2276 |
+
"text_attention_mask": text_mask_cpu,
|
| 2277 |
+
"lyric_hidden_states": lyric_hs_cpu,
|
| 2278 |
+
"lyric_attention_mask": lyric_mask_cpu,
|
| 2279 |
+
"silence_latent": silence_cpu,
|
| 2280 |
+
"latent_length": chunk_len,
|
| 2281 |
+
"metadata": meta,
|
| 2282 |
+
}, tmp_path)
|
| 2283 |
+
intermediates.append(tmp_path)
|
| 2284 |
+
|
| 2285 |
+
del target_latents, full_lat, text_hs, text_mask, lyric_hs, lyric_mask
|
| 2286 |
|
| 2287 |
if progress_callback:
|
| 2288 |
+
progress_callback(i + 1, total, f"[Pass 1] {af.name} ({len(chunks)} chunks)")
|
| 2289 |
|
| 2290 |
except Exception as exc:
|
| 2291 |
p1_failed += 1
|
|
|
|
| 2373 |
_clear_gpu_cache(device)
|
| 2374 |
|
| 2375 |
failed = p1_failed + p2_failed
|
| 2376 |
+
return {"processed": processed, "failed": failed, "total": total,
|
| 2377 |
+
"chunks": len(intermediates), "output_dir": str(out)}
|
| 2378 |
|
| 2379 |
|
| 2380 |
# ============================================================================
|