Nekochu committed
Commit 1ee8f1f · 1 Parent(s): 2e395ab

audio-level chunking (not latent), auto-scale epochs for chunk count

Files changed (2):
  1. app.py +10 -2
  2. train_engine.py +56 -42
app.py CHANGED
@@ -749,7 +749,6 @@ def gradio_main():
     processed = result.get("processed", 0)
     failed = result.get("failed", 0)
     total = result.get("total", 0)
-    chunks = result.get("chunks", processed)
     _log(f"[OK] Preprocessed: {total} files -> {processed} training samples (failed: {failed})")
     yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
@@ -758,6 +757,15 @@ def gradio_main():
         yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
         return
 
+    # Auto-scale epochs: chunking multiplies samples, reduce epochs
+    # to keep total gradient updates ~constant
+    effective_epochs = epochs
+    if processed > total and total > 0:
+        scale = total / processed
+        effective_epochs = max(10, int(epochs * scale))
+        _log(f"[INFO] Auto-scaled epochs: {epochs} -> {effective_epochs} "
+             f"(chunking: {total} files -> {processed} samples)")
+
     _gc.collect()
 
     # -- Phase 2: Training --
@@ -768,7 +776,7 @@ def gradio_main():
         dataset_dir=preprocessed_dir,
         output_dir=adapter_out,
         checkpoint_dir=ACE_CHECKPOINT_DIR,
-        epochs=epochs,
+        epochs=effective_epochs,
         lr=lr,
         rank=rank,
         alpha=rank * 2,
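The scaling rule keeps epochs * samples, and with it the total number of gradient updates at a fixed batch size, roughly constant. A quick sketch of the arithmetic with made-up numbers (10 files chunked into 40 samples, 100 requested epochs; all values hypothetical):

    # Hypothetical numbers, for illustration only.
    epochs, total, processed = 100, 10, 40

    effective_epochs = epochs
    if processed > total and total > 0:
        scale = total / processed                        # 0.25
        effective_epochs = max(10, int(epochs * scale))  # 25

    # updates before chunking: 100 epochs * 10 samples = 1000
    # updates after scaling:    25 epochs * 40 samples = 1000
    print(effective_epochs)  # 25

The max(10, ...) floor means heavily chunked datasets still get at least 10 passes.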
train_engine.py CHANGED
@@ -783,40 +783,54 @@ def encode_lyrics(text_encoder, tokenizer, lyrics: str, device, dtype):
 
 
 # ============================================================================
-# LATENT CHUNKING (split long latents into ~30s training samples)
+# AUDIO CHUNKING (split long audio into ~30s training samples)
 # ============================================================================
 
-def _chunk_latents(latent: torch.Tensor) -> List[torch.Tensor]:
-    """Split a [T, C] latent into ~30s chunks for faster training.
+CHUNK_MIN_SAMPLES = 20 * TARGET_SR  # 20s
+CHUNK_MAX_SAMPLES = 40 * TARGET_SR  # 40s
+
+def _chunk_audio(audio: torch.Tensor) -> List[torch.Tensor]:
+    """Split a [C, S] audio tensor into ~30s chunks for faster training.
 
-    Uses energy-based boundary detection: finds the lowest-energy frame
-    within the 20-40s window around each cut point, avoiding cuts through
-    loud notes. Short files (<=40s) are returned as-is.
+    Uses RMS energy to find the quietest point within the 20-40s window
+    around each cut, avoiding cuts through loud notes.
+    Short files (<=40s) are returned as-is.
     """
-    T = latent.shape[0]
-    if T <= CHUNK_LATENT_MAX:
-        return [latent]
+    S = audio.shape[-1]
+    if S <= CHUNK_MAX_SAMPLES:
+        return [audio]
+
+    mono = audio.mean(dim=0)  # [S]
+    hop = TARGET_SR // 10  # 0.1s resolution
+    frame_count = S // hop
+    rms = torch.zeros(frame_count)
+    for fi in range(frame_count):
+        seg = mono[fi * hop:(fi + 1) * hop]
+        rms[fi] = seg.pow(2).mean().sqrt()
 
-    energy = latent.pow(2).mean(dim=-1)  # [T] per-frame energy
+    min_frames = 20 * 10  # 20s in 0.1s frames
+    max_frames = 40 * 10  # 40s
 
     chunks = []
     pos = 0
-    while pos < T:
-        remaining = T - pos
-        if remaining <= CHUNK_LATENT_MAX:
-            if chunks and remaining < CHUNK_LATENT_MIN:
-                # Merge short tail into the previous chunk
-                chunks[-1] = latent[pos - chunks[-1].shape[0]:]
-            else:
-                chunks.append(latent[pos:])
+    while pos < frame_count:
+        remaining = frame_count - pos
+        if remaining <= max_frames:
+            chunks.append(audio[:, pos * hop:])
             break
 
-        search_start = pos + CHUNK_LATENT_MIN
-        search_end = min(pos + CHUNK_LATENT_MAX, T)
-        window = energy[search_start:search_end]
+        search_start = pos + min_frames
+        search_end = min(pos + max_frames, frame_count)
+        window = rms[search_start:search_end]
         cut = search_start + window.argmin().item()
 
-        chunks.append(latent[pos:cut])
+        # If cutting here leaves a tail shorter than 20s, take it all
+        tail = frame_count - cut
+        if tail < min_frames:
+            chunks.append(audio[:, pos * hop:])
+            break
+
+        chunks.append(audio[:, pos * hop:cut * hop])
         pos = cut
 
     return chunks
@@ -2181,16 +2195,6 @@ def preprocess_audio(
 
         try:
             audio, _ = load_audio_stereo(str(af), TARGET_SR, max_duration)
-            audio = audio.unsqueeze(0).to(device=device, dtype=vae.dtype)
-
-            with torch.no_grad():
-                target_latents = tiled_vae_encode(vae, audio, dtype)
-            del audio
-
-            if torch.isnan(target_latents).any() or torch.isinf(target_latents).any():
-                p1_failed += 1
-                del target_latents
-                continue
 
             # Auto-caption (once per file, shared across chunks)
             sidecar = _read_caption_sidecar(af)
@@ -2243,14 +2247,13 @@
             )
             if has_bad:
                 p1_failed += 1
-                del target_latents, text_hs, text_mask, lyric_hs, lyric_mask
+                del text_hs, text_mask, lyric_hs, lyric_mask
                 continue
 
-            # Chunk latents into ~30s segments for faster training
-            full_lat = target_latents.squeeze(0).cpu()  # [T, C]
-            T = full_lat.shape[0]
-            chunks = _chunk_latents(full_lat)
-            logger.info("[Chunk] %s: %d frames -> %d chunks", af.name, T, len(chunks))
+            # Split audio into ~30s chunks, VAE encode each independently
+            audio_chunks = _chunk_audio(audio)
+            logger.info("[Chunk] %s: %.1fs -> %d chunks", af.name,
+                        audio.shape[-1] / TARGET_SR, len(audio_chunks))
 
             text_hs_cpu = text_hs.cpu()
             text_mask_cpu = text_mask.cpu()
@@ -2263,11 +2266,21 @@
                 "caption": caption,
                 "lyrics": lyrics,
             }
+            del text_hs, text_mask, lyric_hs, lyric_mask
 
-            for ci, chunk_lat in enumerate(chunks):
+            for ci, chunk_audio in enumerate(audio_chunks):
+                chunk_in = chunk_audio.unsqueeze(0).to(device=device, dtype=vae.dtype)
+                with torch.no_grad():
+                    chunk_lat = tiled_vae_encode(vae, chunk_in, dtype)
+                del chunk_in
+
+                if torch.isnan(chunk_lat).any() or torch.isinf(chunk_lat).any():
+                    continue
+
+                chunk_lat = chunk_lat.squeeze(0).cpu()
                 chunk_len = chunk_lat.shape[0]
                 chunk_mask = torch.ones(chunk_len, dtype=dtype)
-                tag = f"{stem}_chunk{ci}" if len(chunks) > 1 else stem
+                tag = f"{stem}_chunk{ci}" if len(audio_chunks) > 1 else stem
                 tmp_path = out / f"{tag}.tmp.pt"
                 torch.save({
                     "target_latents": chunk_lat,
@@ -2281,11 +2294,12 @@
                     "metadata": meta,
                 }, tmp_path)
                 intermediates.append(tmp_path)
+                del chunk_lat
 
-            del target_latents, full_lat, text_hs, text_mask, lyric_hs, lyric_mask
+            del audio
 
             if progress_callback:
-                progress_callback(i + 1, total, f"[Pass 1] {af.name} ({len(chunks)} chunks)")
+                progress_callback(i + 1, total, f"[Pass 1] {af.name} ({len(audio_chunks)} chunks)")
 
         except Exception as exc:
             p1_failed += 1
 
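A minimal usage sketch for the new chunker, assuming train_engine imports cleanly in your environment so _chunk_audio and TARGET_SR can be pulled from it (the 3-minute stereo noise input is arbitrary):

    import torch
    from train_engine import _chunk_audio, TARGET_SR

    audio = torch.randn(2, 180 * TARGET_SR)  # [C, S]: 3 minutes of stereo noise
    chunks = _chunk_audio(audio)

    # Cuts land on 0.1s frame boundaries, so the pieces reassemble exactly.
    assert torch.equal(torch.cat(chunks, dim=-1), audio)
    # Cut points sit 20-40s apart and a <20s tail merges into the final
    # chunk, so every chunk ends up between 20s and just under 60s.
    secs = [c.shape[-1] / TARGET_SR for c in chunks]
    assert all(20.0 <= s < 60.0 for s in secs)
    print(len(chunks), [round(s, 1) for s in secs])

As a design note, the per-frame RMS loop could also be vectorized as mono[:frame_count * hop].reshape(frame_count, hop).pow(2).mean(dim=1).sqrt(), which avoids a Python loop on long files.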
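Two side effects of moving the VAE encode inside the chunk loop are visible in the diff: a NaN/Inf encode now skips only the offending chunk instead of failing the whole file, and only one ~30s chunk occupies the GPU at a time. A self-contained sketch of that pattern, with a toy pooling function standing in for tiled_vae_encode (sample rate and shapes here are illustrative, not the repo's):

    import torch
    import torch.nn.functional as F

    def encode_chunks(chunks, encode_fn, device="cpu"):
        latents = []
        for chunk in chunks:
            x = chunk.unsqueeze(0).to(device)  # [1, C, S], one chunk at a time
            with torch.no_grad():
                lat = encode_fn(x)
            del x
            if torch.isnan(lat).any() or torch.isinf(lat).any():
                continue  # drop this chunk only, keep the rest of the file
            latents.append(lat.squeeze(0).cpu())  # latents stay on CPU
        return latents

    toy_encode = lambda x: F.avg_pool1d(x, kernel_size=512)  # stand-in "VAE"
    lats = encode_chunks([torch.randn(2, 44100 * 30)], toy_encode)
    print([t.shape for t in lats])  # [torch.Size([2, 2583])]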