Spaces:

WeReCooking
/

ACE-Step-CPU

Running

App Files Community

Nekochu committed 6 days ago

Commit

d3618ec

1 Parent(s): 1ee8f1f

random 60s crop at training time (matches Side-Step chunk-duration), remove pre-split chunking

Browse files

Files changed (2) hide show

app.py +4 -12
train_engine.py +47 -47

app.py CHANGED Viewed

@@ -749,7 +749,7 @@ def gradio_main():
             processed = result.get("processed", 0)
             failed = result.get("failed", 0)
             total = result.get("total", 0)
-            _log(f"[OK] Preprocessed: {total} files -> {processed} training samples (failed: {failed})")
             yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
             if processed == 0:
@@ -757,18 +757,9 @@ def gradio_main():
                 yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
                 return
-            # Auto-scale epochs: chunking multiplies samples, reduce epochs
-            # to keep total gradient updates ~constant
-            effective_epochs = epochs
-            if processed > total and total > 0:
-                scale = total / processed
-                effective_epochs = max(10, int(epochs * scale))
-                _log(f"[INFO] Auto-scaled epochs: {epochs} -> {effective_epochs} "
-                     f"(chunking: {total} files -> {processed} samples)")
             _gc.collect()
-            # -- Phase 2: Training --
             _log("[Step 2/2] Training LoRA...")
             yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
@@ -776,7 +767,7 @@ def gradio_main():
                 dataset_dir=preprocessed_dir,
                 output_dir=adapter_out,
                 checkpoint_dir=ACE_CHECKPOINT_DIR,
-                epochs=effective_epochs,
                 lr=lr,
                 rank=rank,
                 alpha=rank * 2,
@@ -790,6 +781,7 @@ def gradio_main():
                 seed=42,
                 variant="turbo",
                 device="cpu",
                 log_every=5,
             ):
                 elapsed = time.time() - train_start

             processed = result.get("processed", 0)
             failed = result.get("failed", 0)
             total = result.get("total", 0)
+            _log(f"[OK] Preprocessed: {processed}/{total} files (failed: {failed})")
             yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
             if processed == 0:
                 yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
                 return
             _gc.collect()
+            # -- Phase 2: Training (random 60s crops for speed + augmentation) --
             _log("[Step 2/2] Training LoRA...")
             yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
                 dataset_dir=preprocessed_dir,
                 output_dir=adapter_out,
                 checkpoint_dir=ACE_CHECKPOINT_DIR,
+                epochs=epochs,
                 lr=lr,
                 rank=rank,
                 alpha=rank * 2,
                 seed=42,
                 variant="turbo",
                 device="cpu",
+                chunk_duration=60,
                 log_every=5,
             ):
                 elapsed = time.time() - train_start

train_engine.py CHANGED Viewed

@@ -2250,56 +2250,45 @@ def preprocess_audio(
                     del text_hs, text_mask, lyric_hs, lyric_mask
                     continue
-                # Split audio into ~30s chunks, VAE encode each independently
-                audio_chunks = _chunk_audio(audio)
-                logger.info("[Chunk] %s: %.1fs -> %d chunks", af.name,
-                            audio.shape[-1] / TARGET_SR, len(audio_chunks))
-                text_hs_cpu = text_hs.cpu()
-                text_mask_cpu = text_mask.cpu()
-                lyric_hs_cpu = lyric_hs.cpu()
-                lyric_mask_cpu = lyric_mask.cpu()
-                silence_cpu = silence_lat.cpu()
-                meta = {
-                    "audio_path": str(af),
-                    "filename": af.name,
-                    "caption": caption,
-                    "lyrics": lyrics,
-                }
-                del text_hs, text_mask, lyric_hs, lyric_mask
-                for ci, chunk_audio in enumerate(audio_chunks):
-                    chunk_in = chunk_audio.unsqueeze(0).to(device=device, dtype=vae.dtype)
-                    with torch.no_grad():
-                        chunk_lat = tiled_vae_encode(vae, chunk_in, dtype)
-                    del chunk_in
-                    if torch.isnan(chunk_lat).any() or torch.isinf(chunk_lat).any():
-                        continue
-                    chunk_lat = chunk_lat.squeeze(0).cpu()
-                    chunk_len = chunk_lat.shape[0]
-                    chunk_mask = torch.ones(chunk_len, dtype=dtype)
-                    tag = f"{stem}_chunk{ci}" if len(audio_chunks) > 1 else stem
-                    tmp_path = out / f"{tag}.tmp.pt"
-                    torch.save({
-                        "target_latents": chunk_lat,
-                        "attention_mask": chunk_mask,
-                        "text_hidden_states": text_hs_cpu,
-                        "text_attention_mask": text_mask_cpu,
-                        "lyric_hidden_states": lyric_hs_cpu,
-                        "lyric_attention_mask": lyric_mask_cpu,
-                        "silence_latent": silence_cpu,
-                        "latent_length": chunk_len,
-                        "metadata": meta,
-                    }, tmp_path)
-                    intermediates.append(tmp_path)
-                    del chunk_lat
-                del audio
                 if progress_callback:
-                    progress_callback(i + 1, total, f"[Pass 1] {af.name} ({len(audio_chunks)} chunks)")
             except Exception as exc:
                 p1_failed += 1
@@ -2419,6 +2408,7 @@ def train_lora_generator(
     target_modules: Optional[List[str]] = None,
     log_every: int = 10,
     resume_from: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """Run LoRA training, yielding progress strings each epoch.
@@ -2634,6 +2624,16 @@ def train_lora_generator(
             enc_mask = batch["encoder_attention_mask"].to(device, dtype=dtype, non_blocking=nb)
             ctx = batch["context_latents"].to(device, dtype=dtype, non_blocking=nb)
             bsz = tgt.shape[0]
             # CFG dropout

                     del text_hs, text_mask, lyric_hs, lyric_mask
                     continue
+                # VAE encode full audio (tiled for memory, output is full-length)
+                audio_in = audio.unsqueeze(0).to(device=device, dtype=vae.dtype)
+                with torch.no_grad():
+                    target_latents = tiled_vae_encode(vae, audio_in, dtype)
+                del audio_in, audio
+                if torch.isnan(target_latents).any() or torch.isinf(target_latents).any():
+                    p1_failed += 1
+                    del target_latents, text_hs, text_mask, lyric_hs, lyric_mask
+                    continue
+                lat = target_latents.squeeze(0).cpu()
+                lat_len = lat.shape[0]
+                att_mask = torch.ones(lat_len, dtype=dtype)
+                tmp_path = out / f"{stem}.tmp.pt"
+                torch.save({
+                    "target_latents": lat,
+                    "attention_mask": att_mask,
+                    "text_hidden_states": text_hs.cpu(),
+                    "text_attention_mask": text_mask.cpu(),
+                    "lyric_hidden_states": lyric_hs.cpu(),
+                    "lyric_attention_mask": lyric_mask.cpu(),
+                    "silence_latent": silence_lat.cpu(),
+                    "latent_length": lat_len,
+                    "metadata": {
+                        "audio_path": str(af),
+                        "filename": af.name,
+                        "caption": caption,
+                        "lyrics": lyrics,
+                    },
+                }, tmp_path)
+                intermediates.append(tmp_path)
+                del target_latents, lat, text_hs, text_mask, lyric_hs, lyric_mask
+                logger.info("[OK] %s: %d latent frames (%.1fs)", af.name, lat_len, lat_len / LATENT_HZ)
                 if progress_callback:
+                    progress_callback(i + 1, total, f"[Pass 1] {af.name}")
             except Exception as exc:
                 p1_failed += 1
     target_modules: Optional[List[str]] = None,
     log_every: int = 10,
     resume_from: Optional[str] = None,
+    chunk_duration: float = 0,
 ) -> Generator[str, None, None]:
     """Run LoRA training, yielding progress strings each epoch.
             enc_mask = batch["encoder_attention_mask"].to(device, dtype=dtype, non_blocking=nb)
             ctx = batch["context_latents"].to(device, dtype=dtype, non_blocking=nb)
+            # Random crop to chunk_duration (data augmentation + speed)
+            if chunk_duration > 0:
+                max_len = int(chunk_duration * LATENT_HZ)
+                T = tgt.shape[1]
+                if T > max_len:
+                    start = random.randint(0, T - max_len)
+                    tgt = tgt[:, start:start + max_len, :]
+                    att = att[:, start:start + max_len]
+                    ctx = ctx[:, start:start + max_len, :]
             bsz = tgt.shape[0]
             # CFG dropout