fix all review issues: dedup sampling/unwrap, thread-safe lock, cleanup, retry, security docs
- app.py +63 -22
- train_engine.py +42 -72
app.py
CHANGED
@@ -13,12 +13,14 @@ import string
 import random
 import requests
 import logging
+import threading
 
 from train_engine import (
     preprocess_audio,
     train_lora_generator,
     cancel_training,
     get_trained_loras as _get_trained_loras_engine,
+    MAX_TRAINING_TIME,
 )
 
 logger = logging.getLogger(__name__)
|
@@ -28,7 +30,7 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 
 MAX_TOTAL_AUDIO = 1800  # seconds total across all uploaded files (30 min)
-MAX_TRAINING_TIME
+# MAX_TRAINING_TIME is imported from train_engine (single source of truth)
 MAX_AUDIO_FILES = 50  # max number of training audio files per run
 
 # ---------------------------------------------------------------------------
|
@@ -39,6 +41,21 @@ ACE_SERVER = os.environ.get("ACE_SERVER", "http://127.0.0.1:8085")
 OUTPUT_DIR = os.environ.get("ACE_OUTPUT_DIR", "/app/outputs")
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+# Clean up old inference temp files (older than 1 hour) at startup
+_CLEANUP_MAX_AGE = 3600  # seconds
+try:
+    _now = time.time()
+    for _fname in os.listdir(OUTPUT_DIR):
+        if _fname.lower().endswith((".wav", ".mp3")):
+            _fpath = os.path.join(OUTPUT_DIR, _fname)
+            try:
+                if os.path.isfile(_fpath) and (_now - os.path.getmtime(_fpath)) > _CLEANUP_MAX_AGE:
+                    os.remove(_fpath)
+            except OSError:
+                pass
+except Exception:
+    pass
+
 ACE_CHECKPOINT_DIR = os.environ.get("ACE_CHECKPOINT_DIR", "/app/checkpoints")
 ACE_SOURCE_DIR = "/app/ace-step-source"
 ACE_HF_MODEL = "ACE-Step/Ace-Step1.5"
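A quick standalone check of the age cutoff above (a sketch, not part of this commit): backdate a file's mtime with os.utime and confirm the same predicate flags it as stale.

    import os
    import tempfile
    import time

    _CLEANUP_MAX_AGE = 3600  # same cutoff as the startup sweep above

    with tempfile.TemporaryDirectory() as d:
        stale = os.path.join(d, "old.wav")
        open(stale, "w").close()
        # Pretend the file was written two hours ago
        two_hours_ago = time.time() - 7200
        os.utime(stale, (two_hours_ago, two_hours_ago))
        assert (time.time() - os.path.getmtime(stale)) > _CLEANUP_MAX_AGE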
|
@@ -49,7 +66,7 @@ ACE_SERVER_BIN = "/app/ace-server"
 
 # Detect if running on HF Space (ace-server available) vs locally (PyTorch only)
 _is_space = os.path.isfile(ACE_SERVER_BIN) or os.environ.get("SPACE_ID") is not None
-_training_in_progress = False
+_training_lock = threading.Lock()
 
 # HF repo for on-demand GGUF downloads
 GGUF_HF_REPO = "Serveurperso/ACE-Step-1.5-GGUF"
|
@@ -329,25 +346,47 @@ def _stop_ace_server():
     time.sleep(1)
 
 
-def _start_ace_server():
-    """Start ace-server in background and wait for health.
+def _start_ace_server(max_retries: int = 3, retry_delay: float = 5.0):
+    """Start ace-server in background and wait for health.
+
+    Retries up to max_retries times with retry_delay seconds between attempts.
+    """
     global _ace_proc
+    for attempt in range(1, max_retries + 1):
+        logger.info(
+            "[ace-server] Starting (attempt %d/%d) with --adapters %s",
+            attempt, max_retries, ADAPTER_DIR,
-        "--models", MODELS_DIR, "--adapters", ADAPTER_DIR, "--max-batch", "1"],
         )
+        try:
+            _ace_proc = subprocess.Popen(
+                [ACE_SERVER_BIN, "--host", "127.0.0.1", "--port", "8085",
+                 "--models", MODELS_DIR, "--adapters", ADAPTER_DIR, "--max-batch", "1"],
+            )
+        except Exception as exc:
+            logger.error("[ace-server] Failed to start: %s", exc)
+            if attempt < max_retries:
+                time.sleep(retry_delay)
+                continue
+            return False
 
+        for _ in range(30):
+            if _server_ok():
+                logger.info("[ace-server] Healthy")
+                return True
+            time.sleep(2)
+
+        logger.warning("[ace-server] Health check timeout on attempt %d/%d", attempt, max_retries)
+        # Kill the failed process before retrying
+        if _ace_proc and _ace_proc.poll() is None:
+            _ace_proc.kill()
+            try:
+                _ace_proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                pass
+        if attempt < max_retries:
+            time.sleep(retry_delay)
+
+    logger.error("[ace-server] Failed to start after %d attempts", max_retries)
     return False
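The `_server_ok()` probe polled by the retry loop is defined outside these hunks. It is presumably a cheap HTTP check against the server started above; a hypothetical sketch (the endpoint path and timeout are assumptions, not taken from the source):

    import requests

    ACE_SERVER = "http://127.0.0.1:8085"  # matches the --host/--port passed to Popen

    def _server_ok() -> bool:
        # Hypothetical health probe; the real endpoint may differ
        try:
            return requests.get(f"{ACE_SERVER}/health", timeout=2).status_code == 200
        except requests.RequestException:
            return False

With the 30-iteration, 2-second poll above, each start attempt waits up to about a minute before being counted as a timeout, so the default settings allow roughly three minutes before _start_ace_server gives up.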
@@ -449,8 +488,9 @@ def gradio_main():
     def generate_music(caption, lyrics, instrumental, bpm, duration, seed,
                        steps, lora_select, lm_model_select,
                        progress=gr.Progress(track_tqdm=True)):
-        if _training_in_progress:
+        if not _training_lock.acquire(blocking=False):
             return None, "Training in progress. Inference unavailable until training completes. Press Cancel to stop training."
+        _training_lock.release()
         if not _server_ok():
             return None, "ace-server not running. Check logs."
 
@@ -631,8 +671,7 @@
         yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
 
         # Stop ace-server before training (frees memory)
-        global _training_in_progress
-        _training_in_progress = True
+        _training_lock.acquire()
         _log("[INFO] Stopping ace-server for training...")
         yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
         _stop_ace_server()
@@ -720,7 +759,7 @@
             yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
 
         finally:
-            _training_in_progress = False
+            _training_lock.release()
             # Always restart ace-server
             _log("[INFO] Restarting ace-server...")
             yield _log_text(), gr.Button(visible=False), gr.Button(visible=True), gr.File()
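Across the three hunks above, the lock replaces the old boolean flag with a probe-then-release pattern: inference only needs to know whether training currently holds the lock, so it acquires non-blocking and releases immediately, while the training handler holds the lock for its whole run and releases it in finally. A condensed sketch of the pattern (hypothetical function names):

    import threading

    _training_lock = threading.Lock()

    def infer():
        # Probe: fail fast if training holds the lock, otherwise release right away
        if not _training_lock.acquire(blocking=False):
            return "busy: training in progress"
        _training_lock.release()
        return "inference result"

    def train():
        _training_lock.acquire()  # held for the entire training run
        try:
            ...  # training work
        finally:
            _training_lock.release()  # always unblocks inference, even on errors

Note the probe is advisory: training could acquire the lock between the release and the actual inference work, so this pattern gates the user-facing error message rather than guaranteeing mutual exclusion.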
@@ -746,6 +785,8 @@
             yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File(value=tmp_out.name, visible=True)
         else:
             yield _log_text(), gr.Button(visible=True), gr.Button(visible=False), gr.File()
+        # Clean up training workspace (preprocessed tensors, temp audio, etc.)
+        shutil.rmtree(work_dir, ignore_errors=True)
 
     # -- Cancel handler --
     def _on_cancel():
train_engine.py
CHANGED
@@ -374,28 +374,6 @@ def enable_gradient_checkpointing(decoder: nn.Module) -> bool:
     return enabled
 
 
-def force_disable_cache(decoder: nn.Module) -> None:
-    stack = [decoder]
-    visited = set()
-    while stack:
-        mod = stack.pop()
-        if not isinstance(mod, nn.Module):
-            continue
-        mid = id(mod)
-        if mid in visited:
-            continue
-        visited.add(mid)
-        cfg = getattr(mod, "config", None)
-        if cfg is not None and hasattr(cfg, "use_cache"):
-            try:
-                cfg.use_cache = False
-            except Exception:
-                pass
-        for a in ("_forward_module", "_orig_mod", "base_model", "model", "module"):
-            child = getattr(mod, a, None)
-            if isinstance(child, nn.Module):
-                stack.append(child)
-
 
 # ============================================================================
 # LORA INJECTION (PEFT only -- no DoRA/LoKR/LoHA/OFT)
@@ -464,9 +442,7 @@ def inject_lora(model, lora_cfg: LoRAConfig) -> Tuple[Any, Dict[str, Any]]:
 
 def save_lora_adapter(model, output_dir: str) -> None:
     os.makedirs(output_dir, exist_ok=True)
-    decoder = model
-    while hasattr(decoder, "_forward_module"):
-        decoder = decoder._forward_module
+    decoder = _unwrap_decoder(model)
 
     if hasattr(decoder, "save_pretrained"):
         decoder.save_pretrained(output_dir)
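_unwrap_decoder itself is defined outside the visible hunks. Judging from the three unwrap loops this commit deletes (here, in the resume path, and in generate_audio), it presumably folds those steps into a single helper, along these lines (a sketch reconstructed from the removed code, not the actual definition):

    from torch import nn

    def _unwrap_decoder(model):
        """Peel Lightning/compile/PEFT wrappers off the decoder (sketch)."""
        decoder = getattr(model, "decoder", model)
        while hasattr(decoder, "_forward_module"):
            decoder = decoder._forward_module
        if hasattr(decoder, "base_model"):
            bm = decoder.base_model
            decoder = bm.model if hasattr(bm, "model") else bm
        if hasattr(decoder, "model") and isinstance(decoder.model, nn.Module):
            decoder = decoder.model
        return decoder

The getattr fallback covers both call shapes seen in the diff (save_lora_adapter unwraps starting from the model, generate_audio from its decoder); the real helper may resolve this differently.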
@@ -602,6 +578,10 @@ def load_model_for_training(
     for idx, attn in enumerate(candidates):
         try:
             load_kwargs = dict(
+                # SECURITY: trust_remote_code=True is required because the
+                # ACE-Step model config references custom Python code in its
+                # checkpoint (config.json -> auto_map). Only load checkpoints
+                # from trusted sources (the official ACE-Step HF repo).
                 trust_remote_code=True,
                 attn_implementation=attn,
                 torch_dtype=dtype,
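A common hardening step on top of this comment, not part of this commit, is to pin the checkpoint to a specific revision so that later pushes to the repo cannot swap in different remote code; a hypothetical sketch with transformers:

    from transformers import AutoModel

    # Pinning a commit hash means trust_remote_code can only execute code
    # from that exact, previously audited snapshot of the repo.
    model = AutoModel.from_pretrained(
        "ACE-Step/Ace-Step1.5",
        trust_remote_code=True,
        revision="<commit-sha>",  # placeholder: substitute an audited commit
    )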
@@ -2443,8 +2423,8 @@ def train_lora_generator(
 
     yield f"[OK] LoRA injected: {info['trainable_params']:,} trainable params"
 
-    # Gradient checkpointing + cache disable
-    force_disable_cache(model.decoder)
+    # Gradient checkpointing + cache disable (enable_gradient_checkpointing
+    # also walks the module tree and sets use_cache=False on any config it finds)
     ckpt_ok = enable_gradient_checkpointing(model.decoder)
     force_input_grads = ckpt_ok
     if ckpt_ok:
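The updated body of enable_gradient_checkpointing is not shown in this diff; the new comment implies it absorbed the traversal from the deleted force_disable_cache. A speculative sketch of the merged helper, reusing the removed traversal:

    from torch import nn

    def enable_gradient_checkpointing(decoder: nn.Module) -> bool:
        """Sketch: enable checkpointing, then disable use_cache on all wrapped configs."""
        enabled = False
        if hasattr(decoder, "gradient_checkpointing_enable"):
            decoder.gradient_checkpointing_enable()
            enabled = True
        stack, visited = [decoder], set()
        while stack:
            mod = stack.pop()
            if not isinstance(mod, nn.Module) or id(mod) in visited:
                continue
            visited.add(id(mod))
            cfg = getattr(mod, "config", None)
            if cfg is not None and hasattr(cfg, "use_cache"):
                cfg.use_cache = False  # KV caching is incompatible with checkpointed training
            for a in ("_forward_module", "_orig_mod", "base_model", "model", "module"):
                child = getattr(mod, a, None)
                if isinstance(child, nn.Module):
                    stack.append(child)
        return enabled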
@@ -2511,9 +2491,7 @@
     if aw.exists():
         from safetensors.torch import load_file
         state = load_file(str(aw))
-        decoder = model
-        while hasattr(decoder, "_forward_module"):
-            decoder = decoder._forward_module
+        decoder = _unwrap_decoder(model)
         decoder.load_state_dict(state, strict=False)
 
     # Load training state
@@ -3017,15 +2995,7 @@ def generate_audio(
     logger.info("Loading LoRA adapter from %s (scale=%.2f)...", adapter_path, adapter_scale)
     from peft import PeftModel
 
-    decoder = model.decoder
-    # Unwrap any wrappers
-    while hasattr(decoder, "_forward_module"):
-        decoder = decoder._forward_module
-    if hasattr(decoder, "base_model"):
-        bm = decoder.base_model
-        decoder = bm.model if hasattr(bm, "model") else bm
-    if hasattr(decoder, "model") and isinstance(decoder.model, nn.Module):
-        decoder = decoder.model
+    decoder = _unwrap_decoder(model)
 
     model.decoder = PeftModel.from_pretrained(
         decoder, adapter_path, is_trainable=False,
@@ -3174,6 +3144,37 @@ _LM_UNDERSTAND_INSTRUCTION = (
 )
 
 
+def _sample_next_token(
+    logits: torch.Tensor, temperature: float, top_k: int, top_p: float,
+) -> int:
+    """Sample a single token from logits with temperature, top-k, and top-p.
+
+    Args:
+        logits: 1-D logits tensor (vocab_size,).
+        temperature: Sampling temperature (<=0 for argmax).
+        top_k: Top-K filtering (0 = disabled).
+        top_p: Nucleus sampling cutoff (0 or >=1 = disabled).
+
+    Returns:
+        Selected token ID as int.
+    """
+    if temperature <= 0:
+        return int(logits.argmax().item())
+    scaled = logits.clone() / temperature
+    if top_k > 0:
+        topk_vals, _ = torch.topk(scaled, min(top_k, scaled.shape[0]))
+        scaled[scaled < topk_vals[-1]] = float("-inf")
+    if top_p > 0 and top_p < 1.0:
+        sorted_logits, sorted_idx = torch.sort(scaled, descending=True)
+        probs = torch.softmax(sorted_logits, dim=-1)
+        cumsum = torch.cumsum(probs, dim=-1)
+        nucleus_mask = cumsum - probs > top_p
+        sorted_logits[nucleus_mask] = float("-inf")
+        scaled = torch.zeros_like(scaled).scatter(0, sorted_idx, sorted_logits)
+    probs = torch.softmax(scaled, dim=-1)
+    return int(torch.multinomial(probs, 1).item())
+
+
 def _build_understand_prompt(
     bpe_tokenizer, codes: List[int],
 ) -> List[int]:
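Both sampling call sites below now reduce to a single line. For reference, a minimal standalone exercise of the helper on toy logits (assumes the function above is importable):

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5, -1.0])

    # temperature <= 0 short-circuits to greedy argmax
    assert _sample_next_token(logits, temperature=0.0, top_k=0, top_p=0.0) == 0

    # Stochastic path: top_k=2 restricts sampling to the two largest logits
    torch.manual_seed(0)
    tok = _sample_next_token(logits, temperature=1.0, top_k=2, top_p=0.9)
    assert tok in (0, 1)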
@@ -3357,22 +3358,7 @@ def _generate_codes_with_lm(
         logits[0] = logits[0] + mask
 
         # Sample
-        if temperature <= 0:
-            next_id = int(logits[0].argmax().item())
-        else:
-            scaled = logits[0].clone() / temperature
-            if top_k > 0:
-                topk_vals, _ = torch.topk(scaled, min(top_k, scaled.shape[0]))
-                scaled[scaled < topk_vals[-1]] = float("-inf")
-            if top_p > 0 and top_p < 1.0:
-                sorted_logits, sorted_idx = torch.sort(scaled, descending=True)
-                probs = torch.softmax(sorted_logits, dim=-1)
-                cumsum = torch.cumsum(probs, dim=-1)
-                nucleus_mask = cumsum - probs > top_p
-                sorted_logits[nucleus_mask] = float("-inf")
-                scaled = torch.zeros_like(scaled).scatter(0, sorted_idx, sorted_logits)
-            probs = torch.softmax(scaled, dim=-1)
-            next_id = int(torch.multinomial(probs, 1).item())
+        next_id = _sample_next_token(logits[0], temperature, top_k, top_p)
 
         # Stop on im_end
         if next_id == _TOKEN_IM_END:
@@ -3701,23 +3687,7 @@ def understand_audio(
         logits[0, _AUDIO_CODE_BASE:] = float("-inf")
 
         # Sample
-        if temperature <= 0:
-            next_id = int(logits[0].argmax().item())
-        else:
-            scaled = logits[0].clone() / temperature
-            if top_k > 0:
-                topk_vals, _ = torch.topk(scaled, min(top_k, scaled.shape[0]))
-                scaled[scaled < topk_vals[-1]] = float("-inf")
-            if top_p > 0 and top_p < 1.0:
-                sorted_logits, sorted_idx = torch.sort(scaled, descending=True)
-                probs = torch.softmax(sorted_logits, dim=-1)
-                cumsum = torch.cumsum(probs, dim=-1)
-                mask = cumsum - probs > top_p
-                sorted_logits[mask] = float("-inf")
-                # Scatter masked values back to original positions
-                scaled = torch.zeros_like(scaled).scatter(0, sorted_idx, sorted_logits)
-            probs = torch.softmax(scaled, dim=-1)
-            next_id = int(torch.multinomial(probs, 1).item())
+        next_id = _sample_next_token(logits[0], temperature, top_k, top_p)
 
         if next_id == _TOKEN_IM_END:
             break