Aloukik21 committed on
Commit
b31917b
·
verified ·
1 Parent(s): 1f14b6a

Update handler to use Aloukik21/trainer cache

Browse files
Files changed (1) hide show
  1. rp_handler.py +98 -45
rp_handler.py CHANGED
@@ -72,24 +72,36 @@ MODEL_PRESETS = {
72
  "flux_schnell": "train_lora_flux_schnell_24gb.yaml",
73
  }
74
 
75
- # HuggingFace repos used by each model (for pre-warming)
76
- MODEL_HF_REPOS = {
77
- "wan21_1b": ["Wan-AI/Wan2.1-T2V-1.3B-Diffusers"],
78
- "wan21_14b": ["Wan-AI/Wan2.1-T2V-14B-Diffusers"],
79
- "wan22_14b": ["ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16"],
80
- "qwen_image": ["Qwen/Qwen-Image"],
81
- "qwen_image_edit": ["Qwen/Qwen-Image-Edit"],
82
- "qwen_image_edit_2509": ["Qwen/Qwen-Image-Edit"],
83
- "flux_dev": ["black-forest-labs/FLUX.1-dev"],
84
- "flux_schnell": ["black-forest-labs/FLUX.1-schnell"],
 
 
 
85
  }
86
 
87
- # Accuracy Recovery Adapters (smaller files, can be pre-downloaded)
88
- ARA_FILES = {
89
- "wan22_14b": "ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors",
90
- "qwen_image": "ostris/accuracy_recovery_adapters/qwen_image_torchao_uint3.safetensors",
 
 
 
 
 
 
91
  }
92
 
 
 
 
93
 
94
  # =============================================================================
95
  # Cleanup Functions
@@ -196,52 +208,95 @@ def get_environment_info():
196
  }
197
 
198
 
199
- def find_cached_model(hf_repo: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  """
201
- Find cached model path on RunPod.
202
 
203
  Args:
204
- hf_repo: HuggingFace repo ID (e.g., 'black-forest-labs/FLUX.1-dev')
205
 
206
  Returns:
207
- Path to cached model, or original repo ID if not cached
208
  """
209
  if not IS_RUNPOD_CACHE:
210
- return hf_repo
211
 
212
- # Convert "Org/Repo" -> "models--Org--Repo"
213
- cache_name = hf_repo.replace("/", "--")
214
  snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"
215
 
216
  if snapshots_dir.exists():
217
  snapshots = list(snapshots_dir.iterdir())
218
  if snapshots:
219
- cached_path = str(snapshots[0])
220
- logger.info(f"Using cached model: {hf_repo} -> {cached_path}")
221
- return cached_path
 
222
 
223
- logger.info(f"Model not cached, will download: {hf_repo}")
224
- return hf_repo
225
 
226
 
227
  def check_model_cache_status(model_key: str) -> dict:
228
- """Check if model files are cached."""
229
- if model_key not in MODEL_HF_REPOS:
230
  return {"cached": False, "reason": "unknown model"}
231
 
232
- repos = MODEL_HF_REPOS[model_key]
233
- status = {"repos": {}}
 
 
 
234
 
235
- for repo in repos:
236
- cache_name = repo.replace("/", "--")
237
- snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"
238
 
239
- if snapshots_dir.exists() and list(snapshots_dir.iterdir()):
240
- status["repos"][repo] = "cached"
 
 
 
 
 
241
  else:
242
- status["repos"][repo] = "not cached"
 
 
 
243
 
244
- status["all_cached"] = all(s == "cached" for s in status["repos"].values())
245
  return status
246
 
247
 
@@ -306,14 +361,12 @@ def run_training(params):
306
  if "trigger_word" in params:
307
  process["trigger_word"] = params["trigger_word"]
308
 
309
- # Check if we should use cached model path
310
- if IS_RUNPOD_CACHE and "model" in process:
311
- original_path = process["model"].get("name_or_path", "")
312
- if original_path:
313
- cached_path = find_cached_model(original_path)
314
- if cached_path != original_path:
315
- process["model"]["name_or_path"] = cached_path
316
- logger.info(f"Using cached model path: {cached_path}")
317
 
318
  # Save config
319
  config_dir = os.path.join(AI_TOOLKIT_DIR, "config")
 
72
  "flux_schnell": "train_lora_flux_schnell_24gb.yaml",
73
  }
74
 
75
# All models cached in single HuggingFace repo for RunPod caching
CACHE_REPO = "Aloukik21/trainer"

# Map model keys to subfolder in cache repo.
# Values are directory names under a snapshot of CACHE_REPO.
MODEL_CACHE_PATHS = {
    # NOTE(review): wan21_1b points at the 14B cache folder, yet MODEL_HF_REPOS
    # below lists distinct 1.3B vs 14B repos -- confirm the shared-base claim.
    "wan21_1b": "wan21-14b",  # Uses same base, different config
    "wan21_14b": "wan21-14b",
    "wan22_14b": "wan22-14b",
    "qwen_image": "qwen-image",
    "qwen_image_edit": "qwen-image",  # Same base model
    "qwen_image_edit_2509": "qwen-image",
    "flux_dev": "flux-dev",
    "flux_schnell": "flux-schnell",
}

# Original HuggingFace repos (fallback if cache not available)
MODEL_HF_REPOS = {
    "wan21_1b": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
    "wan21_14b": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
    "wan22_14b": "ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16",
    "qwen_image": "Qwen/Qwen-Image",
    "qwen_image_edit": "Qwen/Qwen-Image-Edit",
    "qwen_image_edit_2509": "Qwen/Qwen-Image-Edit",
    "flux_dev": "black-forest-labs/FLUX.1-dev",
    "flux_schnell": "black-forest-labs/FLUX.1-schnell",
}

# Accuracy Recovery Adapters path in cache repo
# (subfolder name inside CACHE_REPO snapshots, joined onto the snapshot path)
ARA_CACHE_PATH = "accuracy_recovery_adapters"
104
+
105
 
106
  # =============================================================================
107
  # Cleanup Functions
 
208
  }
209
 
210
 
211
def find_cached_model(model_key: str) -> str:
    """
    Find cached model path on RunPod from Aloukik21/trainer repo.

    Args:
        model_key: Model key (e.g., 'flux_dev', 'wan22_14b')

    Returns:
        Path to the model subfolder inside the newest cached snapshot of
        CACHE_REPO, or the original HF repo id from MODEL_HF_REPOS
        (empty string for an unknown key) when the cache is unavailable.
    """
    if not IS_RUNPOD_CACHE:
        return MODEL_HF_REPOS.get(model_key, "")

    # HF hub cache layout: models--Org--Repo/snapshots/<revision>/
    cache_name = CACHE_REPO.replace("/", "--")
    snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"

    if snapshots_dir.exists():
        # iterdir() order is filesystem-dependent; when several snapshot
        # revisions exist, use the most recently modified one rather than
        # an arbitrary snapshots[0]. Non-directories are skipped.
        snapshots = sorted(
            (p for p in snapshots_dir.iterdir() if p.is_dir()),
            key=lambda p: p.stat().st_mtime,
        )
        if snapshots:
            # Get the subfolder for this model
            subfolder = MODEL_CACHE_PATHS.get(model_key)
            if subfolder:
                cached_path = snapshots[-1] / subfolder
                if cached_path.exists():
                    logger.info(f"Using cached model: {model_key} -> {cached_path}")
                    return str(cached_path)

    # Fallback to original repo
    original_repo = MODEL_HF_REPOS.get(model_key, "")
    logger.info(f"Model not in cache, using original: {original_repo}")
    return original_repo
243
+
244
+
245
def find_cached_ara(adapter_name: str) -> str:
    """
    Find cached accuracy recovery adapter.

    Args:
        adapter_name: Adapter filename (e.g., 'wan22_14b_t2i_torchao_uint4.safetensors')

    Returns:
        Path to the adapter inside the newest cached snapshot of CACHE_REPO,
        or the original 'ostris/accuracy_recovery_adapters/<name>' HF path
        when the cache is unavailable.
    """
    if not IS_RUNPOD_CACHE:
        return f"ostris/accuracy_recovery_adapters/{adapter_name}"

    # HF hub cache layout: models--Org--Repo/snapshots/<revision>/
    cache_name = CACHE_REPO.replace("/", "--")
    snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"

    if snapshots_dir.exists():
        # Pick the newest snapshot revision; iterdir() order is arbitrary.
        snapshots = sorted(
            (p for p in snapshots_dir.iterdir() if p.is_dir()),
            key=lambda p: p.stat().st_mtime,
        )
        if snapshots:
            cached_path = snapshots[-1] / ARA_CACHE_PATH / adapter_name
            if cached_path.exists():
                logger.info(f"Using cached ARA: {adapter_name} -> {cached_path}")
                return str(cached_path)

    return f"ostris/accuracy_recovery_adapters/{adapter_name}"
 
270
 
271
 
272
def check_model_cache_status(model_key: str) -> dict:
    """Check if model files are cached in Aloukik21/trainer.

    Args:
        model_key: Model key (e.g., 'flux_dev', 'wan22_14b')

    Returns:
        Dict with at least ``cached`` (bool). When the key is known it also
        carries ``model``, ``cache_repo``, ``subfolder``; when a snapshot was
        inspected it carries ``path`` (str or None); ``reason`` explains a
        negative result where one is known.
    """
    if model_key not in MODEL_CACHE_PATHS:
        return {"cached": False, "reason": "unknown model"}

    subfolder = MODEL_CACHE_PATHS[model_key]
    status = {
        "model": model_key,
        "cache_repo": CACHE_REPO,
        "subfolder": subfolder,
        "cached": False,  # default; flipped below if the snapshot has the model
    }

    # HF hub cache layout: models--Org--Repo/snapshots/<revision>/
    cache_name = CACHE_REPO.replace("/", "--")
    snapshots_dir = Path(RUNPOD_HF_CACHE) / f"models--{cache_name}" / "snapshots"

    if not snapshots_dir.exists():
        status["reason"] = "cache repo not found"
        return status

    # Prefer the newest snapshot revision; iterdir() order is arbitrary.
    snapshots = sorted(
        (p for p in snapshots_dir.iterdir() if p.is_dir()),
        key=lambda p: p.stat().st_mtime,
    )
    if not snapshots:
        status["reason"] = "no snapshots in cache repo"
        return status

    model_path = snapshots[-1] / subfolder
    exists = model_path.exists()  # stat once instead of twice
    status["cached"] = exists
    status["path"] = str(model_path) if exists else None
    return status
301
 
302
 
 
361
  if "trigger_word" in params:
362
  process["trigger_word"] = params["trigger_word"]
363
 
364
+ # Check if we should use cached model path from Aloukik21/trainer
365
+ if "model" in process:
366
+ cached_path = find_cached_model(model_key)
367
+ if cached_path:
368
+ process["model"]["name_or_path"] = cached_path
369
+ logger.info(f"Model path set to: {cached_path}")
 
 
370
 
371
  # Save config
372
  config_dir = os.path.join(AI_TOOLKIT_DIR, "config")