Spaces:

samwaugh
/

ArteFact

Sleeping

App Files Files Community

samwaugh committed on Sep 30

Commit

4750882

1 Parent(s): 07a1661

Try fix 2

Browse files

Files changed (1) hide show

backend/runner/config.py +92 -0

backend/runner/config.py CHANGED Viewed

@@ -261,6 +261,16 @@ def load_markdown_dataset(force_refresh: bool = False) -> Optional[Path]:
                     clear_markdown_cache()
                 else:
                     print(f"✅ Using cached markdown dataset at {works_dir}")
                     return works_dir
         # Use optimized download approach
@@ -332,6 +342,88 @@ def _download_markdown_optimized(works_dir: Path) -> Optional[Path]:
         traceback.print_exc()
         return None
 def _download_markdown_files_robust(works_dir: Path, work_dirs: set, files: list, progress: dict, progress_file: Path) -> None:
     """Download markdown files with retry logic and progress persistence"""
     import concurrent.futures

                     clear_markdown_cache()
                 else:
                     print(f"✅ Using cached markdown dataset at {works_dir}")
+                    # Even if markdown folders exist, images may be missing. Perform a
+                    # lightweight sampling check and, if needed, resume image downloads.
+                    try:
+                        if _images_likely_missing(works_dir):
+                            print("🖼️  Images appear to be missing or incomplete – resuming image download phase...")
+                            _download_images_phase_only(works_dir)
+                        else:
+                            print("🖼️  Images appear present for sampled works – skipping image download phase")
+                    except Exception as e:
+                        print(f"⚠️  Image presence check failed: {e}")
                     return works_dir
         # Use optimized download approach
         traceback.print_exc()
         return None
def _images_likely_missing(works_dir: Path, sample_size: int = 20, min_ratio: float = 0.2) -> bool:
    """Cheaply estimate whether the cached works are missing their images.

    Takes the first ``sample_size`` sub-directories of ``works_dir`` (in
    directory-listing order) and, for each, looks for at least one
    ``.jpg``/``.jpeg``/``.png`` file (extension compared case-insensitively)
    either in ``<work>/images/`` or directly inside ``<work>/``.

    Args:
        works_dir: Directory whose immediate sub-directories are the works.
        sample_size: Maximum number of work directories to inspect. Values
            below 1 are treated as 1 so the check always looks at something.
        min_ratio: Minimum fraction of sampled works that must contain an
            image for the cache to be considered populated.

    Returns:
        True if fewer than ``min_ratio`` of the sampled works have an image,
        if there are no work directories at all, or if the check itself
        fails; False otherwise.
    """
    from itertools import islice

    try:
        # Stop listing as soon as enough directories were seen instead of
        # stat-ing every entry of a potentially huge cache.
        candidates = (entry for entry in works_dir.iterdir() if entry.is_dir())
        sampled = list(islice(candidates, max(1, sample_size)))
        if not sampled:
            print("🖼️  Image check: no work directories found – treating as missing")
            return True

        has_images_count = sum(
            1
            for work_dir in sampled
            # Prefer the images/ sub-folder, then fall back to the work dir itself.
            if _dir_contains_image(work_dir / "images") or _dir_contains_image(work_dir)
        )
        ratio = has_images_count / len(sampled)
        print(f"🖼️  Image check: {has_images_count}/{len(sampled)} sampled works have images (ratio={ratio:.2f})")
        return ratio < min_ratio
    except Exception as e:
        # Deliberately best-effort: this check must never raise to its caller.
        print(f"⚠️  Image sampling check failed: {e}")
        # Be conservative – assume images are missing so we attempt to download them
        return True


def _dir_contains_image(directory: Path) -> bool:
    """Return True if ``directory`` exists and directly contains an image file."""
    image_suffixes = {".jpg", ".jpeg", ".png"}
    if not directory.is_dir():
        return False
    return any(
        entry.suffix.lower() in image_suffixes and entry.is_file()
        for entry in directory.iterdir()
    )
def _download_images_phase_only(works_dir: Path) -> Optional[Path]:
    """Run only the image download phase, leaving markdown files untouched.

    Loads any saved progress from ``download_progress.json`` located next to
    ``works_dir``, lists the files of the ``ARTEFACT_MARKDOWN_DATASET`` dataset
    repo, collects the work ids found under ``works/`` and passes everything
    to ``_download_images_robust``.

    Args:
        works_dir: Local directory holding the cached works.

    Returns:
        ``works_dir`` on success, or ``None`` if any step raised (the error
        and traceback are printed instead of being propagated).
    """
    try:
        from huggingface_hub import list_repo_files
        import json

        progress_file = works_dir.parent / "download_progress.json"

        # Start from empty progress; replaced below only if the saved file
        # loads completely.
        progress = {"markdown_completed": set(), "image_batches_completed": set()}
        if progress_file.exists():
            try:
                with open(progress_file, "r", encoding="utf-8") as f:
                    saved_progress = json.load(f)
                # Build the restored state in full before swapping it in, so a
                # malformed file cannot leave `progress` half-populated.
                restored = {
                    "markdown_completed": set(saved_progress.get("markdown_completed", [])),
                    "image_batches_completed": set(saved_progress.get("image_batches_completed", [])),
                }
                progress = restored
                print("📊 Resuming image download from previous progress...")
            except Exception as e:
                # Best-effort: an unreadable progress file just means starting fresh.
                print(f"⚠️  Could not load progress file: {e}")

        print("🔍 Discovering files in dataset (images phase only)...")
        files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")

        # Collect the second path component of every "works/..." entry that
        # starts with "W".
        # NOTE(review): the length check is always true after the "works/"
        # prefix test, so a file sitting directly under works/ also counts.
        # Left unchanged because this set presumably has to match the one the
        # saved batch progress was computed from – confirm before tightening.
        work_dirs = set()
        for file_path in files:
            if not file_path.startswith("works/"):
                continue
            parts = file_path.split("/")
            if len(parts) >= 2 and parts[1].startswith("W"):
                work_dirs.add(parts[1])

        print(f"📊 Images phase: {len(work_dirs)} work directories discovered")
        _download_images_robust(works_dir, work_dirs, files, progress, progress_file)
        return works_dir
    except Exception as e:
        print(f"❌ Images-phase-only download failed: {e}")
        import traceback
        traceback.print_exc()
        return None
 def _download_markdown_files_robust(works_dir: Path, work_dirs: set, files: list, progress: dict, progress_file: Path) -> None:
     """Download markdown files with retry logic and progress persistence"""
     import concurrent.futures