samwaugh committed on
Commit
4750882
·
1 Parent(s): 07a1661
Files changed (1) hide show
  1. backend/runner/config.py +92 -0
backend/runner/config.py CHANGED
@@ -261,6 +261,16 @@ def load_markdown_dataset(force_refresh: bool = False) -> Optional[Path]:
261
  clear_markdown_cache()
262
  else:
263
  print(f"โœ… Using cached markdown dataset at {works_dir}")
 
 
 
 
 
 
 
 
 
 
264
  return works_dir
265
 
266
  # Use optimized download approach
@@ -332,6 +342,88 @@ def _download_markdown_optimized(works_dir: Path) -> Optional[Path]:
332
  traceback.print_exc()
333
  return None
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  def _download_markdown_files_robust(works_dir: Path, work_dirs: set, files: list, progress: dict, progress_file: Path) -> None:
336
  """Download markdown files with retry logic and progress persistence"""
337
  import concurrent.futures
 
261
  clear_markdown_cache()
262
  else:
263
  print(f"โœ… Using cached markdown dataset at {works_dir}")
264
+ # Even if markdown folders exist, images may be missing. Perform a
265
+ # lightweight sampling check and, if needed, resume image downloads.
266
+ try:
267
+ if _images_likely_missing(works_dir):
268
+ print("๐Ÿ–ผ๏ธ Images appear to be missing or incomplete โ€“ resuming image download phase...")
269
+ _download_images_phase_only(works_dir)
270
+ else:
271
+ print("๐Ÿ–ผ๏ธ Images appear present for sampled works โ€“ skipping image download phase")
272
+ except Exception as e:
273
+ print(f"โš ๏ธ Image presence check failed: {e}")
274
  return works_dir
275
 
276
  # Use optimized download approach
 
342
  traceback.print_exc()
343
  return None
344
 
345
+ def _images_likely_missing(works_dir: Path, sample_size: int = 20) -> bool:
346
+ """Quickly assess whether images are present in the cache.
347
+
348
+ We sample up to `sample_size` work directories and check for any .jpg/.png
349
+ files either under <work>/images/ or directly inside <work>/.
350
+
351
+ Returns True if fewer than 20% of sampled works have at least one image.
352
+ """
353
+ try:
354
+ work_dirs = [d for d in works_dir.iterdir() if d.is_dir()]
355
+ if not work_dirs:
356
+ print("๐Ÿ–ผ๏ธ Image check: no work directories found โ€“ treating as missing")
357
+ return True
358
+
359
+ sampled = work_dirs[:sample_size]
360
+ has_images_count = 0
361
+ for work_dir in sampled:
362
+ images_dir = work_dir / "images"
363
+ found = False
364
+ if images_dir.exists():
365
+ if any(images_dir.glob("*.jpg")) or any(images_dir.glob("*.jpeg")) or any(images_dir.glob("*.png")):
366
+ found = True
367
+ # Fallback: look in the work dir directly
368
+ if not found:
369
+ if any(work_dir.glob("*.jpg")) or any(work_dir.glob("*.jpeg")) or any(work_dir.glob("*.png")):
370
+ found = True
371
+ if found:
372
+ has_images_count += 1
373
+
374
+ ratio = has_images_count / max(1, len(sampled))
375
+ print(f"๐Ÿ–ผ๏ธ Image check: {has_images_count}/{len(sampled)} sampled works have images (ratio={ratio:.2f})")
376
+ return ratio < 0.2
377
+ except Exception as e:
378
+ print(f"โš ๏ธ Image sampling check failed: {e}")
379
+ # Be conservative โ€“ assume images are missing so we attempt to download them
380
+ return True
381
+
382
def _load_image_phase_progress(progress_file: Path) -> dict:
    """Return saved download progress from ``progress_file``, or empty progress."""
    import json

    progress = {"markdown_completed": set(), "image_batches_completed": set()}
    if not progress_file.exists():
        return progress

    try:
        with open(progress_file, "r", encoding="utf-8") as f:
            saved_progress = json.load(f)
        # Build the complete result first so a malformed entry cannot leave
        # the progress dict half-populated.
        loaded = {key: set(saved_progress.get(key, [])) for key in progress}
    except Exception as e:
        # Best effort: an unreadable progress file means starting from scratch.
        print(f"⚠️ Could not load progress file: {e}")
        return progress

    print("📊 Resuming image download from previous progress...")
    return loaded


def _collect_work_ids(files: list) -> set:
    """Return the second path component of every ``works/W...`` repo path."""
    # A path that starts with "works/" always splits into at least two parts,
    # so indexing [1] is safe without a length check.
    return {
        file_path.split("/")[1]
        for file_path in files
        if file_path.startswith("works/") and file_path.split("/")[1].startswith("W")
    }


def _download_images_phase_only(works_dir: Path) -> Optional[Path]:
    """Resume/perform only the image download phase without touching markdown files.

    Lists the files of the ``ARTEFACT_MARKDOWN_DATASET`` dataset repo, derives
    the set of work ids from paths under ``works/``, loads any progress saved
    in ``<works_dir parent>/download_progress.json`` and hands everything to
    ``_download_images_robust``.

    Args:
        works_dir: Local directory holding the cached works.

    Returns:
        ``works_dir`` on success, or ``None`` if any step raised (the error
        and traceback are printed).
    """
    try:
        from huggingface_hub import list_repo_files

        progress_file = works_dir.parent / "download_progress.json"
        progress = _load_image_phase_progress(progress_file)

        print("🔍 Discovering files in dataset (images phase only)...")
        files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")

        work_dirs = _collect_work_ids(files)
        print(f"📊 Images phase: {len(work_dirs)} work directories discovered")

        _download_images_robust(works_dir, work_dirs, files, progress, progress_file)
        return works_dir
    except Exception as e:
        print(f"❌ Images-phase-only download failed: {e}")
        import traceback
        traceback.print_exc()
        return None
426
+
427
  def _download_markdown_files_robust(works_dir: Path, work_dirs: set, files: list, progress: dict, progress_file: Path) -> None:
428
  """Download markdown files with retry logic and progress persistence"""
429
  import concurrent.futures