Try fix 2
backend/runner/config.py  CHANGED  (+92 -0)

@@ -261,6 +261,16 @@ def load_markdown_dataset(force_refresh: bool = False) -> Optional[Path]:
             clear_markdown_cache()
         else:
             print(f"✅ Using cached markdown dataset at {works_dir}")
+            # Even if markdown folders exist, images may be missing. Perform a
+            # lightweight sampling check and, if needed, resume image downloads.
+            try:
+                if _images_likely_missing(works_dir):
+                    print("🖼️ Images appear to be missing or incomplete - resuming image download phase...")
+                    _download_images_phase_only(works_dir)
+                else:
+                    print("🖼️ Images appear present for sampled works - skipping image download phase")
+            except Exception as e:
+                print(f"⚠️ Image presence check failed: {e}")
             return works_dir
 
     # Use optimized download approach

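The guard above only triggers the image phase when the sampled check fails. As a rough illustration of that threshold behaviour, the sketch below builds a throwaway works/ layout and calls the new helper directly. It is illustrative only: the import path backend.runner.config, the stub file names, and the fake JPEG bytes are assumptions, not part of the diff.

    # Illustrative only: exercises the new check against a throwaway cache layout.
    # Assumes backend/ is importable (e.g. on PYTHONPATH) as backend.runner.config.
    import tempfile
    from pathlib import Path

    from backend.runner.config import _images_likely_missing  # assumed import path

    with tempfile.TemporaryDirectory() as tmp:
        works_dir = Path(tmp) / "works"

        # 10 work folders with markdown only -> no images anywhere.
        for i in range(10):
            work = works_dir / f"W{i:04d}"
            work.mkdir(parents=True)
            (work / "article.md").write_text("# stub")  # hypothetical file name

        print(_images_likely_missing(works_dir))   # True: 0/10 sampled works have images

        # Add an images/ folder with a .jpg to most works -> check passes.
        for i in range(8):
            images = works_dir / f"W{i:04d}" / "images"
            images.mkdir()
            (images / "fig1.jpg").write_bytes(b"\xff\xd8\xff")  # fake JPEG header

        print(_images_likely_missing(works_dir))   # False: 8/10 is well above the 20% threshold
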
@@ -332,6 +342,88 @@ def _download_markdown_optimized(works_dir: Path) -> Optional[Path]:
         traceback.print_exc()
         return None
 
+def _images_likely_missing(works_dir: Path, sample_size: int = 20) -> bool:
+    """Quickly assess whether images are present in the cache.
+
+    We sample up to `sample_size` work directories and check for any .jpg/.jpeg/.png
+    files either under <work>/images/ or directly inside <work>/.
+
+    Returns True if fewer than 20% of sampled works have at least one image.
+    """
+    try:
+        work_dirs = [d for d in works_dir.iterdir() if d.is_dir()]
+        if not work_dirs:
+            print("🖼️ Image check: no work directories found - treating as missing")
+            return True
+
+        sampled = work_dirs[:sample_size]
+        has_images_count = 0
+        for work_dir in sampled:
+            images_dir = work_dir / "images"
+            found = False
+            if images_dir.exists():
+                if any(images_dir.glob("*.jpg")) or any(images_dir.glob("*.jpeg")) or any(images_dir.glob("*.png")):
+                    found = True
+            # Fallback: look in the work dir directly
+            if not found:
+                if any(work_dir.glob("*.jpg")) or any(work_dir.glob("*.jpeg")) or any(work_dir.glob("*.png")):
+                    found = True
+            if found:
+                has_images_count += 1
+
+        ratio = has_images_count / max(1, len(sampled))
+        print(f"🖼️ Image check: {has_images_count}/{len(sampled)} sampled works have images (ratio={ratio:.2f})")
+        return ratio < 0.2
+    except Exception as e:
+        print(f"⚠️ Image sampling check failed: {e}")
+        # Be conservative - assume images are missing so we attempt to download them
+        return True
+
+def _download_images_phase_only(works_dir: Path) -> Optional[Path]:
+    """Resume or perform only the image download phase without touching markdown files.
+
+    Discovers files in the HF dataset repo, constructs the set of work IDs,
+    loads any existing download progress, and runs the robust image downloader.
+    """
+    try:
+        from huggingface_hub import list_repo_files
+        import json
+
+        progress_file = works_dir.parent / "download_progress.json"
+
+        # Load existing progress if available
+        progress = {"markdown_completed": set(), "image_batches_completed": set()}
+        if progress_file.exists():
+            try:
+                with open(progress_file, 'r') as f:
+                    saved_progress = json.load(f)
+                progress["markdown_completed"] = set(saved_progress.get("markdown_completed", []))
+                progress["image_batches_completed"] = set(saved_progress.get("image_batches_completed", []))
+                print("🔄 Resuming image download from previous progress...")
+            except Exception as e:
+                print(f"⚠️ Could not load progress file: {e}")
+
+        print("🔍 Discovering files in dataset (images phase only)...")
+        files = list_repo_files(repo_id=ARTEFACT_MARKDOWN_DATASET, repo_type="dataset")
+
+        work_dirs = set()
+        for file_path in files:
+            if file_path.startswith("works/"):
+                parts = file_path.split("/")
+                if len(parts) >= 2:
+                    work_id = parts[1]
+                    if work_id.startswith("W"):
+                        work_dirs.add(work_id)
+
+        print(f"📁 Images phase: {len(work_dirs)} work directories discovered")
+        _download_images_robust(works_dir, work_dirs, files, progress, progress_file)
+        return works_dir
+    except Exception as e:
+        print(f"❌ Images-phase-only download failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
 def _download_markdown_files_robust(works_dir: Path, work_dirs: set, files: list, progress: dict, progress_file: Path) -> None:
     """Download markdown files with retry logic and progress persistence"""
     import concurrent.futures
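For reference, the resume logic in _download_images_phase_only keys off a small JSON progress file with the two fields that appear in the diff. The sketch below shows that round-trip under stated assumptions: the file location next to works/ is taken from the diff, but the example entry values (work IDs, batch indices) are made up for illustration.

    # Minimal sketch of the progress file the resume path reads.
    import json
    from pathlib import Path

    progress_file = Path("/tmp/markdown_cache/download_progress.json")  # hypothetical cache location
    progress_file.parent.mkdir(parents=True, exist_ok=True)

    # Sets are not JSON-serializable, so progress is persisted as lists...
    on_disk = {
        "markdown_completed": ["W2741809807", "W1986754631"],   # hypothetical work IDs
        "image_batches_completed": [0, 1, 2],                   # hypothetical batch indices
    }
    progress_file.write_text(json.dumps(on_disk, indent=2))

    # ...and loaded back into sets, mirroring what _download_images_phase_only does.
    saved = json.loads(progress_file.read_text())
    progress = {
        "markdown_completed": set(saved.get("markdown_completed", [])),
        "image_batches_completed": set(saved.get("image_batches_completed", [])),
    }
    print(progress["image_batches_completed"])  # {0, 1, 2}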