aristoteles78 committed
Commit b714bbd · Parent: 19b064e

webhook hardening, dedup, init-known-folders

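This commit adds a one-time /init-known-folders endpoint that backfills state so pre-existing datastore folders are never re-processed. A minimal sketch of calling it once after deploy (the Space URL is hypothetical; the endpoint path and X-Webhook-Secret header are the ones defined in app.py below):

import os
import requests

SPACE_URL = "https://<your-space>.hf.space"  # hypothetical; substitute the real Space URL

# One-time backfill: marks every current EEE folder as already known
resp = requests.post(
    f"{SPACE_URL}/init-known-folders",
    headers={"X-Webhook-Secret": os.environ["WEBHOOK_SECRET"]},
    timeout=60,
)
print(resp.json())  # {"action": "initialized", "known_folders_count": ...}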
__pycache__/app.cpython-311.pyc DELETED
Binary file (8.16 kB)
 
__pycache__/worker.cpython-311.pyc DELETED
Binary file (13.1 kB)
 
app.py CHANGED
@@ -2,6 +2,7 @@
 
 Listens for PR merge events on evaleval/EEE_datastore and triggers
 card generation for new benchmarks in a background thread.
+Queued folders are persisted so nothing is lost between webhook events.
 """
 
 import hmac
@@ -17,7 +18,8 @@ from worker import (
     detect_new_benchmarks,
     process_new_benchmarks,
     load_state,
-    save_state,
+    save_pending,
+    pop_pending,
     PERSISTENT_DIR,
 )
 
@@ -29,6 +31,9 @@ logger = logging.getLogger("webhook")
 
 app = FastAPI(title="BenchmarkCard Webhook")
 
+# Max time a generation job can run before we allow new jobs (1 hour)
+MAX_JOB_DURATION_SECONDS = 3600
+
 # Track active generation thread (max 1 concurrent)
 _active_job: dict = {"thread": None, "started_at": None, "folders": []}
 _job_lock = threading.Lock()
@@ -52,12 +57,35 @@ def _is_merged_pr(payload: dict) -> bool:
     )
 
 
+def _is_job_timed_out() -> bool:
+    """Check if the active job has exceeded the timeout."""
+    started = _active_job.get("started_at")
+    if not started:
+        return False
+    try:
+        started_dt = datetime.fromisoformat(started)
+        elapsed = (datetime.now(timezone.utc) - started_dt).total_seconds()
+        return elapsed > MAX_JOB_DURATION_SECONDS
+    except (ValueError, TypeError):
+        return False
+
+
 def _run_generation(new_folders: list[str]):
-    """Background worker: generate cards for new benchmark folders."""
+    """Background worker: generate cards, then drain pending queue."""
     try:
         logger.info("Background generation started for %d folders: %s", len(new_folders), new_folders)
         process_new_benchmarks(new_folders)
         logger.info("Background generation completed")
+
+        # Drain pending queue: process any folders that arrived while we were busy
+        while True:
+            pending = pop_pending()
+            if not pending:
+                break
+            logger.info("Draining pending queue: %d folders: %s", len(pending), pending)
+            # Re-detect to catch any new folders since the pending was saved
+            process_new_benchmarks(pending)
+
     except Exception:
         logger.exception("Background generation failed")
     finally:
@@ -70,14 +98,12 @@ def _run_generation(new_folders: list[str]):
 @app.post("/webhook")
 async def webhook(request: Request):
     """Receive HF webhook events and trigger card generation."""
-    # Verify secret
     secret = request.headers.get("X-Webhook-Secret", "")
     if not _verify_secret(secret):
         return JSONResponse(status_code=403, content={"error": "invalid secret"})
 
     payload = await request.json()
 
-    # Only act on merged PRs
     if not _is_merged_pr(payload):
         event_scope = payload.get("event", {}).get("scope", "unknown")
         discussion = payload.get("discussion", {})
@@ -92,7 +118,6 @@ async def webhook(request: Request):
     pr_num = discussion.get("num", "?")
     logger.info("Merged PR detected: #%s '%s'", pr_num, pr_title)
 
-    # Detect new benchmark folders
     new_folders = detect_new_benchmarks()
     if not new_folders:
         return JSONResponse(
@@ -100,16 +125,32 @@ async def webhook(request: Request):
             content={"action": "no_new_benchmarks", "pr": f"#{pr_num} {pr_title}"},
         )
 
-    # Spawn background thread if not already running
     with _job_lock:
-        if _active_job["thread"] is not None and _active_job["thread"].is_alive():
+        thread_alive = _active_job["thread"] is not None and _active_job["thread"].is_alive()
+        timed_out = thread_alive and _is_job_timed_out()
+
+        if timed_out:
+            logger.warning(
+                "Active job timed out (started %s), allowing new job",
+                _active_job["started_at"],
+            )
+            # Don't kill old thread (daemon, will die on exit), just reset tracking
+            _active_job["thread"] = None
+            _active_job["started_at"] = None
+            _active_job["folders"] = []
+            thread_alive = False
+
+        if thread_alive:
+            # Persist to pending queue so they get processed after current job
+            save_pending(new_folders)
+            logger.info("Job in progress, queued %d folders to pending", len(new_folders))
             return JSONResponse(
                 status_code=200,
                 content={
                     "action": "queued",
-                    "reason": "generation already in progress",
+                    "reason": "generation in progress, folders saved to pending queue",
                     "active_folders": _active_job["folders"],
-                    "new_folders": new_folders,
+                    "queued_folders": new_folders,
                 },
             )
 
@@ -145,14 +186,38 @@ async def status():
     return {
         "active_job": active,
         "known_folders": len(state.get("known_folders", [])),
+        "pending_folders": state.get("pending_folders", []),
        "jobs": state.get("jobs", [])[-20:],
    }
 
 
+@app.post("/init-known-folders")
+async def init_known_folders(request: Request):
+    """One-time: mark all current EEE folders as known."""
+    secret = request.headers.get("X-Webhook-Secret", "")
+    if not _verify_secret(secret):
+        return JSONResponse(status_code=403, content={"error": "invalid secret"})
+
+    from huggingface_hub import HfApi
+    from worker import save_state
+
+    api = HfApi()
+    all_files = api.list_repo_files("evaleval/EEE_datastore", repo_type="dataset")
+    current_folders = sorted({
+        p.split("/")[1] for p in all_files
+        if p.startswith("data/") and len(p.split("/")) >= 2
+    })
+
+    state = load_state()
+    state["known_folders"] = current_folders
+    save_state(state)
+    return {"action": "initialized", "known_folders_count": len(current_folders)}
+
+
 @app.get("/")
 async def root():
     """Root endpoint for HF Space UI."""
-    return {"service": "BenchmarkCard Webhook", "endpoints": ["/webhook", "/status", "/health"]}
+    return {"service": "BenchmarkCard Webhook", "endpoints": ["/webhook", "/status", "/health", "/init-known-folders"]}
 
 
 @app.get("/health")
worker.py CHANGED
@@ -1,18 +1,20 @@
 """Background worker for benchmark card generation.
 
-Detects new benchmarks in EEE_datastore by scanning folders for actual
-benchmark names, resolves them via the Entity Registry for canonical IDs,
-and skips benchmarks that already have cards in evaleval/auto-benchmarkcards.
+Detects new benchmark folders in EEE_datastore, generates cards via
+run_eee_pipeline(), and uploads them to evaleval/auto-benchmarkcards.
+
+Uses Jenny's Entity Registry for canonical ID resolution and dedup.
 """
 
 import json
 import logging
 import os
-import shutil
 import tempfile
+import time
 from datetime import datetime, timezone
+from functools import wraps
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
 import requests
 from huggingface_hub import HfApi, snapshot_download
@@ -21,8 +23,6 @@ logger = logging.getLogger("worker")
 
 EEE_REPO = "evaleval/EEE_datastore"
 CARDS_REPO = "evaleval/auto-benchmarkcards"
-MAX_BENCHMARKS_PER_JOB = 5
-
 ENTITY_REGISTRY_URL = "https://evaleval-entity-registry.hf.space/api/v1"
 
 # Persistent storage on HF Spaces (mounted volume).
@@ -30,176 +30,209 @@ ENTITY_REGISTRY_URL = "https://evaleval-entity-registry.hf.space/api/v1"
 PERSISTENT_DIR = Path(os.environ.get("PERSISTENT_DIR", "/data"))
 STATE_FILE = PERSISTENT_DIR / "state.json"
 
+FORCE_REGENERATE = os.environ.get("FORCE_REGENERATE", "").lower() in ("1", "true", "yes")
+
+
+# -- Retry decorator for transient failures --
+
+def retry(max_attempts=3, delay=5, backoff=2):
+    """Retry decorator with exponential backoff for transient failures."""
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            for attempt in range(max_attempts):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    if attempt == max_attempts - 1:
+                        raise
+                    wait = delay * (backoff ** attempt)
+                    logger.warning(
+                        "%s failed (attempt %d/%d), retrying in %ds: %s",
+                        func.__name__, attempt + 1, max_attempts, wait, e,
+                    )
+                    time.sleep(wait)
+        return wrapper
+    return decorator
+
+
+# -- State management (atomic writes) --
 
 def load_state() -> dict:
-    """Load persistent state (known folders, job history)."""
+    """Load persistent state (known folders, job history, pending queue)."""
     if STATE_FILE.exists():
         try:
             return json.loads(STATE_FILE.read_text())
         except Exception:
             logger.exception("Failed to read state file, starting fresh")
-    return {"known_folders": [], "jobs": []}
+    return {"known_folders": [], "jobs": [], "pending_folders": []}
 
 
 def save_state(state: dict) -> None:
-    """Save persistent state to disk (atomic write via temp + rename)."""
+    """Save persistent state atomically (write-then-rename)."""
     PERSISTENT_DIR.mkdir(parents=True, exist_ok=True)
     tmp = STATE_FILE.with_suffix(".tmp")
     tmp.write_text(json.dumps(state, indent=2))
     tmp.rename(STATE_FILE)
 
 
-def _normalize_name(name: str) -> str:
-    """Local fallback normalization (lowercase, collapse separators to underscore)."""
-    return name.lower().replace("-", "_").replace(" ", "_")
+def save_pending(folders: list[str]) -> None:
+    """Add folders to the pending queue."""
+    state = load_state()
+    pending = state.get("pending_folders", [])
+    for f in folders:
+        if f not in pending:
+            pending.append(f)
+    state["pending_folders"] = pending
+    save_state(state)
+
+
+def pop_pending() -> list[str]:
+    """Pop all pending folders from the queue."""
+    state = load_state()
+    pending = state.pop("pending_folders", [])
+    state["pending_folders"] = []
+    save_state(state)
+    return pending
 
 
-def _resolve_names(names: list[str]) -> dict[str, str]:
-    """Resolve benchmark names to canonical IDs via Entity Registry.
+# -- Entity Registry --
 
-    Returns a mapping {raw_name: canonical_id}. Names the registry doesn't
-    recognize get a locally-normalized fallback ID so dedup still works.
+_canonical_cache: dict[str, Optional[str]] = {}
+
+
+def resolve_canonical_id(benchmark_name: str) -> Optional[str]:
+    """Resolve benchmark name to canonical_id via Entity Registry.
+
+    Returns canonical_id string (e.g. "math") or None if not found.
+    Uses an in-memory cache to avoid repeated API calls within a job.
     """
-    resolved = {}
+    if benchmark_name in _canonical_cache:
+        return _canonical_cache[benchmark_name]
 
-    if not names:
-        return resolved
+    try:
+        resp = requests.post(
+            f"{ENTITY_REGISTRY_URL}/resolve",
+            json={"raw_value": benchmark_name, "entity_type": "benchmark"},
+            timeout=10,
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        canonical_id = data.get("canonical_id")
+        _canonical_cache[benchmark_name] = canonical_id
+        if canonical_id:
+            logger.info("Entity Registry: '%s' -> '%s'", benchmark_name, canonical_id)
+        return canonical_id
+    except Exception:
+        logger.debug("Entity Registry lookup failed for '%s'", benchmark_name)
+        _canonical_cache[benchmark_name] = None
+        return None
+
+
+def resolve_canonical_ids_batch(names: list[str]) -> dict[str, Optional[str]]:
+    """Batch-resolve benchmark names to canonical_ids."""
+    # Check cache first, only query uncached names
+    uncached = [n for n in names if n not in _canonical_cache]
+    if not uncached:
+        return {n: _canonical_cache[n] for n in names}
 
     try:
+        payload = [{"raw_value": n, "entity_type": "benchmark"} for n in uncached]
         resp = requests.post(
             f"{ENTITY_REGISTRY_URL}/resolve/batch",
-            json=[{"raw_value": n, "entity_type": "benchmark"} for n in names],
-            timeout=15,
+            json=payload,
+            timeout=30,
         )
         resp.raise_for_status()
         results = resp.json()
+        for name, result in zip(uncached, results):
+            canonical_id = result.get("canonical_id")
+            _canonical_cache[name] = canonical_id
+            if canonical_id:
+                logger.info("Entity Registry: '%s' -> '%s'", name, canonical_id)
+    except Exception:
+        logger.warning("Entity Registry batch resolve failed, using fallback")
+        for name in uncached:
+            _canonical_cache[name] = None
 
-        # Response is a list in the same order as the input
-        for name, item in zip(names, results):
-            canonical = item.get("canonical_id")
-            resolved[name] = canonical if canonical else _normalize_name(name)
-
-        registry_hits = sum(1 for item in results if item.get("canonical_id"))
-        logger.info(
-            "Entity Registry resolved %d/%d names",
-            registry_hits, len(names),
-        )
-    except Exception:
-        logger.warning("Entity Registry unreachable, falling back to local normalization")
-        for n in names:
-            resolved[n] = _normalize_name(n)
-
-    return resolved
+    return {n: _canonical_cache.get(n) for n in names}
+
+
+def _get_card_filename(benchmark_name: str) -> str:
+    """Get the canonical filename for a benchmark card.
+
+    Uses Entity Registry canonical_id when available, falls back to
+    sanitize_benchmark_name from the main package.
+    """
+    canonical = resolve_canonical_id(benchmark_name)
+    if canonical:
+        return canonical
+
+    from auto_benchmarkcard.output import sanitize_benchmark_name
+    return sanitize_benchmark_name(benchmark_name).lower()
+
+
+# -- EEE folder detection --
 
 def _extract_folders(file_list: list[str]) -> set[str]:
     """Extract unique top-level folder names under data/."""
     folders = set()
     for path in file_list:
         parts = path.split("/")
-        if len(parts) >= 3 and parts[0] == "data":
+        if len(parts) >= 2 and parts[0] == "data":
             folders.add(parts[1])
     return folders
 
 
-def _get_existing_cards() -> set[str]:
-    """List card names already in the target dataset (without path/extension)."""
+@retry(max_attempts=3, delay=5)
+def detect_new_benchmarks() -> list[str]:
+    """Compare current EEE_datastore file listing against known state."""
     api = HfApi()
-    try:
-        files = api.list_repo_files(CARDS_REPO, repo_type="dataset")
-    except Exception:
-        logger.exception("Failed to list existing cards")
-        return set()
-
-    names = set()
-    for f in files:
-        if f.startswith("cards/") and f.endswith(".json"):
-            names.add(f[len("cards/"):-len(".json")])
-    return names
+    all_files = api.list_repo_files(EEE_REPO, repo_type="dataset")
+
+    current_folders = _extract_folders(all_files)
+    state = load_state()
+    known = set(state.get("known_folders", []))
+
+    new_folders = sorted(current_folders - known)
+    if new_folders:
+        logger.info("Detected %d new folders: %s", len(new_folders), new_folders)
+    else:
+        logger.info("No new folders (known: %d, current: %d)", len(known), len(current_folders))
+
+    return new_folders
 
 
-def _download_folder(folder_name: str) -> Path:
-    """Download a single EEE folder to a temp directory."""
-    target_dir = tempfile.mkdtemp(prefix=f"eee_{folder_name}_")
-    logger.info("Downloading EEE folder '%s' to %s", folder_name, target_dir)
+# -- Download & upload --
+
+@retry(max_attempts=3, delay=10)
+def _download_folders(folder_names: list[str], target_dir: str) -> Path:
+    """Download EEE folders into a shared temp directory."""
+    patterns = [f"data/{f}/**/*.json" for f in folder_names]
+    logger.info("Downloading %d EEE folders to %s", len(folder_names), target_dir)
 
     snapshot_download(
         repo_id=EEE_REPO,
         repo_type="dataset",
         local_dir=target_dir,
-        allow_patterns=[f"data/{folder_name}/**/*.json"],
+        allow_patterns=patterns,
     )
 
     return Path(target_dir) / "data"
 
 
-def detect_new_benchmarks() -> list[str]:
-    """Find EEE folders that contain benchmarks without cards.
-
-    Returns folder names that have at least one benchmark not yet in the
-    cards dataset. The actual per-benchmark filtering happens during processing.
-    """
-    api = HfApi()
-    try:
-        all_files = api.list_repo_files(EEE_REPO, repo_type="dataset")
-    except Exception:
-        logger.exception("Failed to list EEE_datastore files")
-        return []
-
-    current_folders = _extract_folders(all_files)
-    existing_cards = _get_existing_cards()
-
-    # Normalize existing card names for comparison (match both hyphen and underscore)
-    normalized_cards = set()
-    for c in existing_cards:
-        normalized_cards.add(c)
-        normalized_cards.add(c.replace("-", "_"))
-        normalized_cards.add(c.replace("_", "-"))
-
-    # A folder is "new" if its normalized name doesn't match any card.
-    # This is a coarse filter — per-benchmark dedup happens in process_new_benchmarks.
-    new_folders = sorted(
-        f for f in current_folders
-        if _normalize_name(f) not in normalized_cards
-    )
-
-    if not new_folders:
-        logger.info("All %d folders already have cards", len(current_folders))
-        return []
-
-    if len(new_folders) > MAX_BENCHMARKS_PER_JOB:
-        logger.info(
-            "Found %d folders without cards, limiting to %d per job",
-            len(new_folders), MAX_BENCHMARKS_PER_JOB,
-        )
-        new_folders = new_folders[:MAX_BENCHMARKS_PER_JOB]
-
-    logger.info("Processing %d folders: %s", len(new_folders), new_folders)
-    return new_folders
-
-
-def _upload_card(card: dict, benchmark_name: str, canonical_id: str | None = None) -> bool:
-    """Upload a generated card to the auto-benchmarkcards dataset."""
+@retry(max_attempts=3, delay=5)
+def _upload_card(card: dict, benchmark_name: str, canonical_id: Optional[str] = None) -> bool:
+    """Upload a generated card to evaleval/auto-benchmarkcards."""
     api = HfApi()
+    filename = canonical_id or _get_card_filename(benchmark_name)
+    remote_path = f"cards/{filename}.json"
 
-    # Use canonical ID for filename when available, fall back to local normalization
-    if canonical_id:
-        safe_name = canonical_id
-    else:
-        safe_name = _normalize_name(benchmark_name).replace("/", "_")
-
-    remote_path = f"cards/{safe_name}.json"
-
-    # Embed canonical ID in the card so it's self-consistent with the filename
-    if canonical_id:
-        inner = card.get("benchmark_card", card)
-        inner.setdefault("card_info", {})["canonical_id"] = canonical_id
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+        json.dump(card, f, indent=2)
+        tmp_path = f.name
 
     try:
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-            json.dump(card, f, indent=2)
-            tmp_path = f.name
-
         api.upload_file(
             path_or_fileobj=tmp_path,
             path_in_repo=remote_path,
@@ -209,10 +242,6 @@ def _upload_card(card: dict, benchmark_name: str, canonical_id: str | None = Non
         )
         logger.info("Uploaded card to %s/%s", CARDS_REPO, remote_path)
         return True
-
-    except Exception:
-        logger.exception("Failed to upload card for %s", benchmark_name)
-        return False
     finally:
         try:
             os.unlink(tmp_path)
@@ -220,30 +249,82 @@ def _upload_card(card: dict, benchmark_name: str, canonical_id: str | None = Non
             pass
 
 
+@retry(max_attempts=2, delay=5)
+def _list_existing_cards() -> set[str]:
+    """List all card filenames (without extension) in the cards repo."""
+    api = HfApi()
+    all_files = api.list_repo_files(CARDS_REPO, repo_type="dataset")
+    cards = set()
+    for path in all_files:
+        if path.startswith("cards/") and path.endswith(".json"):
+            name = path[len("cards/"):-len(".json")]
+            cards.add(name)
+    return cards
+
+
+# -- Main processing --
+
+def _build_dedup_filter(
+    benchmark_names: list[str],
+    existing_cards: set[str],
+) -> list[str]:
+    """Return list of benchmark names that don't already have cards.
+
+    Checks in order: Entity Registry canonical_id, exact fallback name,
+    and parent prefix match (for 'Parent - Child' pattern).
+    """
+    if FORCE_REGENERATE:
+        logger.info("FORCE_REGENERATE=true, skipping dedup")
+        return benchmark_names
+
+    # Batch-resolve all names
+    canonical_map = resolve_canonical_ids_batch(benchmark_names)
+
+    from auto_benchmarkcard.output import sanitize_benchmark_name
+
+    new_benchmarks = []
+    for name in benchmark_names:
+        canonical = canonical_map.get(name)
+        fallback = sanitize_benchmark_name(name).lower()
+
+        # 1. Entity Registry canonical_id match
+        if canonical and canonical in existing_cards:
+            logger.info("Skipping '%s' (card exists as '%s')", name, canonical)
+            continue
+
+        # 2. Exact fallback name match
+        if fallback in existing_cards:
+            logger.info("Skipping '%s' (card exists as '%s')", name, fallback)
+            continue
+
+        # 3. Prefix match: 'MGSM - Bengali' -> check if 'mgsm' card exists
+        if " - " in name:
+            parent = name.split(" - ", 1)[0].strip()
+            parent_lower = sanitize_benchmark_name(parent).lower()
+            if parent_lower in existing_cards:
+                logger.info("Skipping '%s' (parent card exists as '%s')", name, parent_lower)
+                continue
+
+        new_benchmarks.append(name)
+
+    logger.info("Dedup: %d total, %d new, %d existing",
+                len(benchmark_names), len(new_benchmarks),
+                len(benchmark_names) - len(new_benchmarks))
+    return new_benchmarks
+
+
 def process_new_benchmarks(new_folders: list[str]) -> None:
-    """Generate and upload cards for benchmarks that don't have one yet.
+    """Generate and upload cards for benchmarks in new folders.
 
-    Downloads each folder, scans for benchmarks, resolves names via the
-    Entity Registry, and skips any benchmark that already has a card.
+    Delegates to run_eee_pipeline() for the actual generation, using a
+    callback to upload each card as it's generated.
     """
-    from auto_benchmarkcard.tools.eee.eee_tool import (
-        scan_eee_folder,
-        eee_to_pipeline_inputs,
-    )
-    from auto_benchmarkcard.eee_workflow import process_single_benchmark
-    from auto_benchmarkcard.logging_setup import setup_logging_suppression
+    from auto_benchmarkcard.eee_workflow import run_eee_pipeline
+    from auto_benchmarkcard.tools.eee.eee_tool import scan_eee_folder
+    from auto_benchmarkcard.workflow import setup_logging_suppression
 
     setup_logging_suppression(debug_mode=False)
 
-    # Fetch existing cards once for the whole job.
-    # Normalize to both hyphen and underscore forms so we match old and new naming.
-    existing_cards = _get_existing_cards()
-    existing_normalized = set()
-    for c in existing_cards:
-        existing_normalized.add(c)
-        existing_normalized.add(c.replace("-", "_"))
-        existing_normalized.add(c.replace("_", "-"))
-
     state = load_state()
     job_record: dict[str, Any] = {
         "started_at": datetime.now(timezone.utc).isoformat(),
@@ -251,81 +332,102 @@ def process_new_benchmarks(new_folders: list[str]) -> None:
         "results": [],
     }
 
-    for folder_name in new_folders:
-        logger.info("Processing folder: %s", folder_name)
+    # Pre-fetch existing cards for dedup
+    try:
+        existing_cards = _list_existing_cards()
+        logger.info("Found %d existing cards in %s", len(existing_cards), CARDS_REPO)
+    except Exception:
+        logger.warning("Failed to list existing cards, dedup disabled for this job")
+        existing_cards = set()
 
-        tmp_root = None
+    # Download all folders into one shared temp dir
+    with tempfile.TemporaryDirectory(prefix="eee_batch_") as tmpdir:
         try:
-            data_path = _download_folder(folder_name)
-            tmp_root = data_path.parent
+            data_path = _download_folders(new_folders, tmpdir)
         except Exception:
-            logger.exception("Failed to download folder %s", folder_name)
+            logger.exception("Failed to download EEE folders")
             job_record["results"].append({
-                "folder": folder_name, "status": "download_failed",
+                "folders": new_folders, "status": "download_failed",
             })
-            continue
+            job_record["completed_at"] = datetime.now(timezone.utc).isoformat()
+            state["jobs"].append(job_record)
+            state["jobs"] = state["jobs"][-50:]
+            save_state(state)
+            return
 
+        # Scan to discover benchmark names for dedup
         try:
             scan_result = scan_eee_folder(str(data_path))
-
-            # Resolve all benchmark names in this folder at once
-            bench_names = list(scan_result.benchmarks.keys())
-            resolved = _resolve_names(bench_names)
-
-            for name, bench in sorted(scan_result.benchmarks.items()):
-                canonical = resolved.get(name, _normalize_name(name))
-
-                # Per-benchmark dedup: skip if a card already exists
-                if canonical in existing_normalized:
-                    logger.info("Skipping %s: card already exists (canonical: %s)", name, canonical)
-                    job_record["results"].append({
-                        "folder": folder_name, "benchmark": name,
-                        "canonical_id": canonical, "status": "already_exists",
-                    })
-                    continue
-
-                inputs = eee_to_pipeline_inputs(bench)
-
-                if not inputs.get("hf_repo"):
-                    logger.warning("Skipping %s: no HF repo", name)
-                    job_record["results"].append({
-                        "folder": folder_name, "benchmark": name, "status": "no_hf_repo",
-                    })
-                    continue
-
-                card = process_single_benchmark(
-                    benchmark_name=name,
-                    pipeline_inputs=inputs,
-                    base_output_path=str(PERSISTENT_DIR / "output"),
-                    debug=False,
-                )
-
-                if card:
-                    uploaded = _upload_card(card, name, canonical_id=canonical)
-                    job_record["results"].append({
-                        "folder": folder_name, "benchmark": name,
-                        "canonical_id": canonical,
-                        "status": "uploaded" if uploaded else "upload_failed",
-                    })
-                    # Add to existing set so later benchmarks in same job are deduped
-                    if uploaded:
-                        existing_normalized.add(canonical)
-                        existing_normalized.add(canonical.replace("-", "_"))
-                else:
-                    job_record["results"].append({
-                        "folder": folder_name, "benchmark": name, "status": "generation_failed",
-                    })
-
         except Exception:
-            logger.exception("Failed to process folder %s", folder_name)
+            logger.exception("Failed to scan EEE data")
             job_record["results"].append({
-                "folder": folder_name, "status": "scan_failed",
+                "folders": new_folders, "status": "scan_failed",
             })
-
-        finally:
-            if tmp_root and tmp_root.exists():
-                shutil.rmtree(tmp_root, ignore_errors=True)
-
+            job_record["completed_at"] = datetime.now(timezone.utc).isoformat()
+            state["jobs"].append(job_record)
+            state["jobs"] = state["jobs"][-50:]
+            save_state(state)
+            return
+
+        all_names = (
+            list(scan_result.benchmarks.keys())
+            + list(scan_result.composites.keys())
+        )
+        benchmarks_to_generate = _build_dedup_filter(all_names, existing_cards)
+
+        if not benchmarks_to_generate:
+            logger.info("All benchmarks already have cards, nothing to generate")
+            job_record["results"].append({"status": "all_existing"})
+        else:
+            # Upload callback: called by run_eee_pipeline for each generated card
+            def _on_card_generated(name: str, card: dict) -> None:
+                canonical = resolve_canonical_id(name)
+
+                # Enrich card metadata
+                inner = card.get("benchmark_card", card)
+                info = inner.get("card_info", {})
+                info["source"] = "webhook"
+                if canonical:
+                    info["canonical_id"] = canonical
+                inner["card_info"] = info
+
+                try:
+                    _upload_card(card, name, canonical_id=canonical)
+                    job_record["results"].append({
+                        "benchmark": name,
+                        "canonical_id": canonical,
+                        "status": "uploaded",
+                    })
+                except Exception:
+                    logger.exception("Failed to upload card for %s", name)
+                    job_record["results"].append({
+                        "benchmark": name, "status": "upload_failed",
+                    })
+
+            # Run the unified pipeline
+            summary = run_eee_pipeline(
+                eee_path=str(data_path),
+                output_path=str(PERSISTENT_DIR / "output"),
+                benchmarks_filter=benchmarks_to_generate,
+                on_card_generated=_on_card_generated,
+            )
+
+            # Record skipped/failed from pipeline summary
+            for item in summary.get("skipped", []):
+                job_record["results"].append({
+                    "benchmark": item.get("benchmark", "unknown"),
+                    "status": f"skipped:{item.get('reason', 'unknown')}",
+                })
+            for name in summary.get("failed", []):
+                # Only add if not already recorded by callback
+                existing = {r.get("benchmark") for r in job_record["results"]}
+                if name not in existing:
+                    job_record["results"].append({
+                        "benchmark": name, "status": "generation_failed",
+                    })
+
+    # Mark folders as known
+    for folder_name in new_folders:
         if folder_name not in state["known_folders"]:
             state["known_folders"].append(folder_name)
 
@@ -333,8 +435,8 @@ def process_new_benchmarks(new_folders: list[str]) -> None:
 
     results = job_record["results"]
     uploaded = sum(1 for r in results if r.get("status") == "uploaded")
-    failed = sum(1 for r in results if r.get("status") not in ("uploaded", "no_hf_repo", "already_exists"))
-    skipped = sum(1 for r in results if r.get("status") in ("no_hf_repo", "already_exists"))
+    failed = sum(1 for r in results if "failed" in r.get("status", ""))
+    skipped = sum(1 for r in results if r.get("status", "").startswith("skipped"))
     logger.info("Job complete: %d uploaded, %d failed, %d skipped", uploaded, failed, skipped)
 
     state["jobs"].append(job_record)