Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| import importlib.util | |
| import json | |
| import os | |
| import shutil | |
| import subprocess | |
| import sys | |
| from datetime import datetime, timezone | |
| from typing import Any, Dict, List, Tuple | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Task IDs exercised by the baseline inference step — one per difficulty tier
# (easy / medium / hard), matching the "difficulties" list in the report.
CORE_TASKS = [
    "bug_detection_easy_1",
    "memory_leak_medium_1",
    "security_hard_1",
]
| def _run_cmd(command: List[str]) -> Tuple[int, str, str]: | |
| """Run a command and return (returncode, stdout, stderr).""" | |
| result = subprocess.run( | |
| command, | |
| capture_output=True, | |
| text=True, | |
| encoding="utf-8", | |
| errors="replace", | |
| ) | |
| return result.returncode, result.stdout, result.stderr | |
| def _require_command(binary: str) -> bool: | |
| """Return True if a command exists on PATH.""" | |
| return shutil.which(binary) is not None | |
def run_validation() -> Tuple[bool, Dict[str, Any]]:
    """Run ``openenv validate`` and summarize the outcome.

    Returns ``(passed, detail)``; *detail* always carries ``ok`` plus the
    captured ``stdout``/``stderr``, and a ``reason`` when the CLI is absent.
    """
    print("Running OpenEnv validation...")
    if not _require_command("openenv"):
        detail = {
            "ok": False,
            "reason": "openenv command not found on PATH",
            "stdout": "",
            "stderr": "",
        }
        return False, detail
    code, out, err = _run_cmd(["openenv", "validate"])
    passed = code == 0
    print("Validation passed" if passed else "Validation failed")
    return passed, {"ok": passed, "stdout": out, "stderr": err}
def run_tests(with_coverage: bool) -> Tuple[bool, Dict[str, Any]]:
    """Run the pytest suite, with coverage when pytest-cov is available.

    Coverage flags are only added when *with_coverage* is requested AND the
    ``pytest_cov`` plugin is importable; otherwise the reason is recorded in
    the returned detail dict instead of failing the run.
    """
    print("Running unit tests...")
    cmd = ["pytest", "tests/", "-v"]
    coverage_enabled = False
    coverage_reason = ""
    if with_coverage:
        if importlib.util.find_spec("pytest_cov") is not None:
            cmd += ["--cov=environment", "--cov-report=html"]
            coverage_enabled = True
        else:
            coverage_reason = "pytest-cov not installed; ran tests without coverage"
    code, out, err = _run_cmd(cmd)
    passed = code == 0
    print("Tests passed" if passed else "Tests failed")
    return passed, {
        "ok": passed,
        "stdout": out,
        "stderr": err,
        "coverage_enabled": coverage_enabled,
        "coverage_reason": coverage_reason,
    }
def check_docker(image_name: str) -> Tuple[bool, Dict[str, Any]]:
    """Verify docker exists, its daemon is reachable, and the image builds.

    Runs ``docker info`` as a daemon liveness probe before attempting the
    (much slower) ``docker build -t <image_name> .``.
    """
    print("Checking Docker build...")
    if not _require_command("docker"):
        return False, {
            "ok": False,
            "reason": "docker command not found on PATH",
            "stdout": "",
            "stderr": "",
        }
    info_code, info_out, info_err = _run_cmd(["docker", "info"])
    if info_code != 0:
        return False, {
            "ok": False,
            "reason": "docker daemon not reachable. Start Docker Desktop and retry.",
            "stdout": info_out,
            "stderr": info_err,
        }
    build_code, build_out, build_err = _run_cmd(["docker", "build", "-t", image_name, "."])
    built = build_code == 0
    print("Docker build successful" if built else "Docker build failed")
    return built, {"ok": built, "stdout": build_out, "stderr": build_err}
| def _inference_env_ready() -> Tuple[bool, str]: | |
| if not (os.getenv("API_BASE_URL") or "").strip(): | |
| return False, "API_BASE_URL is not set" | |
| if not (os.getenv("MODEL_NAME") or "").strip(): | |
| return False, "MODEL_NAME is not set" | |
| token = (os.getenv("API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or "").strip() | |
| if not token: | |
| return False, "API_KEY is not set (supported aliases: HF_TOKEN, OPENAI_API_KEY)" | |
| return True, "" | |
def run_baseline(tasks: List[str], max_steps: int) -> Tuple[bool, Dict[str, Any], Dict[str, float]]:
    """Run inference for each task and collect task_score from result JSONs.

    Args:
        tasks: Task identifiers to run, one ``inference.py`` process per task.
        max_steps: Step budget forwarded as ``--max-steps``.

    Returns:
        ``(all_ok, details, baseline_scores)`` — *details* holds one entry per
        run (stdout/stderr, output path, score or failure reason) and
        *baseline_scores* maps task id -> parsed ``task_score``.
    """
    print("Running baseline inference for core tasks...")
    ready, reason = _inference_env_ready()
    if not ready:
        print(f"Skipping baseline inference: {reason}")
        return False, {"ok": False, "reason": reason}, {}
    baseline_scores: Dict[str, float] = {}
    details: Dict[str, Any] = {"ok": True, "runs": []}
    for task_id in tasks:
        output_file = f"baseline_{task_id}.json"
        cmd = [
            sys.executable,
            "inference.py",
            "--task-id",
            task_id,
            "--max-steps",
            str(max_steps),
            "--output",
            output_file,
        ]
        code, out, err = _run_cmd(cmd)
        run_info: Dict[str, Any] = {
            "task_id": task_id,
            "ok": code == 0,
            "stdout": out,
            "stderr": err,
            "output_file": output_file,
        }
        if code != 0:
            details["ok"] = False
            details["runs"].append(run_info)
            continue
        # Inference currently catches model-call exceptions and can still exit 0
        # after using fallback actions. Treat this as a baseline failure signal.
        combined_logs = f"{out}\n{err}".lower()
        if "error getting action from llm" in combined_logs or "insufficient balance" in combined_logs:
            details["ok"] = False
            run_info["ok"] = False
            run_info["reason"] = "Model API call failed; fallback action used"
        try:
            with open(output_file, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
            # TypeError is included because float(None) — e.g. a JSON
            # '"task_score": null' — raises TypeError, not ValueError, and
            # previously crashed the whole checklist instead of being
            # recorded as a per-task parse failure.
            score = float(payload.get("task_score", 0.0))
            baseline_scores[task_id] = score
            run_info["task_score"] = score
        except (OSError, json.JSONDecodeError, TypeError, ValueError) as exc:
            details["ok"] = False
            run_info["ok"] = False
            run_info["parse_error"] = str(exc)
        details["runs"].append(run_info)
    if details["ok"]:
        print("Baseline inference passed for all selected tasks")
    else:
        print("Baseline inference had failures")
    return bool(details["ok"]), details, baseline_scores
def generate_report(
    checks: Dict[str, Dict[str, Any]],
    baseline_scores: Dict[str, float],
    report_path: str,
) -> None:
    """Write the submission summary JSON to *report_path*.

    Top-level boolean flags are lifted out of the per-check detail dicts so
    the report is skimmable; the full *checks* payload is embedded as well.
    """
    report: Dict[str, Any] = {
        "project": "code-review-agent-env",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(),
        "tasks": CORE_TASKS,
        "difficulties": ["easy", "medium", "hard"],
        "openenv_compliant": checks["validation"]["ok"],
        "docker_supported": checks["docker"]["ok"],
        "tests_passed": checks["tests"]["ok"],
        "baseline_passed": checks["baseline"]["ok"],
        "baseline_scores": baseline_scores,
        "checks": checks,
    }
    with open(report_path, "w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=2)
    print(f"Submission report generated: {report_path}")
def main() -> int:
    """Run every pre-submission check and write the JSON report.

    Returns:
        0 when validation, tests, and the Docker build all pass (the baseline
        run is informational and does not gate submission), 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Pre-submission checklist for OpenEnv hackathon")
    parser.add_argument(
        "--skip-baseline",
        action="store_true",
        help="Skip inference baseline runs",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=50,
        help="Max steps for each baseline inference run",
    )
    parser.add_argument(
        "--no-coverage",
        action="store_true",
        help="Run tests without coverage output",
    )
    parser.add_argument(
        "--image-name",
        default="code-review-env",
        help="Docker image name for validation build",
    )
    parser.add_argument(
        "--report-path",
        default="submission_report.json",
        help="Where to write the JSON report",
    )
    args = parser.parse_args()

    print("=" * 50)
    print("Pre-submission Checklist")
    print("=" * 50)

    checks: Dict[str, Dict[str, Any]] = {}
    # Each runner returns (ok, detail); the boolean duplicates detail["ok"],
    # which is what gets consulted below, so it is deliberately discarded
    # (previously it was bound to an `ok` local that was never read).
    _, checks["validation"] = run_validation()
    _, checks["tests"] = run_tests(with_coverage=not args.no_coverage)
    _, checks["docker"] = check_docker(args.image_name)

    baseline_scores: Dict[str, float] = {}
    if args.skip_baseline:
        checks["baseline"] = {"ok": False, "skipped": True, "reason": "Skipped by --skip-baseline"}
    else:
        _, checks["baseline"], baseline_scores = run_baseline(CORE_TASKS, max_steps=args.max_steps)

    generate_report(checks, baseline_scores, args.report_path)

    required_checks_ok = (
        checks["validation"]["ok"]
        and checks["tests"]["ok"]
        and checks["docker"]["ok"]
    )
    if required_checks_ok:
        print("\nRequired checks passed. Ready for submission.")
        return 0
    print("\nSome required checks failed. Please fix before submitting.")
    return 1
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())