#!/usr/bin/env python3
"""Competition baseline inference for FraudShield.

This module provides the main entry point for evaluation:

1. Initialize environment with frozen data snapshot
2. Load agent (heuristic or LLM-powered)
3. Run all 3 task difficulties
4. Grade predictions against ground truth
5. Save results to fraudshield_baseline_results.json

Execution Modes:
    - Heuristic (offline): No external API, deterministic fraud rules
        Command: python inference.py
        Result: Baseline score (easy=1.0, medium=0.877, hard=0.721, final=0.866)
    - LLM (online): Calls an OpenAI-compatible API with a reasoning prompt
        Command: API_BASE_URL=... MODEL_NAME=... python inference.py
        Result: LLM reasoning + baseline grading

Output:
    - fraudshield_baseline_results.json: Complete grading report with:
        - Per-task scores (easy, medium, hard)
        - Final weighted score
        - Metadata (agent, model, seed, data snapshot)
        - Prediction traces (for replay/audit)

Logging:
    - INFO: Task progress, scores, file paths
    - ERROR: Data load failures, agent exceptions
    - EXCEPTION (via logger.exception): Full traceback if inference fails

Usage Examples:
    # Heuristic baseline (no API needed)
    python inference.py

    # With LLM (requires API credentials)
    export API_BASE_URL=https://router.huggingface.co/v1
    export MODEL_NAME=meta-llama/Llama-2-7b-chat-hf
    python inference.py

    # In Docker (PATH already set)
    docker run -e API_BASE_URL=... -e MODEL_NAME=... fraudshield:v0.2.0
"""
from __future__ import annotations

import json
import logging
import os
import sys
from typing import Dict, List, Tuple

from fraudshield_env import FraudShieldEnvironment
from graders import FraudShieldGrader
from llm_agent import build_default_agent

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

RESULTS_FILE = "fraudshield_baseline_results.json"

def get_env(*names: str, default: str = "") -> str:
    """Return the first non-empty environment variable from a list of aliases.

    Tries multiple variable names in order (useful for supporting different
    naming conventions).

    Args:
        *names: Environment variable names to check (in order of preference).
        default: Fallback value if none of the names are set.

    Returns:
        The first non-empty value found, or default if none matched.

    Example:
        api_url = get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1")
        model = get_env("MODEL_NAME", "MODELNAME", default="meta-llama/Llama-2-7b")
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return default
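
# Alias resolution in practice (hypothetical values; only the second alias is
# set, so it wins over the default, while a fully-unset lookup falls back):
#
#     >>> os.environ["APIBASEURL"] = "https://example.test/v1"
#     >>> get_env("API_BASE_URL", "APIBASEURL", default="https://fallback.test/v1")
#     'https://example.test/v1'
#     >>> get_env("UNSET_A", "UNSET_B", default="heuristic")
#     'heuristic'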

def run_task(env: FraudShieldEnvironment, agent: object, task_name: str) -> Tuple[List[str], List[str], List[float]]:
    """Run one task episode and capture the full prediction trace.

    This function executes a complete episode for a single task difficulty,
    collecting all predictions, confidences, and ground truth labels.
    (A minimal sketch of a compatible agent follows this function.)

    Args:
        env: FraudShieldEnvironment instance (with data already loaded).
        agent: Agent object with a decide(observation) method.
        task_name: Task difficulty ("easy", "medium", or "hard").

    Returns:
        Tuple of 3 lists:
        - predictions: List[str] of decisions ("fraud" or "legitimate")
        - ground_truth: List[str] of true labels
        - confidences: List[float] of confidence values in [0.0, 1.0]

    Workflow:
        1. Call env.reset(task_name) to initialize the episode
        2. Loop: agent.decide(obs) → env.step(action) → next obs
        3. Log progress each step
        4. Collect all decisions and ground truth
        5. Return predictions for grading

    Logging:
        - Task header with agent name
        - Progress every 10 steps (and at the first/last step)
        - Final accuracy and cumulative reward

    Example:
        preds, labels, confs = run_task(env, agent, "easy")
        print(f"Accuracy: {sum(p == l for p, l in zip(preds, labels)) / len(preds)}")
    """
    logger.info("%s", "=" * 72)
    logger.info("Running %s task with %s", task_name.upper(), getattr(agent, "name", agent.__class__.__name__))
    logger.info("%s", "=" * 72)

    reset_result = env.reset(task_name)
    logger.info("Episode %s contains %s transactions", env.episode_id, reset_result.info["num_transactions"])
    observation = reset_result.observation

    predictions: List[str] = []
    confidences: List[float] = []
    while not env.is_done:
        action = agent.decide(observation)
        predictions.append(action.decision.value)
        confidences.append(action.confidence)
        step_result = env.step(action)
        if env.step_count in {1, len(env.current_cases)} or env.step_count % 10 == 0:
            logger.info(
                "Step %02d | decision=%s | confidence=%.2f | reward=%+.2f",
                env.step_count,
                action.decision.value,
                action.confidence,
                step_result.reward.value,
            )
        observation = step_result.observation

    logger.info(
        "Finished %s: accuracy_so_far=%.3f cumulative_reward=%.3f",
        task_name.upper(),
        env.correct_predictions / max(1, env.step_count),
        env.cumulative_reward,
    )
    return predictions, list(env.ground_truth_labels), confidences
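
# A minimal agent sketch compatible with the loop above. The Action/Decision
# shapes below are assumptions inferred from how run_task() uses them
# (action.decision.value and action.confidence); the real types live in the
# environment package, and real agents come from llm_agent.build_default_agent().
#
#     from dataclasses import dataclass
#     from enum import Enum
#
#     class Decision(Enum):
#         FRAUD = "fraud"
#         LEGITIMATE = "legitimate"
#
#     @dataclass
#     class Action:
#         decision: Decision
#         confidence: float  # expected in [0.0, 1.0]
#
#     class FlagNothingAgent:
#         """Smoke-test agent: always predicts 'legitimate'."""
#         name = "flag-nothing"
#
#         def decide(self, observation) -> Action:
#             return Action(Decision.LEGITIMATE, 0.5)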

def main() -> Dict[str, object]:
    """Run the baseline across all tasks and persist the report.

    This is the main entry point. It orchestrates the complete evaluation:

    1. Create environment and load frozen data snapshot
    2. Build agent (heuristic or LLM-powered)
    3. Run easy/medium/hard tasks sequentially
    4. Grade all predictions
    5. Save results to fraudshield_baseline_results.json

    Returns:
        Grading report dict with keys:
        - easy: {score, predictions, ground_truth, confidences}
        - medium: {...}
        - hard: {...}
        - final_score: Weighted average across all tasks
        - metadata: {agent_name, model_name, seed, data_snapshot, tasks}

    Error Handling:
        - Exits with code 1 if data fails to load
        - Exits with code 1 if inference crashes
        - Logs full exception traceback

    Side Effects:
        - Writes fraudshield_baseline_results.json to cwd
        - Logs task progress and scores

    Environment Variables:
        - API_BASE_URL: OpenAI-compatible API endpoint (for LLM mode)
        - MODEL_NAME: Model to use (for LLM mode)
        - (Both optional; heuristic mode runs offline if not set)

    Example:
        result = main()
        print(f"Final score: {result['final_score']:.4f}")
        print(f"Easy: {result['easy']['score']:.4f}")
    """
    logger.info("%s", "=" * 72)
    logger.info("FraudShield baseline inference")
    logger.info("%s", "=" * 72)

    env = FraudShieldEnvironment(data_path="data", seed=42)
    if not env.load_data():
        logger.error("FraudShield data could not be loaded from ./data")
        sys.exit(1)

    agent = build_default_agent()
    logger.info(
        "Agent mode: %s | API_BASE_URL=%s | MODEL_NAME=%s",
        getattr(agent, "name", agent.__class__.__name__),
        get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        get_env("MODEL_NAME", "MODELNAME", default="<offline-heuristic>"),
    )

    easy_predictions, easy_ground_truth, easy_confidences = run_task(env, agent, "easy")
    medium_predictions, medium_ground_truth, medium_confidences = run_task(env, agent, "medium")
    hard_predictions, hard_ground_truth, hard_confidences = run_task(env, agent, "hard")

    grading_result = FraudShieldGrader.grade_all_tasks(
        easy_predictions,
        easy_ground_truth,
        easy_confidences,
        medium_predictions,
        medium_ground_truth,
        medium_confidences,
        hard_predictions,
        hard_ground_truth,
        hard_confidences,
    )
    grading_result["metadata"] = {
        "agent_name": getattr(agent, "name", agent.__class__.__name__),
        "api_base_url": get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        "model_name": get_env("MODEL_NAME", "MODELNAME"),
        "seed": 42,
        "data_snapshot": env.data_loader.get_bundle_summary(),
        "tasks": {
            "easy": len(easy_ground_truth),
            "medium": len(medium_ground_truth),
            "hard": len(hard_ground_truth),
        },
    }

    logger.info("Easy score: %.4f", grading_result["easy"]["score"])
    logger.info("Medium score: %.4f", grading_result["medium"]["score"])
    logger.info("Hard score: %.4f", grading_result["hard"]["score"])
    logger.info("Final score: %.4f", grading_result["final_score"])

    with open(RESULTS_FILE, "w", encoding="utf-8") as handle:
        json.dump(grading_result, handle, indent=2)
    logger.info("Saved baseline report to %s", RESULTS_FILE)
    return grading_result
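
# Consuming the saved report afterwards (a sketch; key access follows the
# structure documented in main()'s docstring above):
#
#     import json
#     with open(RESULTS_FILE, encoding="utf-8") as fh:
#         report = json.load(fh)
#     print(f"final={report['final_score']:.4f} agent={report['metadata']['agent_name']}")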

if __name__ == "__main__":  # pragma: no cover
    try:
        main()
    except Exception as exc:
        logger.exception("Baseline inference failed: %s", exc)
        sys.exit(1)