Add benchmark harness: __init__.py
benchmark/__init__.py  (ADDED, +28 -0)
@@ -0,0 +1,28 @@
+"""
+Cortex Benchmark Harness
+========================
+
+A self-contained evaluation suite for comparing base LLMs against
+Cortex-enhanced versions across standard NLP benchmarks and
+Cortex-specific capability tests (memory, hallucination detection, etc.).
+
+Benchmarks:
+    Standard: HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
+    Memory: Passkey Retrieval, Multi-Hop Memory
+    Hallucination: HaluEval-QA
+"""
+
+from benchmark.scoring import log_likelihood_score, generate_and_check
+from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
+from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
+from benchmark.runner import BenchmarkRunner
+
+__all__ = [
+    "log_likelihood_score",
+    "generate_and_check",
+    "TASK_REGISTRY",
+    "BenchmarkTask",
+    "PasskeyRetrieval",
+    "MultiHopMemory",
+    "BenchmarkRunner",
+]
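
For context, a minimal usage sketch of the names this package exports. The BenchmarkRunner constructor arguments and run() call below are assumptions for illustration only and are not defined in this commit; the real API lives in benchmark/runner.py and benchmark/tasks.py.

# Hypothetical usage sketch: argument and method names are assumptions,
# not confirmed by this commit; consult benchmark/runner.py for the real API.
from benchmark import BenchmarkRunner, TASK_REGISTRY

# TASK_REGISTRY is assumed here to map task names to BenchmarkTask definitions.
print(sorted(TASK_REGISTRY))

# Run a subset of the standard tasks against a base model (assumed signature).
runner = BenchmarkRunner(model_name="base-llm")         # assumed argument name
results = runner.run(tasks=["hellaswag", "arc_easy"])   # assumed method name
print(results)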