theapemachine committed on
Commit
7dd817a
·
verified ·
1 Parent(s): 4c1ba64

Add benchmark harness: __init__.py

Browse files
Files changed (1) hide show
  1. benchmark/__init__.py +28 -0
benchmark/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cortex Benchmark Harness
3
+ ========================
4
+
5
+ A self-contained evaluation suite for comparing base LLMs against
6
+ Cortex-enhanced versions across standard NLP benchmarks and
7
+ Cortex-specific capability tests (memory, hallucination detection, etc.).
8
+
9
+ Benchmarks:
10
+ Standard: HellaSwag, ARC-Easy, ARC-Challenge, PIQA, WinoGrande, MMLU
11
+ Memory: Passkey Retrieval, Multi-Hop Memory
12
+ Hallucination: HaluEval-QA
13
+ """
14
+
15
+ from benchmark.scoring import log_likelihood_score, generate_and_check
16
+ from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
17
+ from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
18
+ from benchmark.runner import BenchmarkRunner
19
+
20
+ __all__ = [
21
+ "log_likelihood_score",
22
+ "generate_and_check",
23
+ "TASK_REGISTRY",
24
+ "BenchmarkTask",
25
+ "PasskeyRetrieval",
26
+ "MultiHopMemory",
27
+ "BenchmarkRunner",
28
+ ]