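Refactor device handling in the benchmark and cortex modules to use the new `resolve_torch_device` function for improved device selection (auto: cuda > mps > cpu). Update the README with aligned benchmark table formatting and an additional benchmark example command, and document the additional device options in the `--device` help text.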

Browse files

Files changed (8) hide show

README.md +21 -19
benchmark/memory_tasks.py +7 -2
benchmark/run_benchmark.py +1 -1
benchmark/runner.py +2 -1
benchmark/scoring.py +10 -4
cortex/steering_vector.py +5 -2
cortex/torch_device.py +20 -0
test_cortex.py +2 -1

README.md CHANGED Viewed

@@ -132,22 +132,22 @@ Cortex includes a comprehensive benchmark harness for comparing base LLMs agains
 ### Standard Benchmarks
-| Task | Type | Choices | Dataset | Few-Shot |
-|------|------|---------|---------|----------|
-| **HellaSwag** | Commonsense NLI | 4 | `Rowan/hellaswag` | 5-shot |
-| **ARC-Easy** | Science QA | 3-5 | `allenai/ai2_arc` | 5-shot |
-| **ARC-Challenge** | Science QA (hard) | 3-5 | `allenai/ai2_arc` | 5-shot |
-| **PIQA** | Physical intuition | 2 | `gimmaru/piqa` | 0-shot |
-| **WinoGrande** | Coreference | 2 | `allenai/winogrande` | 5-shot |
-| **MMLU** | Multi-domain knowledge | 4 | `cais/mmlu` | 5-shot |
-| **HaluEval** | Hallucination detection | 2 | `pminervini/HaluEval` | 0-shot |
 ### Cortex-Specific Benchmarks
-| Task | Tests | Method |
-|------|-------|--------|
 | **Passkey Retrieval** | Long-context memory, attention to details | Generation + substring match at 128/256/512/1024 token contexts |
-| **Multi-Hop Memory** | Compositional reasoning, fact chaining | Generation + answer extraction from 3-hop fact chains |
 ### Running Benchmarks
@@ -172,6 +172,8 @@ python -m benchmark.run_benchmark --n 50 --no-memory
 # Custom passkey test
 python -m benchmark.run_benchmark --n 20 --passkey-lengths 128 256 512 1024 --n-passkey 10
 ```
 ### Scoring Method
@@ -242,13 +244,13 @@ All modules are independent and composable. Use any combination:
 ## Injection Points
-| Point | Location | Best For |
-|-------|----------|----------|
-| `PRE_ATTENTION` | Before self-attention | Input preprocessing, prefix injection |
-| `POST_ATTENTION` | After attention, before FFN | Memory augmentation (reads enhance attention output) |
-| `PRE_FFN` | Before FFN | Gate what the FFN processes |
-| `POST_FFN` | After full block | Gating, confidence estimation |
-| `RESIDUAL_STREAM` | Wraps entire block | Steering vectors, thinking tokens, backtracking |
 ## Layer Targeting

 ### Standard Benchmarks
+| Task              | Type                    | Choices | Dataset               | Few-Shot |
+|-------------------|-------------------------|---------|-----------------------|----------|
+| **HellaSwag**     | Commonsense NLI         | 4       | `Rowan/hellaswag`     | 5-shot   |
+| **ARC-Easy**      | Science QA              | 3-5     | `allenai/ai2_arc`     | 5-shot   |
+| **ARC-Challenge** | Science QA (hard)       | 3-5     | `allenai/ai2_arc`     | 5-shot   |
+| **PIQA**          | Physical intuition      | 2       | `gimmaru/piqa`        | 0-shot   |
+| **WinoGrande**    | Coreference             | 2       | `allenai/winogrande`  | 5-shot   |
+| **MMLU**          | Multi-domain knowledge  | 4       | `cais/mmlu`           | 5-shot   |
+| **HaluEval**      | Hallucination detection | 2       | `pminervini/HaluEval` | 0-shot   |
 ### Cortex-Specific Benchmarks
+| Task                  | Tests                                     | Method                                                          |
+|-----------------------|-------------------------------------------|-----------------------------------------------------------------|
 | **Passkey Retrieval** | Long-context memory, attention to details | Generation + substring match at 128/256/512/1024 token contexts |
+| **Multi-Hop Memory**  | Compositional reasoning, fact chaining    | Generation + answer extraction from 3-hop fact chains           |
 ### Running Benchmarks
 # Custom passkey test
 python -m benchmark.run_benchmark --n 20 --passkey-lengths 128 256 512 1024 --n-passkey 10
+python -m benchmark.run_benchmark --n 10 --model meta-llama/Llama-3.2-1B --tasks hellaswag piqa arc-easy arc-challenge winogrande mmlu
 ```
 ### Scoring Method
 ## Injection Points
+| Point             | Location                    | Best For                                             |
+|-------------------|-----------------------------|------------------------------------------------------|
+| `PRE_ATTENTION`   | Before self-attention       | Input preprocessing, prefix injection                |
+| `POST_ATTENTION`  | After attention, before FFN | Memory augmentation (reads enhance attention output) |
+| `PRE_FFN`         | Before FFN                  | Gate what the FFN processes                          |
+| `POST_FFN`        | After full block            | Gating, confidence estimation                        |
+| `RESIDUAL_STREAM` | Wraps entire block          | Steering vectors, thinking tokens, backtracking      |
 ## Layer Targeting

benchmark/memory_tasks.py CHANGED Viewed

@@ -16,6 +16,7 @@ import string
 from typing import List, Dict, Optional, Tuple
 from benchmark.scoring import generate_and_check
 class PasskeyRetrieval:
@@ -87,7 +88,7 @@ class PasskeyRetrieval:
         model,
         tokenizer,
         n_per_length: int = 5,
-        device: str = "cuda",
         seed: int = 42,
     ) -> Dict:
         """
@@ -95,6 +96,8 @@ class PasskeyRetrieval:
         Returns dict with results per context length.
         """
         results = {}
         for ctx_len in self.context_lengths:
@@ -224,13 +227,15 @@ class MultiHopMemory:
         model,
         tokenizer,
         n: Optional[int] = None,
-        device: str = "cuda",
     ) -> Dict:
         """
         Run multi-hop memory benchmark.
         Returns accuracy and per-example results.
         """
         chains = self.FACT_CHAINS
         if n is not None:
             chains = chains[:n]

 from typing import List, Dict, Optional, Tuple
 from benchmark.scoring import generate_and_check
+from cortex.torch_device import resolve_torch_device
 class PasskeyRetrieval:
         model,
         tokenizer,
         n_per_length: int = 5,
+        device: Optional[str] = None,
         seed: int = 42,
     ) -> Dict:
         """
         Returns dict with results per context length.
         """
+        if device is None:
+            device = resolve_torch_device("auto")
         results = {}
         for ctx_len in self.context_lengths:
         model,
         tokenizer,
         n: Optional[int] = None,
+        device: Optional[str] = None,
     ) -> Dict:
         """
         Run multi-hop memory benchmark.
         Returns accuracy and per-example results.
         """
+        if device is None:
+            device = resolve_torch_device("auto")
         chains = self.FACT_CHAINS
         if n is not None:
             chains = chains[:n]

benchmark/run_benchmark.py CHANGED Viewed

@@ -57,7 +57,7 @@ def main():
     )
     parser.add_argument(
         "--device", type=str, default="auto",
-        help="Device: cuda, cpu, or auto",
     )
     parser.add_argument(
         "--dtype", type=str, default="float32",

     )
     parser.add_argument(
         "--device", type=str, default="auto",
+        help="Device: cuda, mps, cpu, or auto (auto: cuda > mps > cpu)",
     )
     parser.add_argument(
         "--dtype", type=str, default="float32",

benchmark/runner.py CHANGED Viewed

@@ -21,6 +21,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from benchmark.scoring import log_likelihood_score, accuracy_from_loglikelihoods
 from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
 from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
 class BenchmarkRunner:
@@ -43,7 +44,7 @@ class BenchmarkRunner:
         self.model_name = model_name
         if device == "auto":
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
         else:
             self.device = device

 from benchmark.scoring import log_likelihood_score, accuracy_from_loglikelihoods
 from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
 from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
+from cortex.torch_device import resolve_torch_device
 class BenchmarkRunner:
         self.model_name = model_name
         if device == "auto":
+            self.device = resolve_torch_device("auto")
         else:
             self.device = device

benchmark/scoring.py CHANGED Viewed

@@ -14,6 +14,8 @@ import torch.nn.functional as F
 from typing import List, Optional, Tuple, Dict
 import re
 @torch.no_grad()
 def log_likelihood_score(
@@ -21,7 +23,7 @@ def log_likelihood_score(
     tokenizer,
     context: str,
     continuations: List[str],
-    device: str = "cuda",
 ) -> List[float]:
     """
     Compute normalized log-likelihood for each continuation given a context.
@@ -36,11 +38,13 @@ def log_likelihood_score(
         tokenizer: The tokenizer
         context: The prompt/context string
         continuations: List of possible continuations to score
-        device: Device to use
     Returns:
         List of normalized log-likelihood scores (higher = model prefers this continuation)
     """
     scores = []
     for cont in continuations:
@@ -96,7 +100,7 @@ def generate_and_check(
     prompt: str,
     expected: str,
     max_new_tokens: int = 64,
-    device: str = "cuda",
     exact_match: bool = False,
 ) -> Tuple[bool, str]:
     """
@@ -108,12 +112,14 @@ def generate_and_check(
         prompt: The input prompt
         expected: The expected answer string
         max_new_tokens: Max tokens to generate
-        device: Device
         exact_match: If True, requires exact match; otherwise substring match
     Returns:
         (is_correct, generated_text)
     """
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
     # Pad token

 from typing import List, Optional, Tuple, Dict
 import re
+from cortex.torch_device import resolve_torch_device
 @torch.no_grad()
 def log_likelihood_score(
     tokenizer,
     context: str,
     continuations: List[str],
+    device: Optional[str] = None,
 ) -> List[float]:
     """
     Compute normalized log-likelihood for each continuation given a context.
         tokenizer: The tokenizer
         context: The prompt/context string
         continuations: List of possible continuations to score
+        device: Device to use (default: auto — cuda, then mps, then cpu)
     Returns:
         List of normalized log-likelihood scores (higher = model prefers this continuation)
     """
+    if device is None:
+        device = resolve_torch_device("auto")
     scores = []
     for cont in continuations:
     prompt: str,
     expected: str,
     max_new_tokens: int = 64,
+    device: Optional[str] = None,
     exact_match: bool = False,
 ) -> Tuple[bool, str]:
     """
         prompt: The input prompt
         expected: The expected answer string
         max_new_tokens: Max tokens to generate
+        device: Device (default: auto — cuda, then mps, then cpu)
         exact_match: If True, requires exact match; otherwise substring match
     Returns:
         (is_correct, generated_text)
     """
+    if device is None:
+        device = resolve_torch_device("auto")
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
     # Pad token

cortex/steering_vector.py CHANGED Viewed

@@ -26,6 +26,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from typing import Optional, Union, List, Dict, Tuple
 from cortex.core import CortexModule, InjectionPoint
 class SteeringVector(CortexModule):
@@ -120,7 +121,7 @@ class SteeringVector(CortexModule):
         negative_prompts: List[str],
         tokenizer,
         layer_idx: int,
-        device: str = "cuda"
     ) -> torch.Tensor:
         """
         Extract a steering direction via contrastive activation analysis.
@@ -137,11 +138,13 @@ class SteeringVector(CortexModule):
             negative_prompts: Prompts exemplifying the undesired behavior
             tokenizer: Model's tokenizer
             layer_idx: Which layer to extract from
-            device: Device
         Returns:
             direction: [hidden_dim] steering direction vector
         """
         model.eval()
         def get_activations(prompts):

 import torch.nn.functional as F
 from typing import Optional, Union, List, Dict, Tuple
 from cortex.core import CortexModule, InjectionPoint
+from cortex.torch_device import resolve_torch_device
 class SteeringVector(CortexModule):
         negative_prompts: List[str],
         tokenizer,
         layer_idx: int,
+        device: Optional[str] = None,
     ) -> torch.Tensor:
         """
         Extract a steering direction via contrastive activation analysis.
             negative_prompts: Prompts exemplifying the undesired behavior
             tokenizer: Model's tokenizer
             layer_idx: Which layer to extract from
+            device: Device (default: auto — cuda, then mps, then cpu)
         Returns:
             direction: [hidden_dim] steering direction vector
         """
+        if device is None:
+            device = resolve_torch_device("auto")
         model.eval()
         def get_activations(prompts):

cortex/torch_device.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""PyTorch device selection (CUDA, Apple MPS, CPU)."""
+import torch
def resolve_torch_device(preference: str = "auto") -> str:
    """
    Resolve a device string for PyTorch.

    ``auto`` (matched case-insensitively, ignoring surrounding whitespace)
    prefers CUDA, then Apple Metal (MPS), then CPU. ``None`` or a blank
    string is treated the same as ``auto``.

    Any other string is returned with surrounding whitespace stripped
    (e.g. ``cuda:0``); it is not validated against the available hardware.

    Args:
        preference: Requested device, or ``auto`` to pick the best available.

    Returns:
        A device string: the cleaned-up request, or one of ``cuda``,
        ``mps``, ``cpu`` when auto-selecting.
    """
    # Normalise once so the value we test is the same value we return;
    # a padded string such as " cpu " would otherwise be handed back verbatim.
    requested = (preference or "auto").strip()
    if requested and requested.lower() != "auto":
        return requested

    if torch.cuda.is_available():
        return "cuda"

    # ``torch.backends.mps`` may be absent on some PyTorch builds, so look it
    # up defensively instead of assuming the attribute exists.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"

    return "cpu"

test_cortex.py CHANGED Viewed

@@ -16,6 +16,7 @@ import sys
 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from cortex import (
     CortexSurgeon,
     MemoryBank,
@@ -29,7 +30,7 @@ import logging
 logging.basicConfig(level=logging.INFO, format="%(name)s | %(message)s")
 def main():
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"\n{'='*60}")
     print(f"CORTEX TEST — Device: {device}")
     print(f"{'='*60}\n")

 import os
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from cortex.torch_device import resolve_torch_device
 from cortex import (
     CortexSurgeon,
     MemoryBank,
 logging.basicConfig(level=logging.INFO, format="%(name)s | %(message)s")
 def main():
+    device = resolve_torch_device("auto")
     print(f"\n{'='*60}")
     print(f"CORTEX TEST — Device: {device}")
     print(f"{'='*60}\n")