diff --git a/benchmarks/ADRS/eplb/README.md b/benchmarks/ADRS/eplb/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2e7bacdcd82ac690859fd68c7e02a45f4cab08dc --- /dev/null +++ b/benchmarks/ADRS/eplb/README.md @@ -0,0 +1,63 @@ +# Expert Parallelism Load Balancer (EPLB) + +This benchmark uses SkyDiscover to optimize the Expert Parallelism Load Balancer (EPLB) algorithm for Mixture-of-Expert (MoE) models. The goal is to rearrange and replicate experts across GPUs to balance load, while keeping the rearrangement algorithm itself fast. + +## Setup + +1. **Install PyTorch** (required by the evaluator): + + ```bash + uv pip install torch + ``` + +2. **Download the workload file** from [Hugging Face](https://huggingface.co/datasets/abmfy/eplb-openevolve) into this directory: + + ```bash + cd benchmarks/ADRS/eplb + wget https://huggingface.co/datasets/abmfy/eplb-openevolve/resolve/main/expert-load.json + ``` + +3. **Set your API key:** + + ```bash + export OPENAI_API_KEY=... + ``` + +## Run + +From the repo root: + +```bash +uv run skydiscover-run \ + benchmarks/ADRS/eplb/initial_program.py \ + benchmarks/ADRS/eplb/evaluator.py \ + -c benchmarks/ADRS/eplb/config.yaml \ + -s [your_algorithm] \ + -i 100 \ + -o eplb_output +``` + +Or from this directory: + +```bash +uv run skydiscover-run initial_program.py evaluator.py \ + -c config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +## Evaluate a saved program + +```bash +python evaluate_best_program.py +``` + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Baseline `rebalance_experts` function to evolve | +| `evaluator.py` | Scores programs on load-balance quality and execution speed | +| `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) | +| `evaluate_best_program.py` | Standalone script to evaluate a saved best program | +| `expert-load.json` | Workload data (must be downloaded — see Setup) | diff --git a/benchmarks/ADRS/eplb/initial_program.py b/benchmarks/ADRS/eplb/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..04cb8c59310d19965a980b37d3ec365f2e834a97 --- /dev/null +++ b/benchmarks/ADRS/eplb/initial_program.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Expert parallelism load balancer (EPLB) for vLLM. + +This module implements the core rearrangement algorithm. + +The rearrangement algorithm is adapted from +[DeepSeek EPLB](https://github.com/deepseek-ai/eplb). + +Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example +on how the EPLB algorithm works. +""" + +# EVOLVE-BLOCK-START + +import torch + + +def balanced_packing(weight: torch.Tensor, + num_packs: int) -> tuple[torch.Tensor, torch.Tensor]: + """ + Pack n weighted objects to m packs, such that each bin contains exactly + n/m objects and the weights of all packs are as balanced as possible. 
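+    For example (illustrative), with weight = [[4, 3, 2, 1]] and
+    num_packs = 2, the greedy packing assigns items 0 and 3 to one pack and
+    items 1 and 2 to the other, so both packs carry a total weight of 5.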
+ + Parameters: + weight: [X, n], the weight of each item + num_packs: number of packs + + Returns: + pack_index: [X, n], the pack index of each item + rank_in_pack: [X, n], the rank of the item in the pack + """ + num_layers, num_groups = weight.shape + assert num_groups % num_packs == 0 + groups_per_pack = num_groups // num_packs + + if groups_per_pack == 1: + pack_index = torch.arange(weight.size(-1), + dtype=torch.int64, + device=weight.device).expand(weight.shape) + rank_in_pack = torch.zeros_like(weight, dtype=torch.int64) + return pack_index, rank_in_pack + + indices = weight.float().sort(-1, descending=True).indices.cpu() + pack_index = torch.full_like(weight, + fill_value=-1, + dtype=torch.int64, + device="cpu") + rank_in_pack = torch.full_like(pack_index, fill_value=-1) + for i in range(num_layers): + pack_weights = [0] * num_packs + pack_items = [0] * num_packs + for group in indices[i]: + pack = min( + (i + for i in range(num_packs) if pack_items[i] < groups_per_pack), + key=pack_weights.__getitem__, + ) + assert pack_items[pack] < groups_per_pack + pack_index[i, group] = pack + rank_in_pack[i, group] = pack_items[pack] + pack_weights[pack] += weight[i, group] + pack_items[pack] += 1 + return pack_index, rank_in_pack + + +def replicate_experts( + weight: torch.Tensor, + num_phy: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Replicate `num_log` experts to `num_phy` replicas, such that the maximum + load of all replicas is minimized. + + Parameters: + weight: [X, num_log] + num_phy: total number of experts after replication + + Returns: + phy2log: [X, num_phy], logical expert id of each physical expert + rank: [X, num_phy], the replica rank + logcnt: [X, num_log], number of replicas for each logical expert + """ + n, num_log = weight.shape + num_redundant = num_phy - num_log + assert num_redundant >= 0 + device = weight.device + phy2log = torch.arange(num_phy, dtype=torch.int64, + device=device).repeat(n, 1) + rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) + logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) + arangen = torch.arange(n, dtype=torch.int64, device=device) + for i in range(num_log, num_phy): + redundant_indices = (weight / logcnt).max(dim=-1).indices + phy2log[:, i] = redundant_indices + rank[:, i] = logcnt[arangen, redundant_indices] + logcnt[arangen, redundant_indices] += 1 + return phy2log, rank, logcnt + + +def rebalance_experts_hierarchical( + weight: torch.Tensor, + num_physical_experts: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +): + """ + Parameters: + weight: [num_moe_layers, num_logical_experts] + num_physical_experts: number of physical experts after replication + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [num_moe_layers, num_physical_experts] + logical_to_physical_map: [num_moe_layers, num_logical_experts, X] + logical_count: [num_moe_layers, num_logical_experts] + """ + num_layers, num_logical_experts = weight.shape + assert num_logical_experts % num_groups == 0 + group_size = num_logical_experts // num_groups + assert num_groups % num_nodes == 0 + groups_per_node = num_groups // num_nodes + assert num_gpus % num_nodes == 0 + assert num_physical_experts % num_gpus == 0 + phy_experts_per_gpu = num_physical_experts // num_gpus + + def inverse(perm: torch.Tensor) -> torch.Tensor: + inv = 
torch.empty_like(perm) + inv.scatter_( + 1, + perm, + torch.arange(perm.size(1), dtype=torch.int64, + device=perm.device).expand(perm.shape), + ) + return inv + + # Step 1: pack groups to nodes + tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) + group_pack_index, group_rank_in_pack = balanced_packing( + tokens_per_group, num_nodes) + log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * + group_size).unsqueeze(-1) + + torch.arange(group_size, + dtype=torch.int64, + device=group_pack_index.device)).flatten(-2) + mlog2log = inverse(log2mlog) + + # Step 2: construct redundant experts within nodes + # [num_layers * num_nodes, num_logical_experts // num_nodes] + tokens_per_mlog = weight.gather(-1, mlog2log).view( + -1, num_logical_experts // num_nodes) + phy2mlog, phyrank, mlogcnt = replicate_experts( + tokens_per_mlog, num_physical_experts // num_nodes) + + # Step 3: pack physical_experts to GPUs + # [num_layers * num_nodes, num_physical_experts // num_nodes] + tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) + pack_index, rank_in_pack = balanced_packing(tokens_per_phy, + num_gpus // num_nodes) + phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack + pphy2phy = inverse(phy2pphy) + + pphy2mlog = phy2mlog.gather( + -1, pphy2phy) # [num_layers * num_nodes, num_log_per_nodes] + pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + device=group_pack_index.device, + ).view(1, -1, 1)).flatten(-2) + pphy2log = mlog2log.gather(-1, pphy2mlog) + pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) + logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog) + return pphy2log, pphyrank, logcnt + + +def rebalance_experts( + weight: torch.Tensor, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Entry point for expert-parallelism load balancer. 
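+    For example (illustrative), a weight tensor of shape [2, 64] with
+    num_replicas=72, num_groups=8, num_nodes=2 and num_gpus=8 yields 9
+    physical experts per GPU, with the 8 extra slots used to replicate the
+    most heavily loaded experts within each node.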
+ + Parameters: + weight: [layers, num_logical_experts], the load statistics for all + logical experts + num_replicas: number of physical experts, must be a multiple of + `num_gpus` + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network + (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [layers, num_replicas], the expert index of + each replica + logical_to_physical_map: [layers, num_logical_experts, X], the replica + indices for each expert + expert_count: [layers, num_logical_experts], number of physical + replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + weight = weight.float().cpu() + if num_groups % num_nodes == 0: + # use hierarchical load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, num_groups, num_nodes, num_gpus) + else: + # use global load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, 1, 1, num_gpus) + num_redundant_experts = num_replicas - num_logical_experts + maxlogcnt = num_redundant_experts + 1 + log2phy: torch.Tensor = torch.full( + (num_layers, num_logical_experts, maxlogcnt), + -1, + dtype=torch.int64, + device=logcnt.device, + ) + log2phy.view(num_layers, -1).scatter_( + -1, + phy2log * maxlogcnt + phyrank, + torch.arange(num_replicas, dtype=torch.int64, + device=log2phy.device).expand(num_layers, -1), + ) + return phy2log, log2phy, logcnt + + +# EVOLVE-BLOCK-END + +__all__ = ["rebalance_experts"] + diff --git a/benchmarks/ADRS/llm_sql/evaluator/utils.py b/benchmarks/ADRS/llm_sql/evaluator/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..be5c1f0547b1b204324e317b5f4dbd2d59eda4f5 --- /dev/null +++ b/benchmarks/ADRS/llm_sql/evaluator/utils.py @@ -0,0 +1,81 @@ +from concurrent.futures import ThreadPoolExecutor +import pandas as pd +from typing import List, Tuple + +class TrieNode: + def __init__(self): + self.children = {} + self.end_of_word = False + + +class Trie: + def __init__(self): + self.root = TrieNode() + + def insert(self, word): + node = self.root + for char in word: + if char not in node.children: + node.children[char] = TrieNode() + node = node.children[char] + node.end_of_word = True + + def longest_common_prefix(self, word): + node = self.root + common_prefix_length = 0 + for char in word: + if char in node.children: + common_prefix_length += len(char) + node = node.children[char] + else: + break + return common_prefix_length + +def calculate_length(value): + val = 0 + if isinstance(value, bool): + val = 4 # length of 'True' or 'False' + elif isinstance(value, (int, float)): + val = len(str(value)) + elif isinstance(value, str): + val = len(value) + else: + val = 0 + return val**2 + +def evaluate_df_prefix_hit_cnt(df: pd.DataFrame) -> Tuple[int, int]: + """ + Function to evaluate the prefix hit count of a DataFrame + """ + + def max_overlap(trie, row_string): + return min(len(row_string), trie.longest_common_prefix(row_string)) + + + trie = Trie() + total_prefix_hit_count = 0 + total_string_length = 0 + + def process_row(index, row): + nonlocal total_string_length + row_string = "".join(row.fillna("").astype(str).values) # No spaces between columns + total_string_length += len(row_string) + row_prefix_hit_count = max_overlap(trie, row_string) + trie.insert(row_string) + return row_prefix_hit_count + + with ThreadPoolExecutor() as executor: + results = 
executor.map(process_row, df.index, [row for _, row in df.iterrows()]) + + total_prefix_hit_count = sum(results) + total_prefix_hit_rate = total_prefix_hit_count / total_string_length + assert total_prefix_hit_count <= total_string_length + print(f"Total string length: {total_string_length}") + no_cache_pricing = 2.5 / 5 # per 1M if not cached + cache_pricing = 1.25 / 5 # per 1M if cached + cached_tokens_pricing = total_prefix_hit_count * cache_pricing / 1e6 + non_cached_tokens_pricing = (total_string_length - total_prefix_hit_count) * no_cache_pricing / 1e6 + print( + f"Cached tokens pricing = {round(cached_tokens_pricing,2)}, Non-cached tokens pricing = {round(non_cached_tokens_pricing,2)}, total pricing = {round(cached_tokens_pricing + non_cached_tokens_pricing,2)}" + ) + return total_prefix_hit_count, total_prefix_hit_rate * 100 \ No newline at end of file diff --git a/benchmarks/ADRS/prism/config.yaml b/benchmarks/ADRS/prism/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8606e18c28146db6bbc073acb15863388f17601a --- /dev/null +++ b/benchmarks/ADRS/prism/config.yaml @@ -0,0 +1,24 @@ +# Prism (GPU Model Placement) — Prompt Caching Column Reordering Optimization +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 5 +max_solution_length: 60000 + +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 + +prompt: + system_message: |- + You are an expert for model placement on GPUs. Your task is to improve a model placement algorithm by improve the function named compute_model_placement in the intial program that places models to available GPUs. + The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring models can fit into the GPUs' memory. Note that KVPR is KV cache pressure for a GPU. It indicates how crowded a GPU is. For a specific GPU, its KVPR is computed as sum(model.req_rate/model.slo for model in models) / (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are the models on this GPU. The generated program should be as simple as possible and the code should be executed correctly without errors. + +evaluator: + timeout: 360 + diff --git a/benchmarks/ADRS/prism/evaluator/Dockerfile b/benchmarks/ADRS/prism/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..d63d4207fc866160550ef20e8707f0e00e4a0615 --- /dev/null +++ b/benchmarks/ADRS/prism/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict, bridging them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . 
+RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/ADRS/prism/evaluator/evaluator.py b/benchmarks/ADRS/prism/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..8689033cbbcb0bb74c1dcb50b73604fa59c287ba --- /dev/null +++ b/benchmarks/ADRS/prism/evaluator/evaluator.py @@ -0,0 +1,259 @@ +import importlib.util +import numpy as np +import time +import concurrent.futures +import traceback +from dataclasses import dataclass + +GPU_MEM_SIZE = 80 # GB +MIN_INT = float('-inf') # Define MIN_INT as negative infinity + +@dataclass +class Model: + model_name: str + model_size: int + req_rate: int + slo: int + cur_gpu_id: int + + +def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30): + """ + Run a function with a timeout using concurrent.futures + """ + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args, **kwargs) + try: + result = future.result(timeout=timeout_seconds) + return result + except concurrent.futures.TimeoutError: + raise TimeoutError(f"Function timed out after {timeout_seconds} seconds") + + +def safe_float(value): + """Convert a value to float safely""" + try: + if np.isnan(value) or np.isinf(value): + return 0.0 + return float(value) + except (TypeError, ValueError): + return 0.0 + +def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool: + """ + Verify the whether models can fit into GPU memory + """ + # Check if the placement data is valid + if placement_data is None: + return False + + # Check if the placement data is valid + for gpu_id, models in placement_data.items(): + if sum(model.model_size for model in models) > GPU_MEM_SIZE: + return False + + return True + + +def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float: + """ + Calculate the KVCache pressure + """ + max_kvpr = MIN_INT + for gpu_id, models in placement_data.items(): + total_model_size = sum(model.model_size for model in models) + total_weighted_req_rate = sum(model.req_rate / model.slo for model in models) + if GPU_MEM_SIZE - total_model_size > 0: + kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size) + else: + kvpr = 1000000 + max_kvpr = max(max_kvpr, kvpr) + + return max_kvpr + + +def generate_test_gpu_models(num_tests=50): + """ + Generate multiple test signals with different characteristics + """ + test_cases = [] + np.random.seed(42) + + for i in range(num_tests): + gpu_num = np.random.randint(5, 10) + gpu_models = [] + for j in range(gpu_num*2): + model_size = np.random.randint(10, 30) + req_rate = np.random.randint(1, 10) + slo = np.random.randint(5, 10) + gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j)) + + test_cases.append((gpu_num, gpu_models)) + + return test_cases + +def evaluate(program_path): + """ + Main evaluation function that tests the signal processing algorithm + on multiple test signals and calculates the composite performance metric. 
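+
+    Each test case is a randomly generated set of GPUs and models (see
+    generate_test_gpu_models). Note that the returned "max_kvpr" metric is
+    reported as 1 / (average max KVPR) over successful runs, so higher is
+    better; combined_score adds success_rate to that value.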
+ """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "compute_model_placement"): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "Missing compute_model_placement function", + } + + # Generate test gpu and models + test_gpu_models = generate_test_gpu_models() + + # Collect metrics across all tests + all_kvpr = [] + all_metrics = [] + successful_runs = 0 + + for i, (gpu_num, gpu_models) in enumerate(test_gpu_models): + try: + # Run the algorithm with timeout + start_time = time.time() + + # Call the program's main function + result = run_with_timeout( + program.compute_model_placement, + kwargs={ + 'gpu_num': gpu_num, + 'models': gpu_models + }, + timeout_seconds=10 + ) + + execution_time = time.time() - start_time + + # Validate result format + if not isinstance(result, dict): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Placement {i}: Expected dict, got {type(result).__name__}", + } + + # Validate all models are placed + placed_models = [] + for gpu_id, assigned_models in result.items(): + if not isinstance(assigned_models, list): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}", + } + placed_models.extend(assigned_models) + + if len(placed_models) != len(gpu_models): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}", + } + + # Check for duplicate placements (by object identity) + placed_ids = [id(m) for m in placed_models] + if len(set(placed_ids)) != len(placed_ids): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Duplicate models detected", + } + + # Check placed models are the exact input objects + original_ids = {id(m) for m in gpu_models} + if set(placed_ids) != original_ids: + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "Placed models don't match input models (missing or foreign models)", + } + + # Verify GPU memory constraints + if not verify_gpu_mem_constraint(result): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"GPU memory constraint violated", + } + + # Calculate metrics using the generated test signal + max_kvpr = calculate_kvcache_pressure(result) + + # Store metrics + metrics = { + 'max_kvpr': safe_float(max_kvpr), + 'execution_time': safe_float(execution_time), + } + + all_kvpr.append(safe_float(max_kvpr)) + all_metrics.append(metrics) + successful_runs += 1 + + except TimeoutError: + print(f"Placement {i}: Timeout") + continue + except Exception as e: + print(f"Placement {i}: Error - {str(e)}") + continue + + # If no successful runs, return minimal scores + if successful_runs == 0: + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "All test signals failed" + } + + print(all_metrics) + # Calculate aggregate metrics + avg_kvpr = np.mean(all_kvpr) + if avg_kvpr != 0: + avg_kvpr = 1.0 / avg_kvpr + avg_execution_time = np.mean([m['execution_time'] for m in all_metrics]) + success_rate = successful_runs / len(test_gpu_models) + + return { + "max_kvpr": safe_float(avg_kvpr), + "execution_time": 
safe_float(avg_execution_time), + "success_rate": safe_float(success_rate), + "combined_score": safe_float(avg_kvpr) + safe_float(success_rate), + } + + except Exception as e: + print(f"Evaluation failed: {str(e)}") + print(traceback.format_exc()) + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": str(e) + } + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is auto-injected at build time from + # skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/ADRS/prism/evaluator/requirements.txt b/benchmarks/ADRS/prism/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..296d654528b719e554528b956c4bf5a1516e812c --- /dev/null +++ b/benchmarks/ADRS/prism/evaluator/requirements.txt @@ -0,0 +1 @@ +numpy \ No newline at end of file diff --git a/benchmarks/ADRS/prism/evaluator/wrapper.py b/benchmarks/ADRS/prism/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/ADRS/prism/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/arc_benchmark/config.yaml b/benchmarks/arc_benchmark/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b11cef587896cdb13bc50a172b0470fd29675bc --- /dev/null +++ b/benchmarks/arc_benchmark/config.yaml @@ -0,0 +1,51 @@ +# ARC Benchmark base config +# This file is used by generate_config.py to inject a task-specific prompt. +# Switch models by editing the 'llm' section below. + +# General settings +max_iterations: 30 +checkpoint_interval: 10 +log_level: "INFO" +random_seed: 42 +diff_based_generation: true +max_solution_length: 50000 + +# LLM configuration +llm: + models: + - name: "gpt-5" + weight: 1.0 + api_base: "https://api.openai.com/v1" + temperature: 0.7 + # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p + max_tokens: 32768 + timeout: 3000 + +# Option B: Gemini 3 Pro (comment Option A and uncomment below) +# llm: +# models: +# - name: "gemini-3-pro-preview" +# weight: 1.0 +# api_base: "https://generativelanguage.googleapis.com/v1beta/openai/" +# temperature: 0.7 +# top_p: 0.95 +# max_tokens: 32768 +# timeout: 3000 + +# Search configuration (default: top-k) +search: + type: "topk" + database: + random_seed: 42 + num_context_programs: 4 + +# Prompt configuration +# NOTE: generate_config.py overwrites prompt.system_message per task. +prompt: + system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG" + +# Evaluator configuration +evaluator: + timeout: 360 + max_retries: 3 + cascade_evaluation: false diff --git a/benchmarks/arc_benchmark/convert_arc_agi2_data.py b/benchmarks/arc_benchmark/convert_arc_agi2_data.py new file mode 100644 index 0000000000000000000000000000000000000000..78d588d4fa3c7f2260d553d571dad5d3b8bbc0fa --- /dev/null +++ b/benchmarks/arc_benchmark/convert_arc_agi2_data.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +""" +Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json) +into the format expected by this benchmark: + - arc-agi_{split}_challenges.json (task_id -> { train, test with inputs only }) + - arc-agi_{split}_solutions.json (task_id -> list of test output grids) + +Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation): + OUT_DIR=./data python3 convert_arc_agi2_data.py . + +Or with an external ARC-AGI-2 clone: + python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2 + # Writes into that path by default; set OUT_DIR to write elsewhere. 
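+
+Output layout (illustrative sketch of the emitted JSON):
+  arc-agi_<split>_challenges.json: {"<task_id>": {"train": [...], "test": [{"input": grid}, ...]}}
+  arc-agi_<split>_solutions.json:  {"<task_id>": [grid, ...]}  # one output grid per test input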
+""" +import json +import os +import sys + + +def convert_split(repo_root: str, split: str, out_dir: str) -> None: + """Convert data/{split}/*.json into challenges + solutions JSON.""" + split_dir = os.path.join(repo_root, "data", split) + if not os.path.isdir(split_dir): + print(f"Skip {split}: no directory {split_dir}") + return + + challenges = {} + solutions = {} + + for name in sorted(os.listdir(split_dir)): + if not name.endswith(".json"): + continue + task_id = name[:-5] # strip .json + path = os.path.join(split_dir, name) + with open(path, "r") as f: + task = json.load(f) + # Challenge: train as-is; test with only "input" (no output) + challenges[task_id] = { + "train": task["train"], + "test": [{"input": p["input"]} for p in task["test"]], + } + # Solutions: list of test output grids + solutions[task_id] = [p["output"] for p in task["test"]] + + challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json") + solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json") + with open(challenges_path, "w") as f: + json.dump(challenges, f) + with open(solutions_path, "w") as f: + json.dump(solutions, f) + print(f"Wrote {challenges_path} ({len(challenges)} tasks)") + print(f"Wrote {solutions_path} ({len(solutions)} tasks)") + + +def main(): + repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".") + out_dir = os.getenv("OUT_DIR", repo_root) + for split in ("training", "evaluation"): + convert_split(repo_root, split, out_dir) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/arc_benchmark/evaluator/Dockerfile b/benchmarks/arc_benchmark/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/arc_benchmark/evaluator/evaluate.sh b/benchmarks/arc_benchmark/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. 
+ +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/arc_benchmark/evaluator/evaluator.py b/benchmarks/arc_benchmark/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..de18fd5bcb88cca5a5a01da83863541956ebee8d --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/evaluator.py @@ -0,0 +1,407 @@ +import numpy as np +from typing import List, Tuple, Dict, Any +import json +import os + +try: + from skydiscover.evaluation.evaluation_result import EvaluationResult +except ImportError: + from dataclasses import dataclass, field + from typing import Union + + @dataclass + class EvaluationResult: + metrics: Dict[str, float] + artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict) +import importlib.util + +TASK_FILE = os.getenv("ARC_TASK_FILE", "training") +TASK_NUM = os.getenv("TASK_NUM", 0) +DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")) +INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes") +USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes") + + +def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float: + """ + Compute continuous cell-level accuracy between prediction and ground truth. + Returns a float in [0, 1]. Handles shape mismatches gracefully. + """ + if pred.shape != gt.shape: + # Partial credit for getting shape partially right + shape_score = 0.0 + if len(pred.shape) == len(gt.shape) == 2: + row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0 + col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0 + shape_score = (row_match + col_match) * 0.1 # up to 0.2 for correct dimensions + return shape_score + # Cell-level accuracy + total_cells = gt.size + if total_cells == 0: + return 1.0 + correct_cells = int(np.sum(pred == gt)) + return correct_cells / total_cells + + +def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float: + """Return the best cell accuracy across all attempts for one example.""" + return max(cell_accuracy_single(a, gt) for a in attempts) + + +def pass_at_2_accuracy_single( + attempts: List[np.ndarray], + gt: np.ndarray +) -> Tuple[int, Dict[int, Any]]: + """ + Compute pass@2 accuracy for a single ARC test case. + + Args: + attempts: List of 2 numpy arrays representing model attempts. + gt: Ground-truth output as a 2D numpy array. + + Returns: + pass_at_2: int (1 if any attempt is perfectly correct, else 0) + diagnostics: dict mapping attempt index -> diagnostic info. + If sizes match, includes indices of incorrect cells. + """ + assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation." 
+ + diagnostics = {} + passed = False + + for i, pred in enumerate(attempts): + attempt_info = {} + + # Size check + if pred.shape != gt.shape: + attempt_info["size_match"] = False + attempt_info["pred_shape"] = list(pred.shape) + attempt_info["gt_shape"] = list(gt.shape) + attempt_info["incorrect_indices"] = None + attempt_info["cell_accuracy"] = 0.0 + attempt_passed = False + else: + attempt_info["size_match"] = True + + # Find incorrect cells + incorrect_mask = pred != gt + incorrect_indices = np.argwhere(incorrect_mask) + + attempt_info["incorrect_indices"] = incorrect_indices.tolist() + attempt_info["num_incorrect"] = int(incorrect_mask.sum()) + attempt_info["num_total"] = int(gt.size) + attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size + + # Perfect match + if incorrect_mask.sum() == 0: + attempt_passed = True + else: + attempt_passed = False + + attempt_info["perfect_match"] = attempt_passed + passed = attempt_passed or passed + + diagnostics[i] = attempt_info + + pass_at_2 = 1 if passed else 0 + + return pass_at_2, diagnostics + +def pass_at_2_accuracy_multi_test( + all_attempts: List[List[np.ndarray]], + all_gt: List[np.ndarray] +) -> Tuple[List[int], List[Dict[int, Any]]]: + """ + Compute pass@2 accuracy across multiple ARC test cases. + + Args: + all_attempts: List of lists of 2 numpy arrays for each test case. + all_gt: List of ground-truth outputs as 2D numpy arrays. + """ + assert len(all_attempts) == len(all_gt), "Mismatched number of test cases." + + all_diagnostics = [] + all_pass = [] + + for attempts, gt in zip(all_attempts, all_gt): + pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt) + all_pass.append(pass_at_2) + all_diagnostics.append(diagnostics) + + return all_pass, all_diagnostics + +def extract_failure_artifacts(diagnostics, pred=None, gt=None): + """ + Extract failure artifacts from diagnostics for a given example. + Includes actual vs expected output snippets for better LLM feedback. + """ + artifacts = {} + if not diagnostics["size_match"]: + artifacts["error_type"] = "SizeMismatch" + artifacts["error_message"] = ( + f"Output shape {diagnostics['pred_shape']} does not match " + f"expected shape {diagnostics['gt_shape']}." + ) + artifacts["suggestion"] = ( + f"Your output has shape {diagnostics['pred_shape']} but the correct output " + f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions." + ) + else: + num_incorrect = diagnostics['num_incorrect'] + num_total = diagnostics['num_total'] + accuracy = diagnostics['cell_accuracy'] + artifacts["error_type"] = "IncorrectCells" + artifacts["error_message"] = ( + f"{num_incorrect}/{num_total} cells incorrect " + f"(cell accuracy: {accuracy:.1%})." + ) + # Show a compact diff of expected vs actual for first few wrong cells + if diagnostics['incorrect_indices'] and pred is not None and gt is not None: + wrong = diagnostics['incorrect_indices'][:8] # first 8 wrong cells + diff_lines = [] + for r, c in wrong: + diff_lines.append(f" [{r},{c}]: got {int(pred[r,c])}, expected {int(gt[r,c])}") + artifacts["cell_diffs"] = "\n".join(diff_lines) + if len(diagnostics['incorrect_indices']) > 8: + artifacts["cell_diffs"] += f"\n ... and {len(diagnostics['incorrect_indices'])-8} more" + artifacts["suggestion"] = ( + f"Your solution gets {accuracy:.1%} of cells correct. " + f"Review the transformation logic for the failing cells." + ) + + return artifacts + +def evaluate(program_path): + """ + Evaluate the program on ARC task training (and optionally test) examples. 
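+    Task selection is driven by the ARC_TASK_FILE and TASK_NUM environment
+    variables; ARC_EVAL_INCLUDE_TEST=1 additionally reports test-split metrics,
+    and ARC_EVAL_USE_TEST_FOR_SCORE=1 folds them into combined_score.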
+ + Returns a combined_score that blends: + - pass@2 (binary perfect-match, weighted 0.6) + - cell accuracy (continuous partial credit, weighted 0.4) + This gives evolution gradient signal even when no example is solved perfectly. + """ + spec = importlib.util.spec_from_file_location("program_module", program_path) + program_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program_module) + + if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'): + print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.") + + error_artifacts = { + "error_type": "MissingFunction", + "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.", + "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array." + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions" + }, + artifacts=error_artifacts + ) + + # Load ARC tasks + challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json") + + with open(challenge_path, 'r') as f: + tasks = json.load(f) + + task_id = list(tasks.keys())[int(TASK_NUM)] + task = tasks[task_id] + + train_inputs = [np.array(inp["input"]) for inp in task['train']] + train_gts = [np.array(gt["output"]) for gt in task['train']] + + train_attempts = [] + + # Generate attempts for training data + for inp in train_inputs: + attempt_1 = program_module.transform_grid_attempt_1(inp) + if not isinstance(attempt_1, np.ndarray): + print(f"transform_grid_attempt_1 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array." + } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_1 did not return a numpy array" + }, + artifacts=error_artifacts + ) + + attempt_2 = program_module.transform_grid_attempt_2(inp) + if not isinstance(attempt_2, np.ndarray): + print(f"transform_grid_attempt_2 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array." 
+ } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array" + }, + artifacts=error_artifacts + ) + train_attempts.append([attempt_1, attempt_2]) + + pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts) + + # Compute both binary pass@2 and continuous cell accuracy + train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train) + train_cell_acc = sum( + best_attempt_cell_accuracy(attempts, gt) + for attempts, gt in zip(train_attempts, train_gts) + ) / len(train_gts) + + # Blended score: pass@2 (60%) + cell accuracy (40%) gives gradient signal + train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc + + metrics = { + "runs_successfully": 1.0, + "combined_score": train_score, + "train_combined_score": train_score, + "train_pass_at_2_score": train_pass_score, + "train_cell_accuracy": round(train_cell_acc, 4), + } + error_artifacts = {} + for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)): + example_name = f"train_example_{i}" + metrics[f"{example_name}_pass_at_2"] = train_pass + best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i]) + metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4) + for attempt in train_diagnostics: + attempt_pass = train_diagnostics[attempt]["perfect_match"] + metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass + if not attempt_pass: + pred = train_attempts[i][attempt] + gt = train_gts[i] + error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts( + train_diagnostics[attempt], pred=pred, gt=gt + ) + + # Optional: include test feedback (uses solutions if available) + if INCLUDE_TEST: + solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json") + if os.path.isfile(solution_path): + with open(solution_path, 'r') as f: + solutions = json.load(f) + task_id = list(tasks.keys())[int(TASK_NUM)] + solution = solutions.get(task_id) + if solution is not None and "test" in task: + if len(task["test"]) != len(solution): + raise ValueError( + f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs " + f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json " + f"and arc-agi_{TASK_FILE}_solutions.json were generated together." + ) + test_inputs = [np.array(inp["input"]) for inp in task['test']] + test_gts = [np.array(gt) for gt in solution] + + test_attempts = [] + for inp in test_inputs: + attempt_1 = program_module.transform_grid_attempt_1(inp) + if not isinstance(attempt_1, np.ndarray): + print(f"transform_grid_attempt_1 did not return a numpy array (test)") + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_1 did not return a numpy array (test)" + }, + artifacts={ + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).", + "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array." 
+ } + ) + + attempt_2 = program_module.transform_grid_attempt_2(inp) + if not isinstance(attempt_2, np.ndarray): + print(f"transform_grid_attempt_2 did not return a numpy array (test)") + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array (test)" + }, + artifacts={ + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).", + "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array." + } + ) + test_attempts.append([attempt_1, attempt_2]) + + pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts) + test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test) + test_cell_acc = sum( + best_attempt_cell_accuracy(attempts, gt) + for attempts, gt in zip(test_attempts, test_gts) + ) / len(test_gts) + test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc + + metrics["test_combined_score"] = test_score + metrics["test_pass_at_2_score"] = test_pass_score + metrics["test_cell_accuracy"] = round(test_cell_acc, 4) + metrics["test_included"] = 1 + + for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)): + example_name = f"test_example_{i}" + metrics[f"{example_name}_pass_at_2"] = test_pass + best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i]) + metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4) + for attempt in test_diagnostics: + metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"] + if test_pass == 0: + first_failing_idx = next( + (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]), + 0, + ) + pred = test_attempts[i][first_failing_idx] + gt = test_gts[i] + error_artifacts[f"{example_name}"] = extract_failure_artifacts( + test_diagnostics[first_failing_idx], pred=pred, gt=gt + ) + + if USE_TEST_IN_SCORE: + metrics["combined_score"] = (train_score + test_score) / 2.0 + else: + metrics["test_included"] = 0 + else: + metrics["test_included"] = 0 + + return EvaluationResult( + metrics=metrics, + artifacts=error_artifacts + ) + + +def _evaluate_as_dict(program_path): + """Adapter: calls evaluate() and converts EvaluationResult to a plain dict.""" + result = evaluate(program_path) + d = dict(result.metrics) + for k, v in result.artifacts.items(): + d[k] = v + return d + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> EvaluationResult to the + # container JSON protocol. wrapper.py is copied from + # skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(_evaluate_as_dict) diff --git a/benchmarks/arc_benchmark/evaluator/requirements.txt b/benchmarks/arc_benchmark/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..296d654528b719e554528b956c4bf5a1516e812c --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/requirements.txt @@ -0,0 +1 @@ +numpy \ No newline at end of file diff --git a/benchmarks/arc_benchmark/evaluator/wrapper.py b/benchmarks/arc_benchmark/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. 
This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/arc_benchmark/generate_config.py b/benchmarks/arc_benchmark/generate_config.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3da073fd23f9076ae081c193331b14def47d80 --- /dev/null +++ b/benchmarks/arc_benchmark/generate_config.py @@ -0,0 +1,101 @@ +import os +import yaml +import json + + +def load_task_as_prompt(task_json, task_num): + with open(task_json, 'r') as f: + tasks = json.load(f) + + task_id = list(tasks.keys())[int(task_num)] + task = tasks[task_id] + train_inputs = [inp["input"] for inp in task['train']] + train_outputs = [gt["output"] for gt in task['train']] + + train_pairs = "" + for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)): + train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n" + + prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles. +Find the common pattern that transforms each input grid into its corresponding output grid. + +Your task is to write python functions that implement the MOST GENERAL transformation rule. 
The rule must: +- Apply consistently to ALL training examples +- Generalize to unseen inputs (critical for success) +- Be based on structural patterns, not memorized examples +- Use relative/spatial rules rather than absolute coordinates + +Generalization rules (THIS IS CRITICAL): +- Infer the transformation ONLY from the training input-output pairs +- If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one +- Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules +- Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples +- Think: "What is the underlying principle?" not "What fits these specific examples?" +- Use numpy only (no external libraries) + +Common failure modes to avoid: +- Overfitting to specific grid sizes or positions in training examples +- Hardcoding colors, coordinates, or counts from training data +- Assuming global properties (like separator colors) without verifying across ALL examples +- Using absolute positions when relative/structural rules would generalize better + +Solution approach: +- Analyze the training examples to identify the CORE transformation principle +- Prefer block-wise, object-wise, or pattern-based rules that work locally +- If the grid has distinct regions, solve each region independently +- Build flexible rules that adapt to different input sizes and structures + +Training examples: +{train_pairs} + +Your task: Write 2 different Python functions that implement the general transformation rule. +- Each function takes a 2D numpy array as input and returns the transformed 2D numpy array +- The two attempts should use genuinely different strategies (e.g., different algorithmic approaches) +- Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases + +CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples. + +Remember to only output the modified python functions as your solution.""" + + return prompt + +def generate_config(task_num, task_file, dataset_root=None, base_config=None): + if dataset_root is None: + dataset_root = os.getenv("DATA_ROOT") + if not dataset_root: + dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json") + prompt = load_task_as_prompt(task_json, task_num) + + if base_config is None: + default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml") + base_config = os.getenv("BASE_CONFIG", default_base) + with open(base_config, 'r') as file: + config = yaml.safe_load(file) + + config['prompt']['system_message'] = prompt + # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config) + api_key_env = os.getenv("OPENAI_API_KEY") + if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key": + config["llm"]["api_key"] = api_key_env.strip() + # Override max_iterations from env if set (e.g. 
by run_discovery.sh) + max_iter_env = os.getenv("MAX_ITERATIONS") + if max_iter_env is not None and str(max_iter_env).strip() != "": + try: + config["max_iterations"] = int(max_iter_env) + except ValueError: + pass + + # Write to a per-task config file so parallel runs don't conflict + out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml") + with open(out_path, 'w') as file: + yaml.dump(config, file) + return out_path + +if __name__ == "__main__": + TASK_FILE = os.getenv("ARC_TASK_FILE", "training") + TASK_NUM = os.getenv("TASK_NUM", 0) + + path = generate_config(TASK_NUM, TASK_FILE) + print(path) + diff --git a/benchmarks/arc_benchmark/initial_program.py b/benchmarks/arc_benchmark/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..39f75f7c28a5a17a351ca335cd7ad5cf61c4136d --- /dev/null +++ b/benchmarks/arc_benchmark/initial_program.py @@ -0,0 +1,42 @@ +# EVOLVE-BLOCK-START + +import numpy as np + +def transform_grid_attempt_1(grid): + """ + Example transformation: + - Validate input (2D, integer values 0-9). + - Rotate the grid 90 degrees clockwise. + - Increment every cell by 1 modulo 10 (keeps values 0-9). + Returns a new numpy int array. + """ + arr = _validate_grid(grid) + out = np.rot90(arr, k=-1) # 90 degrees clockwise + out = (out + 1) % 10 + return out.astype(np.int32) + +def transform_grid_attempt_2(grid): + """ + Example transformation: + - Validate input (2D, integer values 0-9). + - Upsample each cell to a 2x2 block (doubling both dimensions). + - Invert colors by mapping v -> 9 - v (keeps values 0-9). + Returns a new numpy int array. + """ + arr = _validate_grid(grid) + out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1) + out = 9 - out + return out.astype(np.int32) + +# EVOLVE-BLOCK-END + +def _validate_grid(grid): + arr = np.asarray(grid) + if arr.ndim != 2: + raise ValueError("Input must be a 2D array.") + # cast to integer type for value checks + if not np.issubdtype(arr.dtype, np.integer): + arr = arr.astype(int) + if arr.size and (arr.min() < 0 or arr.max() > 9): + raise ValueError("Array values must be integers in the range 0-9.") + return arr \ No newline at end of file diff --git a/benchmarks/arc_benchmark/post_discovery_eval.py b/benchmarks/arc_benchmark/post_discovery_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..58e707580ead95624cf6321d481383fe9e8e9c83 --- /dev/null +++ b/benchmarks/arc_benchmark/post_discovery_eval.py @@ -0,0 +1,157 @@ +import importlib.util +import os +import json +import numpy as np +from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts + +TASK_FILE = os.getenv("ARC_TASK_FILE", "training") +TASK_NUM = os.getenv("TASK_NUM", 0) +OUTS_DIR = os.getenv("OUTS_DIR", "") +# Optional: path to a checkpoint dir (e.g. 
outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that best_program.py on test set +PROGRAM_DIR = os.getenv("PROGRAM_DIR", "") + + +def _program_path(): + """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/.""" + if PROGRAM_DIR: + return os.path.join(PROGRAM_DIR, "best_program.py") + return os.path.join(OUTS_DIR, "best", "best_program.py") + + +def _result_path(): + """Where to write post_evolution_evaluation_result.json.""" + if PROGRAM_DIR: + return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json") + return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json") + + +def load_program_module(): + """Dynamically load the best_program.py module from the specified directory.""" + path = _program_path() + if not os.path.isfile(path): + raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.") + spec = importlib.util.spec_from_file_location("program_module", path) + program_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program_module) + + return program_module + +def evaluate(): + """Evaluate the program module located in the specified directory.""" + program_module = load_program_module() + if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'): + print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.") + + error_artifacts = { + "error_type": "MissingFunction", + "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.", + "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array." + } + + return dict( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions" + }, + artifacts=error_artifacts + ) + # Load ARC tasks + data_root = os.getenv("DATA_ROOT") + if not data_root: + data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json") + solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json") + + with open(challenge_path, 'r') as f: + tasks = json.load(f) + with open(solution_path, 'r') as f: + solutions = json.load(f) + + task_id = list(tasks.keys())[int(TASK_NUM)] + solution = solutions[task_id] + task = tasks[task_id] + + # Sanity check: test inputs and solutions must align (same task, same order) + if len(task["test"]) != len(solution): + raise ValueError( + f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs " + f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json " + f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)." 
+ ) + + test_inputs = [np.array(inp["input"]) for inp in task['test']] + test_gts = [np.array(gt) for gt in solution] + + test_attempts = [] + for inp in test_inputs: + attempt_1 = program_module.transform_grid_attempt_1(inp) + if not isinstance(attempt_1, np.ndarray): + print(f"transform_grid_attempt_1 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array." + } + + return dict( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_1 did not return a numpy array" + }, + artifacts=error_artifacts + ) + + attempt_2 = program_module.transform_grid_attempt_2(inp) + if not isinstance(attempt_2, np.ndarray): + print(f"transform_grid_attempt_2 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array." + } + + return dict( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array" + }, + artifacts=error_artifacts + ) + test_attempts.append([attempt_1, attempt_2]) + + pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts) + metrics = { + "runs_successfully": 1.0, + "combined_score": sum(pass_at_2_test) / len(pass_at_2_test), + } + error_artifacts = {} + for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)): + example_name = f"test_example_{i}" + metrics[f"{example_name}_pass_at_2"] = test_pass + for attempt in test_diagnostics: + metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"] + if test_pass == 0: + # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict + first_failing = next( + (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]), + test_diagnostics[0], + ) + error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing) + + return dict( + metrics=metrics, + artifacts=error_artifacts + ) + +if __name__ == "__main__": + evaluation_result = evaluate() + result_path = _result_path() + os.makedirs(os.path.dirname(result_path), exist_ok=True) + with open(result_path, 'w') as f: + json.dump(evaluation_result, f, indent=4) + print(f"Test-set evaluation written to {result_path}") \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/README.md b/benchmarks/frontier-cs-eval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d7c97f8213ef9207738dd073295e909e02e0308e --- /dev/null +++ b/benchmarks/frontier-cs-eval/README.md @@ -0,0 +1,72 @@ +# Frontier-CS Benchmark + +Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover. + +## Setup + +```bash +# 1. Clone Frontier-CS +cd benchmarks/frontier-cs-eval +git clone https://github.com/FrontierCS/Frontier-CS.git + +# 2. Start the judge server (requires Docker) +cd Frontier-CS/algorithmic +docker compose up -d + +# 3. Install dependencies (from project root) +cd ../../.. +uv sync --extra frontier-cs + +# 4. Set your API key +export OPENAI_API_KEY=... 
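+
+# 5. (Optional) Point the evaluator at one or more judge servers.
+#    JUDGE_URLS is comma-separated and defaults to http://localhost:8081;
+#    the second URL below is only an illustrative placeholder.
+export JUDGE_URLS=http://localhost:8081,http://localhost:8082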
+``` + +## Run + +Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve` + + +Single problem: +```bash +cd benchmarks/frontier-cs-eval +FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \ + -c config.yaml -s [search_algorithm] -i 50 +``` + +All problems in parallel: +```bash +uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6 +``` + +## Evaluate best programs (post-discovery) + +```bash +uv run python run_best_programs_frontiercs.py +``` + +## Analyze results + +```bash +uv run python combine_results.py # merge training/testing scores into CSV +uv run python analyze_results.py # generate plots and statistics +``` + +## Files + +| File | Description | +|------|-------------| +| `initial_program.cpp` | Seed C++ program | +| `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge | +| `config.yaml` | Config with system prompt template | +| `run_all_frontiercs.py` | Parallelizes evolution across all problems | +| `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution | +| `combine_results.py` | Combines training/testing scores into CSV | +| `analyze_results.py` | Generates score analysis plots and statistics | + +## Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENAI_API_KEY` | (required) | API key | +| `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve | +| `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs | diff --git a/benchmarks/frontier-cs-eval/analyze_results.py b/benchmarks/frontier-cs-eval/analyze_results.py new file mode 100644 index 0000000000000000000000000000000000000000..c8359c99df099bf201b8deaf8d05c8e028e14901 --- /dev/null +++ b/benchmarks/frontier-cs-eval/analyze_results.py @@ -0,0 +1,105 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + +# Define paths +_script_dir = str(Path(__file__).resolve().parent) +input_csv = str(Path(_script_dir) / "combined_results.csv") +output_dir = _script_dir + +# Read the CSV file +df = pd.read_csv(input_csv) + +# Calculate average of training and testing scores +df['average_score'] = (df['training_score'] + df['testing_score']) / 2 + +# Remove rows where either score is None (NaN) +df_complete = df.dropna(subset=['training_score', 'testing_score']) + +print(f"\n=== Analysis Results ===") +print(f"Total problems: {len(df)}") +print(f"Problems with complete data: {len(df_complete)}") +print(f"\nTraining Scores:") +print(f" Mean: {df_complete['training_score'].mean():.4f}") +print(f" Median: {df_complete['training_score'].median():.4f}") +print(f" Std Dev: {df_complete['training_score'].std():.4f}") +print(f" Min: {df_complete['training_score'].min():.4f}") +print(f" Max: {df_complete['training_score'].max():.4f}") + +print(f"\nTesting Scores:") +print(f" Mean: {df_complete['testing_score'].mean():.4f}") +print(f" Median: {df_complete['testing_score'].median():.4f}") +print(f" Std Dev: {df_complete['testing_score'].std():.4f}") +print(f" Min: {df_complete['testing_score'].min():.4f}") +print(f" Max: {df_complete['testing_score'].max():.4f}") + +print(f"\nAverage Scores:") +print(f" Mean: {df_complete['average_score'].mean():.4f}") +print(f" Median: {df_complete['average_score'].median():.4f}") +print(f" Std Dev: {df_complete['average_score'].std():.4f}") + +# Save the updated CSV with averages +output_csv = Path(output_dir) / "combined_results_with_averages.csv" 
+df.to_csv(output_csv, index=False) +print(f"\nUpdated CSV with averages saved to {output_csv}") + +# Create visualizations +fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + +# 1. Scatter plot: Training vs Testing scores +ax = axes[0, 0] +ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50) +# Add diagonal line for reference (where training == testing) +lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()), + max(df_complete['training_score'].max(), df_complete['testing_score'].max())] +ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing') +ax.set_xlabel('Training Score') +ax.set_ylabel('Testing Score') +ax.set_title('Training vs Testing Scores') +ax.legend() +ax.grid(True, alpha=0.3) + +# 2. Distribution comparison - histograms +ax = axes[0, 1] +ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black') +ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black') +ax.set_xlabel('Score') +ax.set_ylabel('Frequency') +ax.set_title('Distribution of Training vs Testing Scores') +ax.legend() +ax.grid(True, alpha=0.3, axis='y') + +# 3. Box plot comparison +ax = axes[1, 0] +box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']] +bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average']) +ax.set_ylabel('Score') +ax.set_title('Score Comparison (Box Plot)') +ax.grid(True, alpha=0.3, axis='y') + +# 4. Difference plot: Training - Testing +ax = axes[1, 1] +difference = df_complete['training_score'] - df_complete['testing_score'] +ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50) +ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference') +ax.set_xlabel('Problem ID') +ax.set_ylabel('Training Score - Testing Score') +ax.set_title('Score Difference (Training - Testing)') +ax.legend() +ax.grid(True, alpha=0.3) + +plt.tight_layout() +plot_path = Path(output_dir) / "results_analysis.png" +plt.savefig(plot_path, dpi=300, bbox_inches='tight') +print(f"Plot saved to {plot_path}") + +# Additional statistics about differences +print(f"\nScore Differences (Training - Testing):") +print(f" Mean Difference: {difference.mean():.4f}") +print(f" Median Difference: {difference.median():.4f}") +print(f" Std Dev: {difference.std():.4f}") +print(f" Problems where training > testing: {(difference > 0).sum()}") +print(f" Problems where testing > training: {(difference < 0).sum()}") + +plt.show() diff --git a/benchmarks/frontier-cs-eval/combine_results.py b/benchmarks/frontier-cs-eval/combine_results.py new file mode 100644 index 0000000000000000000000000000000000000000..8df35c7131cac075c4f9a29e45fc1c4440511d03 --- /dev/null +++ b/benchmarks/frontier-cs-eval/combine_results.py @@ -0,0 +1,66 @@ +import json +import csv +import os +from pathlib import Path + +# Define paths +_script_dir = Path(__file__).resolve().parent +_repo_root = _script_dir.parent.parent +training_dir = str(_repo_root / "outputs" / "frontier_cs") +testing_dir = str(_script_dir / "evaluation_results") +output_csv = str(_script_dir / "combined_results.csv") + +# Collect all problems +results = [] + +# Get all problem directories from training data +training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")]) + +print(f"Found {len(training_problems)} training problems") + +for problem_dir in training_problems: + problem_id = problem_dir.replace("problem_", "") + + # Get 
training score from best_program_info.json + training_score = None + training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json") + + if os.path.exists(training_info_path): + try: + with open(training_info_path, 'r') as f: + training_data = json.load(f) + training_score = training_data.get("metrics", {}).get("combined_score") + except Exception as e: + print(f"Error reading training data for problem {problem_id}: {e}") + + # Get testing score from evaluation_results json + testing_score = None + testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json") + + if os.path.exists(testing_json_path): + try: + with open(testing_json_path, 'r') as f: + testing_data = json.load(f) + testing_score = testing_data.get("combined_score") + except Exception as e: + print(f"Error reading testing data for problem {problem_id}: {e}") + + results.append({ + "problem_id": problem_id, + "training_score": training_score, + "testing_score": testing_score + }) + +# Write to CSV +with open(output_csv, 'w', newline='') as csvfile: + fieldnames = ["problem_id", "training_score", "testing_score"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + writer.writerows(results) + +print(f"\nResults written to {output_csv}") +print(f"Total problems: {len(results)}") +print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}") +print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}") +print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}") diff --git a/benchmarks/frontier-cs-eval/config.yaml b/benchmarks/frontier-cs-eval/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46159f8107e7acd6833300c1150dc7e90f122625 --- /dev/null +++ b/benchmarks/frontier-cs-eval/config.yaml @@ -0,0 +1,57 @@ +# Frontier-CS Benchmark +# Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s -i 50 + +max_iterations: 100 +checkpoint_interval: 10 +log_level: INFO + +llm: + models: + - name: "gpt-5" + weight: 1.0 + api_base: https://api.openai.com/v1 + temperature: 0.7 + # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p + max_tokens: 32000 + timeout: 600 + # To use Gemini: override with --model gemini-3-pro-preview + +prompt: + system_message: | + You are an expert competitive programmer specializing in algorithmic optimization. + + PROBLEM STATEMENT: + {problem_statement} + + CONSTRAINTS: + {problem_constraints} + + OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better). + Your solution must be valid C++ code that compiles and runs correctly. + + KEY STRATEGIES: + - Analyze the problem structure carefully before coding + - Consider time and space complexity constraints + - Use efficient data structures (vectors, maps, sets, priority queues) + - Implement clean, well-structured code + - Handle edge cases properly + - Optimize hot loops and critical sections + + COMMON TECHNIQUES: + - Dynamic programming for optimization problems + - Greedy algorithms with proper ordering + - Graph algorithms (BFS, DFS, shortest paths) + - Binary search for monotonic functions + - Divide and conquer approaches + - Heuristic search (simulated annealing, genetic algorithms, local search) + + OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout. 
+ +evaluator: + timeout: 300 + max_retries: 3 + cascade_evaluation: false + +diff_based_generation: true +max_solution_length: 50000 +random_seed: 42 diff --git a/benchmarks/frontier-cs-eval/evaluator.py b/benchmarks/frontier-cs-eval/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..af6eea0f0f4fda9b4d6ff63a4fa496c998bff3a9 --- /dev/null +++ b/benchmarks/frontier-cs-eval/evaluator.py @@ -0,0 +1,174 @@ +""" +Evaluator for Frontier-CS algorithmic problems. + +This evaluator integrates with SkyDiscover to evaluate generated C++ solutions +against Frontier-CS benchmark problems using the local judge server. +""" + +import traceback +from pathlib import Path +import logging +import sys +import os +import random + +logger = logging.getLogger(__name__) + +# Support multiple judge servers for load balancing +DEFAULT_JUDGE_URL = "http://localhost:8081" +JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",") +JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()] + +def get_judge_url() -> str: + """Get a judge URL using random selection for load balancing.""" + return random.choice(JUDGE_URLS) + +# Add Frontier-CS to path +frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +try: + from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator + from frontier_cs.runner.base import EvaluationStatus +except ImportError as e: + logger.error(f"Failed to import Frontier-CS: {e}") + logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS") + raise + +def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict: + """ + Evaluate a C++ solution for a Frontier-CS algorithmic problem. + + Args: + program_path: Path to the C++ solution file + problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.) 
+ If None, will be read from FRONTIER_CS_PROBLEM env var or config + + Returns: + dict with evaluation results: + - combined_score: The score from the judge (higher is better) + - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise + - status: Evaluation status string + - message: Any error or status messages + - problem_id: The problem ID + - program_path: Path to the evaluated program + - score_unbounded: Unbounded score if available + - metadata: Additional evaluation metadata + """ + # Get problem_id from parameter, environment, or kwargs + if problem_id is None: + import os + problem_id = os.environ.get('FRONTIER_CS_PROBLEM') + if problem_id is None: + problem_id = kwargs.get('frontier_cs_problem', '0') + + logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}") + + try: + # Initialize evaluator with judge server (load balanced if multiple configured) + judge_url = get_judge_url() + logger.info(f"Using judge server: {judge_url}") + evaluator = FrontierCSEvaluator( + backend="docker", + judge_url=judge_url, + register_cleanup=False, + ) + + # Read the solution code + solution_path = Path(program_path) + if not solution_path.exists(): + error_msg = f"Solution file not found: {program_path}" + logger.error(error_msg) + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": error_msg, + "problem_id": problem_id, + "program_path": program_path, + } + + # Extract code and remove any EVOLVE-BLOCK markers + code = solution_path.read_text().replace( + "// EVOLVE-BLOCK-START", "" + ).replace( + "// EVOLVE-BLOCK-END", "" + ).strip() + + logger.info(f"Code extracted from {program_path}") + + # Evaluate the solution + result = evaluator.evaluate( + track="algorithmic", + problem_id=problem_id, + code=code, + backend="docker", + ) + + logger.info(f"Evaluation completed with status: {result.status}") + + # Process result + if result.status == EvaluationStatus.SUCCESS: + print(result) + score = result.score + # Use unbounded score for optimization (allows >100 if beating reference) + score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score + print(f"score={score}, score_unbounded={score_unbounded}") + + # Extract only essential metadata (exclude large test case outputs) + essential_metadata = {} + if result.metadata: + essential_metadata = { + "status": result.metadata.get("status"), + "passed": result.metadata.get("passed"), + "result": result.metadata.get("result"), + "score": result.metadata.get("score"), + "scoreUnbounded": result.metadata.get("scoreUnbounded"), + } + + return { + "combined_score": float(score), # Ensure it's a float + "score_unbounded": score_unbounded, + "runs_successfully": 1.0, + "status": "success", + "message": result.message or "Evaluation successful", + "problem_id": problem_id, + "program_path": program_path, + "duration_seconds": result.duration_seconds, + "metadata": essential_metadata, + } + elif result.status == EvaluationStatus.TIMEOUT: + logger.warning(f"Evaluation timed out: {result.message}") + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "timeout", + "message": result.message or "Evaluation timed out", + "problem_id": problem_id, + "program_path": program_path, + } + else: # ERROR status + logger.error(f"Evaluation error: {result.message}") + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": result.message or "Evaluation failed", + "problem_id": problem_id, + 
"program_path": program_path, + "logs": result.logs, + } + + except Exception as e: + logger.error(f"Evaluation failed completely: {str(e)}") + logger.error(traceback.format_exc()) + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": str(e), + "problem_id": problem_id, + "program_path": program_path, + "error": str(e), + } diff --git a/benchmarks/frontier-cs-eval/initial_program.cpp b/benchmarks/frontier-cs-eval/initial_program.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20e5839ba653567805fc89560d23bf3ddc243d5c --- /dev/null +++ b/benchmarks/frontier-cs-eval/initial_program.cpp @@ -0,0 +1,6 @@ +#include +using namespace std; +int main(){ + std::cout << "Hello, World!" << std::endl; + return 0; +} \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/run_all_frontiercs.py b/benchmarks/frontier-cs-eval/run_all_frontiercs.py new file mode 100644 index 0000000000000000000000000000000000000000..5b805fd8e21b6d63238688cf5e5089342cba1a4c --- /dev/null +++ b/benchmarks/frontier-cs-eval/run_all_frontiercs.py @@ -0,0 +1,70 @@ +import argparse +import os +import sys +import subprocess +from pathlib import Path +from concurrent.futures import ProcessPoolExecutor + +from dotenv import load_dotenv +load_dotenv() + +SCRIPT_DIR = Path(__file__).resolve().parent + +frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner + + +def run_single_problem(args): + p_id, search, iterations, env = args + print(f"\n[START] Problem ID: {p_id}") + command = [ + "uv", "run", "skydiscover-run", + "initial_program.cpp", "evaluator.py", + "-c", "config.yaml", + "-s", search, + "-i", str(iterations), + "-o", f"outputs/frontier_cs/problem_{p_id}", + ] + env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)} + try: + subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR)) + return f"✅ Problem {p_id} completed." 
+ except subprocess.CalledProcessError as e: + return f"❌ Problem {p_id} failed: {e}" + + +def main(): + parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems") + parser.add_argument("--search", "-s", default="adaevolve", + help="Search algorithm (default: adaevolve)") + parser.add_argument("--iterations", "-i", type=int, default=50, + help="Iterations per problem (default: 50)") + parser.add_argument("--workers", "-w", type=int, default=6, + help="Parallel workers (default: 6)") + args = parser.parse_args() + + runner = AlgorithmicLocalRunner() + problems_data = runner.list_problems() + problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int) + + print(f"Running {len(problem_ids)} problems with {args.workers} workers " + f"(search={args.search}, iterations={args.iterations})...") + + env = os.environ.copy() + task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids] + + with ProcessPoolExecutor(max_workers=args.workers) as executor: + results = list(executor.map(run_single_problem, task_args)) + + print("\n" + "=" * 30) + print("ALL RUNS COMPLETE") + print("=" * 30) + for result in results: + print(result) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py b/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py new file mode 100644 index 0000000000000000000000000000000000000000..9be033548a0aebf71f080d882014bfe76d5e68ef --- /dev/null +++ b/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py @@ -0,0 +1,404 @@ +import os +import sys +import json +import logging +import threading +from pathlib import Path +from typing import Dict, List, Tuple +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Add Frontier-CS to path +frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +try: + from frontier_cs.evaluator import FrontierCSEvaluator + from frontier_cs.runner.base import EvaluationStatus +except ImportError as e: + logger.error(f"Failed to import Frontier-CS: {e}") + logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS") + sys.exit(1) + + +class BestProgramEvaluator: + """Evaluates all best_program.cpp files in the outputs directory.""" + + def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8): + """ + Initialize the evaluator. 
+ + Args: + outputs_dir: Path to the outputs directory containing problem folders + judge_url: URL of the judge server + num_workers: Number of parallel workers for evaluation + """ + self.outputs_dir = Path(outputs_dir) + self.judge_url = judge_url + self.num_workers = num_workers + + # Use thread-local storage for evaluator instances (avoid race condition) + self._evaluator_local = threading.local() + + self.results = [] + + # Create results directory in the script's directory + self.results_dir = Path(__file__).resolve().parent / "evaluation_results" + self.results_dir.mkdir(exist_ok=True) + logger.info(f"Results will be saved to {self.results_dir}") + logger.info(f"Using {self.num_workers} parallel workers with thread-local evaluators") + + def _get_evaluator(self) -> 'FrontierCSEvaluator': + """ + Get the evaluator for the current thread. + Creates a new instance if this thread hasn't created one yet. + This avoids race conditions from sharing a single evaluator across threads. + """ + if not hasattr(self._evaluator_local, 'evaluator'): + self._evaluator_local.evaluator = FrontierCSEvaluator( + backend="docker", + judge_url=self.judge_url, + ) + logger.debug(f"Created new evaluator for thread {threading.current_thread().name}") + return self._evaluator_local.evaluator + + def find_best_programs(self) -> Dict[str, Path]: + """ + Find all best_program.cpp files in the outputs directory. + + Returns: + Dict mapping problem_id to best_program.cpp path + """ + best_programs = {} + + # Look for frontier_cs subdirectory + frontier_cs_dir = self.outputs_dir / "frontier_cs" + if not frontier_cs_dir.exists(): + logger.error(f"frontier_cs directory not found at {frontier_cs_dir}") + return best_programs + + # Iterate through problem directories + for problem_dir in sorted(frontier_cs_dir.iterdir()): + if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"): + continue + + # Extract problem ID + problem_id = problem_dir.name.replace("problem_", "") + + # Look for best_program.cpp + best_program_path = problem_dir / "best" / "best_program.cpp" + if best_program_path.exists(): + best_programs[problem_id] = best_program_path + logger.info(f"Found best_program.cpp for problem {problem_id}") + else: + logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}") + + return best_programs + + def evaluate_program(self, problem_id: str, program_path: Path) -> Dict: + """ + Evaluate a single best_program.cpp file. 
+ + Args: + problem_id: The Frontier-CS problem ID + program_path: Path to the best_program.cpp file + + Returns: + Dictionary with evaluation results + """ + logger.info(f"Evaluating problem {problem_id}: {program_path}") + + try: + # Read the solution code + if not program_path.exists(): + error_msg = f"Solution file not found: {program_path}" + logger.error(error_msg) + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": error_msg, + } + + # Read the code + code = program_path.read_text().replace( + "// EVOLVE-BLOCK-START", "" + ).replace( + "// EVOLVE-BLOCK-END", "" + ).strip() + + logger.info(f"Code extracted from {program_path}, length: {len(code)} characters") + + # Evaluate the solution (use thread-local evaluator) + evaluator = self._get_evaluator() + result = evaluator.evaluate( + track="algorithmic", + problem_id=problem_id, + code=code, + backend="docker", + ) + + logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}") + + # Log the result object and its properties + logger.info(f"Judger output for problem {problem_id}:") + logger.info(f" Status: {result.status}") + logger.info(f" Message: {result.message}") + if hasattr(result, 'score'): + logger.info(f" Score: {result.score}") + if hasattr(result, 'duration_seconds'): + logger.info(f" Duration: {result.duration_seconds}s") + if hasattr(result, 'metadata'): + logger.info(f" Metadata: {result.metadata}") + logger.info(f" Full result object: {result}") + + # Process result + if result.status == EvaluationStatus.SUCCESS: + score = result.score + logger.info(f"Problem {problem_id}: Score = {score}") + + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": float(score), + "runs_successfully": 1.0, + "status": "success", + "message": result.message or "Evaluation successful", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + "metadata": result.metadata if hasattr(result, 'metadata') else None, + } + elif result.status == EvaluationStatus.TIMEOUT: + logger.warning(f"Problem {problem_id}: Evaluation timed out") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "timeout", + "message": f"Evaluation timed out: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + elif result.status == EvaluationStatus.COMPILATION_ERROR: + logger.warning(f"Problem {problem_id}: Compilation error") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "compilation_error", + "message": f"Compilation error: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + else: + logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": str(result.status), + "message": f"Evaluation failed: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + + except Exception as e: + logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}") + logger.error(f"Exception traceback: {type(e).__name__}") + import traceback + logger.error(traceback.format_exc()) + + return { + 
"problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "exception", + "message": str(e), + } + + def run_all_evaluations(self) -> List[Dict]: + """ + Run evaluations for all best_program.cpp files sequentially (one at a time). + + Returns: + List of evaluation results + """ + logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}") + + best_programs = self.find_best_programs() + logger.info(f"Found {len(best_programs)} best_program.cpp files") + + if not best_programs: + logger.warning("No best_program.cpp files found!") + return [] + + # Sort problems by ID for consistent ordering + sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0])) + + # Evaluate each program sequentially (no parallelization) + results = [] + total = len(sorted_problems) + for idx, (problem_id, program_path) in enumerate(sorted_problems, 1): + logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})") + try: + result = self.evaluate_program(problem_id, program_path) + + # CRITICAL: Ensure problem_id matches + if result.get("problem_id") != problem_id: + logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}") + result["problem_id"] = problem_id # Force correct problem_id + + results.append(result) + self.results.append(result) + + logger.info(f"[SAVE] Saving problem {problem_id} result to file") + # Save result immediately after evaluation + self.save_problem_result(result) + + except Exception as e: + logger.error(f"Exception evaluating problem {problem_id}: {str(e)}") + import traceback + logger.error(traceback.format_exc()) + + error_result = { + "problem_id": problem_id, + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "exception", + "message": str(e), + } + results.append(error_result) + self.results.append(error_result) + self.save_problem_result(error_result) + + return results + + def save_results(self, output_file: str = "evaluation_results.json"): + """ + Save evaluation results to a JSON file. + + Args: + output_file: Path to save the results + """ + output_path = Path(output_file) + with open(output_path, 'w') as f: + json.dump(self.results, f, indent=2) + logger.info(f"Results saved to {output_path}") + + def save_problem_result(self, result: Dict): + """ + Save individual problem result to a separate file. 
+ + Args: + result: The evaluation result for a single problem + """ + problem_id = result.get("problem_id", "unknown") + result_file = self.results_dir / f"problem_{problem_id}.json" + + with open(result_file, 'w') as f: + json.dump(result, f, indent=2) + logger.info(f"Problem {problem_id} result saved to {result_file}") + + def print_summary(self): + """Print a summary of the evaluation results.""" + if not self.results: + logger.info("No results to summarize") + return + + logger.info("\n" + "="*80) + logger.info("EVALUATION SUMMARY") + logger.info("="*80) + + successful = [r for r in self.results if r.get("status") == "success"] + timeout = [r for r in self.results if r.get("status") == "timeout"] + compilation_error = [r for r in self.results if r.get("status") == "compilation_error"] + other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]] + + logger.info(f"Total problems evaluated: {len(self.results)}") + logger.info(f"Successful: {len(successful)}") + logger.info(f"Timeouts: {len(timeout)}") + logger.info(f"Compilation errors: {len(compilation_error)}") + logger.info(f"Other errors: {len(other_error)}") + + if successful: + scores = [r["combined_score"] for r in successful] + logger.info(f"\nSuccessful evaluation scores:") + logger.info(f" Average score: {sum(scores) / len(scores):.2f}") + logger.info(f" Min score: {min(scores):.2f}") + logger.info(f" Max score: {max(scores):.2f}") + + logger.info(f"\nTop 5 problems by score:") + top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5] + for i, result in enumerate(top_5, 1): + logger.info(f" {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}") + + logger.info("="*80 + "\n") + + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser( + description="Evaluate all best_program.cpp files in the outputs directory" + ) + + # Default outputs directory is two levels up from this script + default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs" + + parser.add_argument( + "--outputs-dir", + type=str, + default=str(default_outputs_dir), + help="Path to the outputs directory (default: ../../outputs from script location)" + ) + parser.add_argument( + "--judge-url", + type=str, + default="http://localhost:8081", + help="URL of the judge server (default: http://localhost:8081)" + ) + parser.add_argument( + "--output-file", + type=str, + default="evaluation_results.json", + help="Path to save the evaluation results (default: evaluation_results.json)" + ) + parser.add_argument( + "--workers", + type=int, + default=8, + help="Number of parallel workers for evaluation (default: 8)" + ) + + args = parser.parse_args() + + # Run evaluations + evaluator = BestProgramEvaluator( + outputs_dir=args.outputs_dir, + judge_url=args.judge_url, + num_workers=args.workers + ) + + results = evaluator.run_all_evaluations() + evaluator.save_results(args.output_file) + evaluator.print_summary() + + logger.info(f"Evaluation complete. Results saved to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/image_gen/README.md b/benchmarks/image_gen/README.md new file mode 100644 index 0000000000000000000000000000000000000000..23b0b87cd91b65c3bc9e0d0ba9ad238a4736c44d --- /dev/null +++ b/benchmarks/image_gen/README.md @@ -0,0 +1,40 @@ +# Image Generation Benchmark + +This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. 
Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via LLM, crossover via other context images from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives actual parent and other context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed. + +## Benchmark: Sky Festival + +**Directory:** `sky_festival/` + +The system must generate a floating sky-festival image where many details must match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`. + +**Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points. The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image. + +## Setup + +1. **Set your API key:** + + ```bash + export OPENAI_API_KEY=... + ``` + + Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API. 
+ +## Run + +```bash +cd benchmarks/image_gen/sky_festival + +# AdaEvolve +uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output + +# EvoX +uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output +``` + +## Files + +| File | Description | +|------|-------------| +| `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric | +| `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` | diff --git a/benchmarks/image_gen/sky_festival/config.yaml b/benchmarks/image_gen/sky_festival/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3d28bbb281b2a9a709f96f74fb82d89e68df6205 --- /dev/null +++ b/benchmarks/image_gen/sky_festival/config.yaml @@ -0,0 +1,103 @@ +# Sky Festival Benchmark +# +# Usage: +# cd benchmarks/image_gen/sky_festival +# skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output + +language: "image" +diff_based_generation: false +max_iterations: 100 +checkpoint_interval: 1 + +llm: + models: + - name: "gpt-5" + weight: 1.0 + temperature: 0.9 + max_tokens: 16384 + timeout: 300 + +evaluator: + timeout: 300 + +prompt: + system_message: | + You are an expert visual artist and image generation AI specializing in + complex compositional scenes with precise object counting, spatial + arrangement, and rich detail. + + You can see the current images from the database along with their scores + across 7 categories: cloud shapes, hot air balloons, floating island, + table items, characters, decorations/creatures, and rainbow/lighting. + + Your goal is to generate a NEW, improved image that scores higher on + the rubric. Pay special attention to: + - EXACT counts: 9 shaped clouds, 5 balloons, 4 trees, 6 cupcakes, 8 plates, 5 gifts, 6 characters, 11 bunting flags, 7 lanterns, 7 rainbow bands + - Correct passengers in each balloon (2 children, 1 woman, 3 cats, 1 violinist, empty) + - Legible text: "HAPPY 100TH SKY FESTIVAL" on banner, "100 YEARS" on cake + - Specific character details: robot buttons, grandmother's LEFT hand thumbs-up, dog's striped hat + - Correct spatial ordering: trees left-to-right, gift pyramid, cupcake grid + - Warm golden lighting from upper left, consistent shadows + + Also provide brief text reasoning about your approach and what you changed. + + # Target Image Description + A joyful, sunlit floating sky festival on a perfect summer day, viewed from a slightly elevated angle. + + THE SKY AND BACKGROUND: + The sky is a brilliant gradient from warm gold at the horizon to deep cerulean blue at the top. There are exactly 9 fluffy white clouds scattered across the sky. Each cloud has a distinct shape: cloud 1 looks like a rabbit, cloud 2 looks like a teacup, cloud 3 looks like a musical note, cloud 4 looks like a crescent moon, cloud 5 looks like a whale, cloud 6 looks like a bicycle, cloud 7 looks like a crown, cloud 8 looks like a butterfly, cloud 9 looks like the number 7. The clouds are arranged in a gentle arc from left to right across the upper third of the image. + + THE HOT AIR BALLOONS: + There are exactly 5 hot air balloons floating at different heights. Each balloon has a unique color and pattern: + - Balloon 1 (leftmost, highest): Red with white horizontal stripes. Its basket carries exactly 2 waving children. + - Balloon 2 (second from left, medium height): Sunshine yellow with orange polka dots. Its basket carries exactly 1 old woman holding a telescope. 
+ - Balloon 3 (center, lowest): Rainbow gradient (red-orange-yellow-green-blue-purple from top to bottom). Its basket carries exactly 3 cats — one orange tabby, one black, one white — all wearing tiny party hats. + - Balloon 4 (second from right, medium height): Deep purple with gold stars printed on it. Its basket carries exactly 1 man playing a violin. + - Balloon 5 (rightmost, highest): Emerald green with a large white peace sign on the front. Its basket is empty but has a banner hanging from it that reads exactly: "HAPPY 100TH SKY FESTIVAL" + + THE FLOATING ISLAND: + Below the balloons, there is a lush green floating island suspended in mid-air. The island is roughly circular and has grass, wildflowers, and 4 trees on it. The trees are different species: one oak with a thick trunk, one cherry blossom in full pink bloom, one palm tree leaning slightly right, and one pine tree (tallest of the four). The trees are spaced evenly along the island from left to right in that exact order: oak, cherry blossom, palm, pine. + + THE PARTY TABLE: + On the center of the floating island sits a long rectangular wooden table covered with a checkered red-and-white tablecloth. On the table, from left to right: + - A 3-tier birthday cake with white frosting. The bottom tier has blue frosting roses, the middle tier has pink frosting roses, the top tier has a single golden candle that is lit with a bright flame. Written on the middle tier in purple icing: "100 YEARS" + - Exactly 6 cupcakes arranged in 2 rows of 3. Each cupcake has a different colored frosting: red, orange, yellow, green, blue, purple (in that order, left to right, top row first). + - A glass pitcher of lemonade, three-quarters full, with exactly 3 lemon slices floating in it and 2 ice cubes visible. + - A stack of exactly 8 golden plates. + - Exactly 5 colorful gift boxes stacked in a pyramid: 3 on the bottom row (red, blue, green from left to right), 2 on top (yellow, purple from left to right). Each gift box has a white ribbon bow on top. + + THE CHARACTERS AROUND THE TABLE: + Seated around the table are exactly 6 characters, 3 on each long side facing each other: + - Left side (facing right), from left to right: A smiling girl with pigtails wearing a blue dress, a jolly round penguin wearing a red bowtie, and a tall giraffe whose long neck extends above the frame but whose smiling face peeks down from above. + - Right side (facing left), from left to right: A friendly robot with a square head and glowing green eyes, a grandmother in a floral apron giving a thumbs-up with her LEFT hand, and a golden retriever dog sitting upright on a chair wearing a cone-shaped party hat with blue and white stripes. + + THE BUNTING AND DECORATIONS: + Strung between the cherry blossom tree and the pine tree is a triangular bunting banner with exactly 11 small triangular flags. The flags alternate in color: red, yellow, blue, red, yellow, blue, red, yellow, blue, red, yellow. Below the bunting, there are exactly 7 paper lanterns hanging at different heights. The lanterns are spherical and glow warmly in these colors from left to right: orange, pink, gold, white, lavender, mint green, coral. + + THE ANIMALS IN THE SKY: + Flying around the balloons are exactly 4 birds and 2 butterflies. The birds are: 1 blue jay, 1 cardinal (red), 1 canary (yellow), and 1 hummingbird (iridescent green). The 2 butterflies are: one monarch (orange and black) and one morpho (brilliant blue). The blue jay and the cardinal are flying together near Balloon 2. 
The canary is perched on top of Balloon 4. The hummingbird hovers near the cherry blossom tree. The monarch butterfly is near the bunting. The morpho butterfly is near Balloon 5. + + THE FLOATING MUSICAL NOTES: + Drifting upward from the violin player in Balloon 4, there are exactly 5 golden musical notes of different sizes, getting smaller as they rise higher. They follow a gentle curved path upward and to the right. + + THE RAINBOW: + Behind everything, a complete semicircular rainbow arcs from the lower left to the lower right of the scene. It has the correct 7 color bands in order from outside to inside: red, orange, yellow, green, blue, indigo, violet. + + LIGHTING AND ATMOSPHERE: + The scene is lit by warm, golden afternoon sunlight coming from the upper left. All shadows fall to the lower right. The overall mood is magical, celebratory, and full of wonder. There is a soft, warm glow around the floating island. The light catches the glass lemonade pitcher creating a small sparkle. The golden candle flame on the cake emits a tiny warm glow. + + IMPORTANT DETAILS: + - The girl with pigtails has exactly 5 fingers visible on each hand. + - The robot has exactly 3 buttons on its chest: a red circle, a green square, and a blue triangle, arranged vertically. + - The grandmother's floral apron has exactly sunflowers on it, not roses or daisies. + - Every character at the table who has a mouth is smiling. + - The penguin's red bowtie has white polka dots on it. + +monitor: + enabled: true + port: 8765 + summary_model: "gpt-5" + summary_interval: 5 + +hil_enabled: true +hil_mode: "append" \ No newline at end of file diff --git a/benchmarks/image_gen/sky_festival/evaluator.py b/benchmarks/image_gen/sky_festival/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..bd138c5cf80924c62e1407749de96f7e3b1d4d92 --- /dev/null +++ b/benchmarks/image_gen/sky_festival/evaluator.py @@ -0,0 +1,220 @@ +""" +Sky Festival evaluator — GPT-5 LLM-as-a-judge. + +Scores VLM-generated images against a 100-point rubric using GPT-5 vision. +Returns combined_score normalized to [0, 1]. + +The framework passes the image path via a sidecar file: + .image_path -> absolute path to the generated image + +Requirements: + pip install openai + Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5) +""" + +import base64 +import json +import logging +import os +import re +from typing import Dict, Union + +logger = logging.getLogger(__name__) + +JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5") + +SYSTEM_PROMPT = """\ +You are an extremely strict image evaluation judge. You score images against a precise rubric. +You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON. +Be harsh — most AI-generated images fail these criteria. Award points only when clearly met. +If you cannot verify a requirement (e.g., too small to see), award 0 for that item.""" + +RUBRIC_PROMPT = """\ +Score this image against the following rubric for a "Floating Sky Festival" scene. +Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met. 
+ +## Category 1: Cloud Counting and Shapes (15 pts) +- Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0) +- At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10) + +## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts) +- Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0) +- Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern) +- Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count) +- Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0) + +## Category 3: Floating Island and Trees (10 pts) +- Floating island visible suspended in air: 3 pts +- Exactly 4 different trees on the island: 4 pts (3 or 5 = 0) +- Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts + +## Category 4: Party Table Items — Counting and Arrangement (20 pts) +- 3-tier cake with candle present: 3 pts +- Cake text "100 YEARS" legible on middle tier: 3 pts +- Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts +- Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts +- Stack of exactly 8 golden plates: 3 pts +- Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts + +## Category 5: Characters — Count, Identity, and Details (15 pts) +- Exactly 6 characters seated at the table (3 per side): 5 pts +- Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends) +- Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail) + +## Category 6: Decorations and Flying Creatures (10 pts) +- Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts +- Exactly 7 paper lanterns in different colors: 3 pts +- Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures) + +## Category 7: Rainbow, Lighting, and Overall Composition (10 pts) +- Complete semicircular rainbow with 7 color bands in correct order: 4 pts +- Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts +- Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts + +Respond with ONLY this JSON (no other text): +{ + "cloud_shapes": <0-15>, + "balloons": <0-20>, + "floating_island": <0-10>, + "table_items": <0-20>, + "characters": <0-15>, + "decorations_creatures": <0-10>, + "rainbow_lighting": <0-10>, + "reasoning": "" +}""" + +# Category maximum scores for validation +CATEGORY_MAXES = { + "cloud_shapes": 15, + "balloons": 20, + "floating_island": 10, + "table_items": 20, + "characters": 15, + "decorations_creatures": 10, + "rainbow_lighting": 10, +} + +_client = None + + +def _get_client(): + global _client + if _client is None: + from openai import OpenAI + _client = OpenAI() + return _client + + +def _encode_image(image_path: str) -> str: + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + + +def _judge_image(image_path: str) -> Dict[str, Union[float, str]]: + """Call 
GPT-5 to score the image. Retries once on failure.""" + client = _get_client() + b64 = _encode_image(image_path) + + ext = os.path.splitext(image_path)[1].lstrip(".").lower() + mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png") + data_url = f"data:{mime};base64,{b64}" + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}}, + {"type": "text", "text": RUBRIC_PROMPT}, + ], + }, + ] + + last_error = None + for attempt in range(2): + try: + response = client.chat.completions.create( + model=JUDGE_MODEL, + messages=messages, + max_completion_tokens=16384, + ) + content = response.choices[0].message.content or "" + raw = content.strip() + logger.info(f"Judge raw response (first 300 chars): {raw[:300]}") + + # Extract JSON from markdown code block if present + if "```" in raw: + m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL) + if m: + raw = m.group(1).strip() + + # Find JSON object in response + start = raw.find("{") + end = raw.rfind("}") + 1 + if start >= 0 and end > start: + raw = raw[start:end] + + result = json.loads(raw) + + # Validate and clamp scores + scores = {} + for cat, max_val in CATEGORY_MAXES.items(): + val = result.get(cat, 0) + if not isinstance(val, (int, float)): + val = 0 + scores[cat] = max(0, min(max_val, float(val))) + + scores["reasoning"] = str(result.get("reasoning", "")) + return scores + + except Exception as e: + last_error = e + logger.warning(f"Judge attempt {attempt + 1} failed: {e}") + + logger.error(f"GPT-5 judge failed after retries: {last_error}") + return {cat: 0.0 for cat in CATEGORY_MAXES} + + +def evaluate(program_path: str) -> Dict[str, Union[float, str]]: + """Score a VLM-generated image using GPT-5 as judge. + + Args: + program_path: Path to the text file (VLM reasoning). + A sidecar file ``.image_path`` contains the + absolute path to the generated image. + + Returns: + Dictionary with combined_score (0-1), per-category scores, and image_path. 
+ """ + # Read image path from sidecar + sidecar = program_path + ".image_path" + image_path = None + if os.path.exists(sidecar): + with open(sidecar) as f: + image_path = f.read().strip() + + if not image_path or not os.path.exists(image_path): + logger.warning("No image found for scoring") + return {"combined_score": 0.0, "error": "No image to score"} + + # Score with GPT-5 + scores = _judge_image(image_path) + + # Compute total out of 100, normalize to 0-1 + total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES) + combined = round(total / 100.0, 4) + + result = {"combined_score": combined, "image_path": image_path} + + # Add per-category scores (normalized to 0-1 for each category) + for cat, max_val in CATEGORY_MAXES.items(): + result[cat] = round(scores.get(cat, 0) / max_val, 4) + + # Also store raw scores + result["raw_total"] = round(total, 1) + + reasoning = scores.get("reasoning", "") + if reasoning: + result["judge_reasoning"] = reasoning + + return result diff --git a/benchmarks/math/circle_packing_rect/evaluator/evaluator.py b/benchmarks/math/circle_packing_rect/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a12b218b019e6bdb363afdc47a6cfdc5c9bfe4 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/evaluator.py @@ -0,0 +1,119 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the circle packing problem on a rectangle +# of perimeter 4. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import time +import numpy as np +import sys +import os +from importlib import __import__ + +BENCHMARK = 2.3658321334167627 +NUM_CIRCLES = 21 +TOL = 1e-6 + + +def minimum_circumscribing_rectangle(circles: np.ndarray): + """Returns the width and height of the minimum circumscribing rectangle. + + Args: + circles: A numpy array of shape (num_circles, 3), where each row is of the + form (x, y, radius), specifying a circle. + + Returns: + A tuple (width, height) of the minimum circumscribing rectangle. 
+ """ + min_x = np.min(circles[:, 0] - circles[:, 2]) + max_x = np.max(circles[:, 0] + circles[:, 2]) + min_y = np.min(circles[:, 1] - circles[:, 2]) + max_y = np.max(circles[:, 1] + circles[:, 2]) + return max_x - min_x, max_y - min_y + + +def validate_packing_radii(radii: np.ndarray) -> None: + n = len(radii) + for i in range(n): + if radii[i] < 0: + raise ValueError(f"Circle {i} has negative radius {radii[i]}") + elif np.isnan(radii[i]): + raise ValueError(f"Circle {i} has nan radius") + + +def validate_packing_overlap_wtol(circles: np.ndarray, tol: float = 1e-6) -> None: + n = len(circles) + for i in range(n): + for j in range(i + 1, n): + dist = np.sqrt(np.sum((circles[i, :2] - circles[j, :2]) ** 2)) + if dist < circles[i, 2] + circles[j, 2] - tol: + raise ValueError( + f"Circles {i} and {j} overlap: dist={dist}, r1+r2={circles[i,2]+circles[j,2]}" + ) + + +def validate_packing_inside_rect_wtol(circles: np.array, tol: float = 1e-6) -> None: + width, height = minimum_circumscribing_rectangle(circles) + if width + height > (2 + tol): + raise ValueError("Circles are not contained inside a rectangle of perimeter 4.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + circles = None + eval_time = 0 + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + + start_time = time.time() + circles = program.circle_packing21() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(circles, np.ndarray): + circles = np.array(circles) + + if circles.shape != (NUM_CIRCLES, 3): + raise ValueError( + f"Invalid shapes: circles = {circles.shape}, expected {(NUM_CIRCLES,3)}" + ) + + validate_packing_radii(circles[:, -1]) + validate_packing_overlap_wtol(circles, TOL) + validate_packing_inside_rect_wtol(circles, TOL) + + radii_sum = np.sum(circles[:, -1]) + + return { + "radii_sum": float(radii_sum), + "combined_score": float(radii_sum / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/erdos_min_overlap/config.yaml b/benchmarks/math/erdos_min_overlap/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d15966b4b056962769db1ed82e48e951b548a1a5 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/config.yaml @@ -0,0 +1,41 @@ +# Math benchmark: erdos_min_overlap +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery. + Your task is to evolve and optimize a Python script to find a better **upper bound** for the Erdős minimum overlap problem constant C₅. 
+ + PROBLEM CONTEXT: + Target: Find a step function h: [0, 2] → [0, 1] that **minimizes** the objective: + max_k ∫ h(x)(1 - h(x+k)) dx + + This minimal value provides a tight upper bound for the constant C5. + + Current best known upper bound: C5 ≤ 0.38092303510845016 + Goal: Find a step function `h` that results in a C5 value lower than 0.38092303510845016. + + CONSTRAINTS: + 1. The function `h` must have values in the range [0, 1]. + 2. The integral of h(x) over [0, 2] must be exactly 1. + + PERFORMANCE METRICS: + - c5_bound: The bound found by the program. + - combined_score: 0.38092303510845016 / c5_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record). + - n_points: number of points used in the discretization. + - eval_time: evaluation time of the program. +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile b/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt b/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/erdos_min_overlap/initial_program.py b/benchmarks/math/erdos_min_overlap/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..7d07d8cd584e0ec37f112db83a14db9bc708b9a5 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/initial_program.py @@ -0,0 +1,96 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass +import tqdm + + +@dataclass +class Hyperparameters: + num_intervals: int = 200 + learning_rate: float = 0.005 + num_steps: int = 20000 + penalty_strength: float = 1000000.0 + + +class ErdosOptimizer: + """ + Finds a step function h that minimizes the maximum overlap integral. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.domain_width = 2.0 + self.dx = self.domain_width / self.hypers.num_intervals + + def _objective_fn(self, latent_h_values: jnp.ndarray) -> jnp.ndarray: + """ + The loss function includes the objective and a penalty for the constraint. 
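+
+        Up to discretization, the loss implemented below is (with h = sigmoid(latent)):
+
+            total_loss = max_k ∫ h(x)·(1 - h(x+k)) dx
+                         + penalty_strength · (∫ h(x) dx - 1)²
+
+        Both integrals use step size dx, and the shifted overlap is evaluated
+        for all shifts k at once via FFT-based correlation.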
+ """ + # Enforce h(x) in [0, 1] via sigmoid (hard constraint) + h = jax.nn.sigmoid(latent_h_values) + + # Calculate the primary objective (max correlation) + j = 1.0 - h + N = self.hypers.num_intervals + h_padded = jnp.pad(h, (0, N)) + j_padded = jnp.pad(j, (0, N)) + corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded)) + correlation = jnp.fft.ifft(corr_fft).real + scaled_correlation = correlation * self.dx + objective_loss = jnp.max(scaled_correlation) + + # Calculate the penalty for the integral constraint + integral_h = jnp.sum(h) * self.dx + constraint_loss = (integral_h - 1.0) ** 2 + + # Combine the objective with the penalty + total_loss = objective_loss + self.hypers.penalty_strength * constraint_loss + return total_loss + + def run_optimization(self): + optimizer = optax.adam(self.hypers.learning_rate) + + key = jax.random.PRNGKey(42) + latent_h_values = jax.random.normal(key, (self.hypers.num_intervals,)) + + opt_state = optimizer.init(latent_h_values) + + @jax.jit + def train_step(latent_h_values, opt_state): + loss, grads = jax.value_and_grad(self._objective_fn)(latent_h_values) + updates, opt_state = optimizer.update(grads, opt_state) + latent_h_values = optax.apply_updates(latent_h_values, updates) + return latent_h_values, opt_state, loss + + print(f"Optimizing a step function with {self.hypers.num_intervals} intervals...") + for step in tqdm.tqdm(range(self.hypers.num_steps), desc="Optimizing"): + latent_h_values, opt_state, loss = train_step(latent_h_values, opt_state) + + # Final h is just the sigmoid of the latent values + final_h = jax.nn.sigmoid(latent_h_values) + + # Re-calculate final objective loss without the penalty for the report + j = 1.0 - final_h + N = self.hypers.num_intervals + h_padded = jnp.pad(final_h, (0, N)) + j_padded = jnp.pad(j, (0, N)) + corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded)) + correlation = jnp.fft.ifft(corr_fft).real + c5_bound = jnp.max(correlation * self.dx) + + print(f"Optimization complete. Final C5 upper bound: {c5_bound:.8f}") + return np.array(final_h), float(c5_bound) + + +def run(): + hypers = Hyperparameters() + optimizer = ErdosOptimizer(hypers) + final_h_values, c5_bound = optimizer.run_optimization() + + return final_h_values, c5_bound, hypers.num_intervals + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh b/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py b/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. 
+ +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh b/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/matmul/evaluator/Dockerfile b/benchmarks/math/matmul/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/matmul/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . 
+RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/matmul/evaluator/evaluate.sh b/benchmarks/math/matmul/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/matmul/evaluator/evaluator.py b/benchmarks/math/matmul/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..331efefdcd80afd32aaa5ca362bc231bb0216970 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/evaluator.py @@ -0,0 +1,115 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the matrix multiplication problem with tensor size +# of <2,4,5> +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +BENCHMARK = 32 + + +def verify_tensor_decomposition( + decomposition: tuple[np.ndarray, np.ndarray, np.ndarray], n: int, m: int, p: int, rank: int +): + """Verifies the correctness of the tensor decomposition.""" + + # Add robustness for cases where the optimizer might fail + if not all(isinstance(arr, np.ndarray) for arr in decomposition) or not decomposition: + raise ValueError("Decomposition must be a tuple of NumPy arrays.") + if any(arr.size == 0 for arr in decomposition): + print("Warning: One or more decomposition arrays are empty. Verification skipped.") + return + + # Check that each factor matrix has the correct shape. + factor_matrix_1, factor_matrix_2, factor_matrix_3 = decomposition + if factor_matrix_1.shape != (n * m, rank): + raise ValueError( + f"Expected shape of factor matrix 1 is {(n * m, rank)}. Actual shape is {factor_matrix_1.shape}." + ) + if factor_matrix_2.shape != (m * p, rank): + raise ValueError( + f"Expected shape of factor matrix 2 is {(m * p, rank)}. Actual shape is {factor_matrix_2.shape}." + ) + if factor_matrix_3.shape != (n * p, rank): + raise ValueError( + f"Expected shape of factor matrix 3 is {(n * p, rank)}. Actual shape is {factor_matrix_3.shape}." + ) + + # Form the matrix multiplication tensor . + matmul_tensor = np.zeros((n * m, m * p, n * p), dtype=np.float32) + for i in range(n): + for j in range(m): + for k in range(p): + # Use the standard k*n+i indexing for the third dimension + matmul_tensor[i * m + j, j * p + k, k * n + i] = 1 + + # Check that the tensor is correctly constructed. + constructed_tensor = np.einsum("ir,jr,kr -> ijk", *decomposition) + + # Exact check + if not np.array_equal(constructed_tensor, matmul_tensor): + # If the exact check fails, report the floating-point difference for diagnostics. + diff = np.max(np.abs(constructed_tensor - matmul_tensor)) + raise ValueError( + f"Tensor constructed by decomposition does not exactly match the target tensor. Maximum difference is {diff:.6e}." 
+ ) + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + decomposition, n, m, p, loss, rank = program.run() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_tensor_decomposition(decomposition, n, m, p, rank) + + success_threshold = 1e-6 + if loss > success_threshold: + print( + f"\nWarning: Final loss {loss:.2e} is above the success threshold of {success_threshold:.2e}." + ) + + inverse_rank = BENCHMARK / rank + + return { + "combined_score": inverse_rank, + "loss": loss, + "rank": rank, + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/matmul/evaluator/requirements.txt b/benchmarks/math/matmul/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/matmul/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/matmul/evaluator/wrapper.py b/benchmarks/math/matmul/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
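+    # For reference, the protocol JSON printed at the end looks like this on
+    # success (illustrative values for the matmul evaluator):
+    #   {"status": "success", "combined_score": 0.58,
+    #    "metrics": {"combined_score": 0.58, "loss": 1.2e-09,
+    #                "rank": 55.0, "eval_time": 12.3}}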
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/matmul/initial_program.py b/benchmarks/math/matmul/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c30df06ba319dbc7bc9f1880aa735aaf43dc79 --- /dev/null +++ b/benchmarks/math/matmul/initial_program.py @@ -0,0 +1,199 @@ +# Disable progress bar for cleaner output logs +import os + +os.environ["TQDM_DISABLE"] = "1" + +# Fixed parameters +n, m, p = 2, 4, 5 + +# EVOLVE-BLOCK-START +import numpy as np +import jax +import jax.numpy as jnp +import optax +from dataclasses import dataclass +import tqdm + + +# --- Straight-Through Estimator for Rounding --- +@jax.custom_vjp +def round_to_half_ste(x): + """Forward pass: snaps values to the nearest half-integer.""" + return jnp.round(x * 2) / 2 + + +def round_ste_fwd(x): + """Standard forward pass and identity for backward pass.""" + return round_to_half_ste(x), None + + +def round_ste_bwd(res, g): + """Backward pass: Identity function, passes gradient straight through.""" + return (g,) + + +round_to_half_ste.defvjp(round_ste_fwd, round_ste_bwd) +# --- End of STE definition --- + + +# --- Loss Functions --- +def weighted_l2_loss(reconstructed: jnp.ndarray, target: jnp.ndarray) -> jnp.ndarray: + error = reconstructed - target + weights = jnp.where(target != 0, 100.0, 1.0) + return jnp.mean(weights * (error**2)) + + +def l2_loss_real(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray: + return jnp.mean((x - y) ** 2) + + +# --- Hyperparameters --- +@dataclass +class Hyperparameters: + rank: int = 55 + # Phase 1: Continuous Search + num_restarts: int = 10 + phase1_steps: int = 80000 + phase1_lr: float = 0.01 + init_scale: float = 0.1 + l1_strength: float = 1e-6 + clamp_range: float = 4.0 + # Phase 2: Discrete Fine-tuning + phase2_steps: int = 20000 + phase2_lr: float = 1e-4 # A much smaller learning rate for fine-tuning + + +# --- Optimizer Classes --- +class ContinuousOptimizer: + """Finds a high-quality approximate continuous solution.""" + + def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters): + self.target_tensor = target_tensor + self.hypers = hypers + self.opt = optax.adam(hypers.phase1_lr) + + def _get_constrained_decomposition(self, latent_decomposition: tuple) -> tuple: + 
"""Applies a scaled tanh to map latent parameters to the desired range.""" + return jax.tree_util.tree_map( + lambda x: self.hypers.clamp_range * jnp.tanh(x), latent_decomposition + ) + + def _loss_fn(self, latent_decomposition: tuple) -> jnp.ndarray: + constrained = self._get_constrained_decomposition(latent_decomposition) + reconstructed = jnp.einsum("ir,jr,kr->ijk", *constrained) + recon_loss = weighted_l2_loss(reconstructed, self.target_tensor) + l1_penalty = sum(jnp.mean(jnp.abs(arr)) for arr in constrained) + return recon_loss + self.hypers.l1_strength * l1_penalty + + +class DiscreteOptimizer: + """Refines a continuous solution into an exact discrete one using an STE.""" + + def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters): + self.target_tensor = target_tensor + self.hypers = hypers + self.opt = optax.adam(hypers.phase2_lr) + + def _loss_fn(self, continuous_decomposition: tuple) -> jnp.ndarray: + # Snap the continuous parameters to the discrete grid + discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_decomposition) + # Compute the loss using only these exact half-integer values + reconstructed = jnp.einsum("ir,jr,kr->ijk", *discrete_decomposition) + return l2_loss_real(reconstructed, self.target_tensor) + + +# --- JIT-compatible Train Step --- +def train_step(params, opt_state, optimizer, loss_fn): + loss, grads = jax.value_and_grad(loss_fn)(params) + updates, opt_state = optimizer.update(grads, opt_state, params) + params = optax.apply_updates(params, updates) + return params, opt_state, loss + + +def get_matrix_multiplication_tensor(n, m, p): + T = jnp.zeros((n * m, m * p, n * p)) + for i, j, k in np.ndindex(n, m, p): + T = T.at[i * m + j, j * p + k, k * n + i].set(1) + return T + + +def run(): + hypers = Hyperparameters() + target_tensor = get_matrix_multiplication_tensor(n, m, p) + main_key = jax.random.PRNGKey(42) + + # --- PHASE 1: CONTINUOUS EXPLORATION --- + print(f"\n{'='*20} PHASE 1: Continuous Exploration {'='*20}") + best_loss_phase1 = float("inf") + best_latent_decomp = None + + continuous_optimizer = ContinuousOptimizer(target_tensor, hypers) + + # JIT the train_step for the continuous phase + jit_train_step_continuous = jax.jit(train_step, static_argnums=(2, 3)) + + for i in range(hypers.num_restarts): + print(f"\n--- Restart {i+1}/{hypers.num_restarts} ---") + main_key, restart_key = jax.random.split(main_key) + init_fn = jax.nn.initializers.normal(stddev=hypers.init_scale) + latent_decomp = ( + init_fn(restart_key, (n * m, hypers.rank)), + init_fn(restart_key, (m * p, hypers.rank)), + init_fn(restart_key, (n * p, hypers.rank)), + ) + opt_state = continuous_optimizer.opt.init(latent_decomp) + + for _ in tqdm.tqdm(range(hypers.phase1_steps), desc="Continuous Search"): + latent_decomp, opt_state, loss = jit_train_step_continuous( + latent_decomp, + opt_state, + continuous_optimizer.opt, + continuous_optimizer._loss_fn, + ) + + final_loss = l2_loss_real( + target_tensor, + jnp.einsum( + "ir,jr,kr->ijk", + *continuous_optimizer._get_constrained_decomposition(latent_decomp), + ), + ) + print(f"End of Trial | Final continuous loss: {final_loss:.8f}") + + if final_loss < best_loss_phase1: + best_loss_phase1 = final_loss + best_latent_decomp = latent_decomp + + # --- PHASE 2: DISCRETE FINE-TUNING --- + print(f"\n{'='*20} PHASE 2: Discrete Fine-tuning (STE) {'='*20}") + print(f"Starting with best continuous solution (loss: {best_loss_phase1:.8f})") + + continuous_params = 
continuous_optimizer._get_constrained_decomposition(best_latent_decomp) + + discrete_optimizer = DiscreteOptimizer(target_tensor, hypers) + opt_state = discrete_optimizer.opt.init(continuous_params) + + # JIT the train_step for the discrete phase + jit_train_step_discrete = jax.jit(train_step, static_argnums=(2, 3)) + + for step in tqdm.tqdm(range(hypers.phase2_steps), desc="Discrete Fine-tuning"): + continuous_params, opt_state, loss = jit_train_step_discrete( + continuous_params, opt_state, discrete_optimizer.opt, discrete_optimizer._loss_fn + ) + if (step + 1) % 2000 == 0: + print(f"Step {step+1} | Discrete Loss: {loss:.8f}") + if loss < 1e-7: + print("\nFound a perfect solution!") + break + + final_discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_params) + final_loss = l2_loss_real( + target_tensor, jnp.einsum("ir,jr,kr->ijk", *final_discrete_decomposition) + ) + print(f"Search complete. Final discrete loss: {final_loss:.8f}") + + final_decomposition_np = jax.tree_util.tree_map(np.array, final_discrete_decomposition) + return final_decomposition_np, n, m, p, float(final_loss), hypers.rank + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/minimizing_max_min_dist/2/config.yaml b/benchmarks/math/minimizing_max_min_dist/2/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cccde5812635c07b2e1390c45bdd90e087cab6a --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/config.yaml @@ -0,0 +1,29 @@ +# Math benchmark: minimizing_max_min_dist/2 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist focusing on point dispersion\ + \ problems.\nYour task is to evolve a constructor function that generates an optimal arrangement of exactly 16 points\ + \ in 2D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.\n\nPROBLEM CONTEXT:\n\ + - Target: Beat the AlphaEvolve benchmark of min/max ratio = 1/√12.889266112 ≈ 0.2786\n- Constraint: Points must be placed\ + \ in 2D Euclidean space (typically normalized to unit square [0,1] × [0,1])\n- Mathematical formulation: For points Pi\ + \ = (xi, yi), i = 1,...,16:\n * Distance matrix: dij = √[(xi-xj)² + (yi-yj)²] for all i≠j\n * Minimum distance: dmin\ + \ = min{dij : i≠j}\n * Maximum distance: dmax = max{dij : i≠j}\n * Objective: maximize dmin/dmax subject to spatial\ + \ constraints\n\nPERFORMANCE METRICS:\n1. **min_max_ratio**: dmin/dmax ratio (PRIMARY OBJECTIVE - maximize)\n2. **combined_score**:\ + \ min_max_ratio / 0.2786 (progress toward beating AlphaEvolve benchmark)\n3. **eval_time**: Execution time in seconds\ + \ (balance accuracy vs. 
efficiency)\n\nTECHNICAL REQUIREMENTS:\n- **Reproducibility**: Fixed random seeds for all stochastic\ + \ components\n" +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile b/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..094ac2fb911487ddefc2373309eaf501bb0a9687 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py @@ -0,0 +1,78 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for problem of minimizing the ratio of maximum +# to minimum distance on dimension 2 and with 16 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import scipy as sp +import time +import numpy as np + +NUM_POINTS = 16 +DIMENSION = 2 +BENCHMARK = 1 / 12.889266112 + +# Scoring: (dmin/dmax)^2. +# Key reformulation: maximize auxiliary variable t +# subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j). +# This is a constrained NLP with O(n^2) pairwise inequality constraints. 
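+# Illustrative sanity check of the score (not an optimal configuration): a 4x4 grid
+# with spacing 1/3 inside the unit square has dmin = 1/3 and dmax = sqrt(2), so
+# (dmin/dmax)^2 = 1/18 ≈ 0.0556 and combined_score ≈ 0.0556 / BENCHMARK ≈ 0.72.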
+ + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + points = program.min_max_dist_dim2_16() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, DIMENSION): + raise ValueError( + f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}" + ) + + pairwise_distances = sp.spatial.distance.pdist(points) + min_distance = np.min(pairwise_distances) + max_distance = np.max(pairwise_distances) + + inv_ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0 + return { + "min_max_ratio": float(inv_ratio_squared), + "combined_score": float(inv_ratio_squared / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt b/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5576e19feaf684e56c8fd6f43f64cef3f800e53d --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy \ No newline at end of file diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py b/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/minimizing_max_min_dist/2/initial_program.py b/benchmarks/math/minimizing_max_min_dist/2/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..9348ce431ecab8e13a0ed58c8f47d43d7e7db9de --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/initial_program.py @@ -0,0 +1,24 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def min_max_dist_dim2_16() -> np.ndarray: + """ + Creates 16 points in 2 dimensions in order to maximize the ratio of minimum to maximum distance. + + Returns + points: np.ndarray of shape (16,2) containing the (x,y) coordinates of the 16 points. 
+ + """ + + n = 16 + d = 2 + + # places points randomly + np.random.seed(42) + points = np.random.randn(n, d) + + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/minimizing_max_min_dist/3/config.yaml b/benchmarks/math/minimizing_max_min_dist/3/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06094abafc75bff2234417206743f2921a4635f2 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/config.yaml @@ -0,0 +1,29 @@ +# Math benchmark: minimizing_max_min_dist/3 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist focusing on 3D point dispersion\ + \ problems.\nYour task is to evolve a constructor function that generates an optimal arrangement of exactly 14 points\ + \ in 3D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.\n\nPROBLEM CONTEXT:\n\ + - Target: Beat the current state-of-the-art benchmark of min/max ratio = 1/√4.165849767 ≈ 0.4898\n- Constraint: Points\ + \ must be placed in 3D Euclidean space (typically normalized to unit cube [0,1]³ or unit sphere)\n- Mathematical formulation:\ + \ For points Pi = (xi, yi, zi), i = 1,...,14:\n * Distance matrix: dij = √[(xi-xj)² + (yi-yj)² + (zi-zj)²] for all i≠j\n\ + \ * Minimum distance: dmin = min{dij : i≠j}\n * Maximum distance: dmax = max{dij : i≠j}\n * Objective: maximize dmin/dmax\ + \ subject to spatial constraints\n\nPERFORMANCE METRICS:\n1. **min_max_ratio**: dmin/dmax ratio (PRIMARY OBJECTIVE - maximize)\n\ + 2. **combined_score**: min_max_ratio / 0.4898 (progress toward beating AlphaEvolve benchmark)\n3. **eval_time**: Execution\ + \ time in seconds (balance accuracy vs. efficiency)\n\nTECHNICAL REQUIREMENTS:\n- **Reproducibility**: Fixed random seeds\ + \ for all stochastic components\n" +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile b/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. 
+ +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..05b53639da88ca4a5be745b7435c30fab9703fe0 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py @@ -0,0 +1,78 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for problem of minimizing the ratio of maximum +# to minimum distance on dimension 3 and with 14 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import scipy as sp +import time +import numpy as np + +NUM_POINTS = 14 +DIMENSION = 3 +BENCHMARK = 1 / 4.165849767 + +# Scoring: (dmin/dmax)^2. +# Key reformulation: maximize auxiliary variable t +# subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j). +# This is a constrained NLP with O(n^2) pairwise inequality constraints. + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + points = program.min_max_dist_dim3_14() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, DIMENSION): + raise ValueError( + f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}" + ) + + pairwise_distances = sp.spatial.distance.pdist(points) + min_distance = np.min(pairwise_distances) + max_distance = np.max(pairwise_distances) + + inv_ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0 + return { + "min_max_ratio": float(inv_ratio_squared), + "combined_score": float(inv_ratio_squared / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt b/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5576e19feaf684e56c8fd6f43f64cef3f800e53d --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy \ No newline at end of file diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py b/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
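+    # Illustrative example of the split:
+    #   result = {"combined_score": 0.9, "valid": True, "error": "bad shape", "points": [[0, 0, 0]]}
+    #   -> metrics   = {"combined_score": 0.9, "valid": 1.0}
+    #   -> artifacts = {"error": "bad shape", "points": "[[0, 0, 0]]"}
+    # (an "error" key in artifacts also flips status to "error" below).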
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/minimizing_max_min_dist/3/initial_program.py b/benchmarks/math/minimizing_max_min_dist/3/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..d58a3179efe75808f5031fc805016324fe2cad8b --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/initial_program.py @@ -0,0 +1,24 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def min_max_dist_dim3_14() -> np.ndarray: + """ + Creates 14 points in 3 dimensions in order to maximize the ratio of minimum to maximum distance. + + Returns + points: np.ndarray of shape (14,3) containing the (x,y) coordinates of the 14 points. + + """ + + n = 14 + d = 3 + + # places points randomly + np.random.seed(42) + points = np.random.randn(n, d) + + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/signal_processing/README.md b/benchmarks/math/signal_processing/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7494ee3ca15bfc15c3e2eea507c46cbf32b65a51 --- /dev/null +++ b/benchmarks/math/signal_processing/README.md @@ -0,0 +1,46 @@ +# Real-Time Adaptive Signal Processing + +Evolve a real-time adaptive filtering algorithm for non-stationary time series data. The algorithm must filter noise while preserving signal dynamics and minimizing computational latency. + +## Problem + +**Input**: Univariate time series with non-linear dynamics, non-stationary statistics, and rapidly changing spectral characteristics. + +**Constraints**: Causal processing (finite sliding window), fixed latency, real-time capability. + +**Multi-objective function**: +``` +J(theta) = 0.3*S + 0.2*L_recent + 0.2*L_avg + 0.3*R +``` +- **S**: Slope change penalty (directional reversals in filtered signal) +- **L_recent**: Instantaneous lag error +- **L_avg**: Average tracking error +- **R**: False reversal penalty (noise-induced trend changes) + +The evaluator tests on 5 synthetic signals: sinusoidal, multi-frequency, non-stationary, step changes, and random walk. 
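+
+## Interface
+
+The evaluator imports the program and calls a single entry point,
+`run_signal_processing(noisy_signal, window_size)`, expecting a dict with a
+`filtered_signal` array. A minimal sketch of that interface follows; the causal
+moving-average body is only a placeholder, not the algorithm to be evolved:
+
+```python
+import numpy as np
+
+def run_signal_processing(noisy_signal, window_size=20):
+    """Causal moving-average baseline illustrating the expected interface."""
+    x = np.asarray(noisy_signal, dtype=float)
+    filtered = []
+    for n in range(window_size - 1, len(x)):
+        # Only current and past samples are used: a causal sliding window.
+        filtered.append(x[n - window_size + 1 : n + 1].mean())
+    return {"filtered_signal": np.array(filtered)}
+```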
+ +## Run + +```bash +# From repo root +uv run skydiscover-run \ + benchmarks/math/signal_processing/initial_program.py \ + benchmarks/math/signal_processing/evaluator.py \ + -c benchmarks/math/signal_processing/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +## Scoring + +- **combined_score**: Composite J(theta) metric (higher is better) +- Also reports: slope changes, correlation, lag error, noise reduction, processing time + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Seed: basic moving average / weighted exponential filters | +| `evaluator.py` | Multi-objective evaluation across 5 synthetic test signals | +| `config.yaml` | LLM and evaluator settings | +| `requirements.txt` | Python dependencies | diff --git a/benchmarks/math/signal_processing/config.yaml b/benchmarks/math/signal_processing/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e7f684648c48b70a19a99265ffab8e552f8bad9 --- /dev/null +++ b/benchmarks/math/signal_processing/config.yaml @@ -0,0 +1,30 @@ +# Math benchmark: signal_processing +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: 'You are an expert signal processing engineer specializing in real-time adaptive filtering algorithms. Your + task is to improve a signal processing algorithm that filters volatile, non-stationary time series data using a sliding + window approach. The algorithm must minimize noise while preserving signal dynamics with minimal computational latency + and phase delay. Focus on the multi-objective optimization of: (1) Slope change minimization - reducing spurious directional + reversals, (2) Lag error minimization - maintaining responsiveness, (3) Tracking accuracy - preserving genuine signal + trends, and (4) False reversal penalty - avoiding noise-induced trend changes. Consider advanced techniques like adaptive + filtering (Kalman filters, particle filters), multi-scale processing (wavelets, EMD), predictive enhancement (polynomial + fitting, neural networks), and trend detection methods.' +evaluator: + timeout: 360 + cascade_evaluation: true + cascade_thresholds: + - 0.3 + - 0.6 + diff --git a/benchmarks/math/signal_processing/evaluator/Dockerfile b/benchmarks/math/signal_processing/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . 
+RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/signal_processing/evaluator/evaluate.sh b/benchmarks/math/signal_processing/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/signal_processing/evaluator/evaluator.py b/benchmarks/math/signal_processing/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e19d989064b39845e3c39d03bf6478d08e0d5d40 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/evaluator.py @@ -0,0 +1,536 @@ +""" +Evaluator for the Real-Time Adaptive Signal Processing Algorithm + +This evaluator implements the multi-objective optimization function defined in the specification: +J(θ) = α₁·S(θ) + α₂·L_recent(θ) + α₃·L_avg(θ) + α₄·R(θ) + +Where: +- S(θ): Slope change penalty - counts directional reversals +- L_recent(θ): Instantaneous lag error - |y[n] - x[n]| +- L_avg(θ): Average tracking error over window +- R(θ): False reversal penalty - mismatched trend changes +- α₁=0.3, α₂=α₃=0.2, α₄=0.3: Weighting coefficients +""" + +import importlib.util +import numpy as np +import time +import concurrent.futures +import traceback +from scipy import signal +from scipy.stats import pearsonr + + +def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30): + """ + Run a function with a timeout using concurrent.futures + """ + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args, **kwargs) + try: + result = future.result(timeout=timeout_seconds) + return result + except concurrent.futures.TimeoutError: + raise TimeoutError(f"Function timed out after {timeout_seconds} seconds") + + +def safe_float(value): + """Convert a value to float safely""" + try: + if np.isnan(value) or np.isinf(value): + return 0.0 + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def calculate_slope_changes(signal_data): + """ + Calculate slope change penalty S(θ) - counts directional reversals + + Args: + signal_data: 1D array of signal values + + Returns: + Number of slope changes (directional reversals) + """ + if len(signal_data) < 3: + return 0 + + # Calculate differences + diffs = np.diff(signal_data) + + # Count sign changes in consecutive differences + sign_changes = 0 + for i in range(1, len(diffs)): + if np.sign(diffs[i]) != np.sign(diffs[i - 1]) and diffs[i - 1] != 0: + sign_changes += 1 + + return sign_changes + + +def calculate_lag_error(filtered_signal, original_signal, window_size): + """ + Calculate instantaneous lag error L_recent(θ) = |y[n] - x[n]| + + Args: + filtered_signal: Output of the filter + original_signal: Original input signal + window_size: Size of the processing window + + Returns: + Instantaneous lag error at the most recent sample + """ + if len(filtered_signal) == 0: + return 1.0 # Maximum penalty + + # Account for processing delay + delay = window_size - 1 + if len(original_signal) <= delay: + return 1.0 + + # Compare the last filtered sample with the corresponding original sample + recent_filtered = filtered_signal[-1] + recent_original = original_signal[delay + len(filtered_signal) - 1] + + return abs(recent_filtered - 
recent_original) + + +def calculate_average_tracking_error(filtered_signal, original_signal, window_size): + """ + Calculate average tracking error L_avg(θ) over the window + + Args: + filtered_signal: Output of the filter + original_signal: Original input signal + window_size: Size of the processing window + + Returns: + Average absolute error over the processed samples + """ + if len(filtered_signal) == 0: + return 1.0 # Maximum penalty + + # Account for processing delay + delay = window_size - 1 + if len(original_signal) <= delay: + return 1.0 + + # Align signals + aligned_original = original_signal[delay : delay + len(filtered_signal)] + + # Ensure same length + min_length = min(len(filtered_signal), len(aligned_original)) + if min_length == 0: + return 1.0 + + filtered_aligned = filtered_signal[:min_length] + original_aligned = aligned_original[:min_length] + + # Calculate mean absolute error + return np.mean(np.abs(filtered_aligned - original_aligned)) + + +def calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size): + """ + Calculate false reversal penalty R(θ) - mismatched trend changes + + Args: + filtered_signal: Output of the filter + clean_signal: Ground truth clean signal + window_size: Size of the processing window + + Returns: + Penalty for trend changes that don't match the clean signal + """ + if len(filtered_signal) < 3 or len(clean_signal) < 3: + return 0 + + # Account for processing delay + delay = window_size - 1 + if len(clean_signal) <= delay: + return 1.0 + + # Align signals + aligned_clean = clean_signal[delay : delay + len(filtered_signal)] + min_length = min(len(filtered_signal), len(aligned_clean)) + + if min_length < 3: + return 0 + + filtered_aligned = filtered_signal[:min_length] + clean_aligned = aligned_clean[:min_length] + + # Calculate trend changes for both signals + filtered_diffs = np.diff(filtered_aligned) + clean_diffs = np.diff(clean_aligned) + + # Count mismatched trend changes + false_reversals = 0 + for i in range(1, len(filtered_diffs)): + # Check if there's a trend change in filtered signal + filtered_change = ( + np.sign(filtered_diffs[i]) != np.sign(filtered_diffs[i - 1]) + and filtered_diffs[i - 1] != 0 + ) + + # Check if there's a corresponding trend change in clean signal + clean_change = ( + np.sign(clean_diffs[i]) != np.sign(clean_diffs[i - 1]) and clean_diffs[i - 1] != 0 + ) + + # Count as false reversal if filtered has change but clean doesn't + if filtered_change and not clean_change: + false_reversals += 1 + + return false_reversals + + +def calculate_composite_score(S, L_recent, L_avg, R, alpha=[0.3, 0.2, 0.2, 0.3]): + """ + Calculate the composite metric J(θ) = α₁·S(θ) + α₂·L_recent(θ) + α₃·L_avg(θ) + α₄·R(θ) + + All metrics are normalized and converted to penalties (higher = worse) + The final score is converted to a maximization problem (higher = better) + """ + # Normalize slope changes (typical range 0-100) + S_norm = min(S / 50.0, 2.0) + + # Lag errors are already in reasonable range (0-10 typically) + L_recent_norm = min(L_recent, 2.0) + L_avg_norm = min(L_avg, 2.0) + + # Normalize false reversals (typical range 0-50) + R_norm = min(R / 25.0, 2.0) + + # Calculate weighted penalty + penalty = ( + alpha[0] * S_norm + alpha[1] * L_recent_norm + alpha[2] * L_avg_norm + alpha[3] * R_norm + ) + + # Convert to maximization score (higher is better) + score = 1.0 / (1.0 + penalty) + + return score + + +def generate_test_signals(num_signals=5): + """ + Generate multiple test signals with different characteristics + 
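+
+    Returns a list of ``(noisy_signal, clean_signal)`` tuples, one per test case.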
""" + test_signals = [] + + for i in range(num_signals): + np.random.seed(42 + i) # Different seed for each signal + length = 500 + i * 100 # Varying lengths + noise_level = 0.2 + i * 0.1 # Varying noise levels + + t = np.linspace(0, 10, length) + + # Different signal characteristics + if i == 0: + # Smooth sinusoidal with trend + clean = 2 * np.sin(2 * np.pi * 0.5 * t) + 0.1 * t + elif i == 1: + # Multiple frequency components + clean = ( + np.sin(2 * np.pi * 0.5 * t) + + 0.5 * np.sin(2 * np.pi * 2 * t) + + 0.2 * np.sin(2 * np.pi * 5 * t) + ) + elif i == 2: + # Non-stationary with changing frequency + clean = np.sin(2 * np.pi * (0.5 + 0.2 * t) * t) + elif i == 3: + # Step changes + clean = np.concatenate( + [ + np.ones(length // 3), + 2 * np.ones(length // 3), + 0.5 * np.ones(length - 2 * (length // 3)), + ] + ) + else: + # Random walk with trend + clean = np.cumsum(np.random.randn(length) * 0.1) + 0.05 * t + + # Add noise + noise = np.random.normal(0, noise_level, length) + noisy = clean + noise + + test_signals.append((noisy, clean)) + + return test_signals + + +# Input: run_signal_processing(noisy_signal, window_size) — full signal array and window size. +# Scoring: composite of smoothness, tracking accuracy, correlation, and noise reduction. + +def evaluate(program_path): + """ + Main evaluation function that tests the signal processing algorithm + on multiple test signals and calculates the composite performance metric. + """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "run_signal_processing"): + return {"combined_score": 0.0, "composite_score": 0.0, "error": "Missing run_signal_processing function"} + + # Generate test signals + test_signals = generate_test_signals(5) + + # Collect metrics across all test signals + all_scores = [] + all_metrics = [] + successful_runs = 0 + + for i, (noisy_signal, clean_signal) in enumerate(test_signals): + try: + # Run the algorithm with timeout + start_time = time.time() + + # Call the program's main function + result = run_with_timeout( + program.run_signal_processing, + kwargs={ + "noisy_signal": noisy_signal, + "window_size": 20, + }, + timeout_seconds=10, + ) + + execution_time = time.time() - start_time + + # Validate result format + if not isinstance(result, dict): + print(f"Signal {i}: Invalid result format") + continue + + if "filtered_signal" not in result: + print(f"Signal {i}: Missing filtered_signal in result") + continue + + filtered_signal = result["filtered_signal"] + + if len(filtered_signal) == 0: + print(f"Signal {i}: Empty filtered signal") + continue + + # Convert to numpy arrays + filtered_signal = np.array(filtered_signal) + + # Calculate metrics using the generated test signal + window_size = 20 + + # Calculate all penalty components + S = calculate_slope_changes(filtered_signal) + L_recent = calculate_lag_error(filtered_signal, noisy_signal, window_size) + L_avg = calculate_average_tracking_error(filtered_signal, noisy_signal, window_size) + R = calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size) + + # Calculate composite score + composite_score = calculate_composite_score(S, L_recent, L_avg, R) + + # Additional quality metrics + correlation = 0.0 + noise_reduction = 0.0 + + try: + # Calculate correlation with clean signal + delay = window_size - 1 + aligned_clean = clean_signal[delay : delay + 
len(filtered_signal)] + min_length = min(len(filtered_signal), len(aligned_clean)) + + if min_length > 1: + corr_result = pearsonr( + filtered_signal[:min_length], aligned_clean[:min_length] + ) + correlation = corr_result[0] if not np.isnan(corr_result[0]) else 0.0 + + # Calculate noise reduction + aligned_noisy = noisy_signal[delay : delay + len(filtered_signal)] + aligned_noisy = aligned_noisy[:min_length] + aligned_clean = aligned_clean[:min_length] + + if min_length > 0: + noise_before = np.var(aligned_noisy - aligned_clean) + noise_after = np.var(filtered_signal[:min_length] - aligned_clean) + noise_reduction = ( + (noise_before - noise_after) / noise_before if noise_before > 0 else 0 + ) + noise_reduction = max(0, noise_reduction) # Ensure non-negative + + except Exception as e: + print(f"Signal {i}: Error calculating additional metrics: {e}") + + # Store metrics + metrics = { + "slope_changes": safe_float(S), + "lag_error": safe_float(L_recent), + "avg_error": safe_float(L_avg), + "false_reversals": safe_float(R), + "composite_score": safe_float(composite_score), + "correlation": safe_float(correlation), + "noise_reduction": safe_float(noise_reduction), + "execution_time": safe_float(execution_time), + "signal_length": len(filtered_signal), + } + + all_scores.append(composite_score) + all_metrics.append(metrics) + successful_runs += 1 + + except TimeoutError: + print(f"Signal {i}: Timeout") + continue + except Exception as e: + print(f"Signal {i}: Error - {str(e)}") + continue + + # If no successful runs, return minimal scores + if successful_runs == 0: + return { + "combined_score": 0.0, + "composite_score": 0.0, + "slope_changes": 100.0, + "lag_error": 1.0, + "avg_error": 1.0, + "false_reversals": 50.0, + "correlation": 0.0, + "noise_reduction": 0.0, + "success_rate": 0.0, + "error": "All test signals failed", + } + + # Calculate aggregate metrics + avg_composite_score = np.mean(all_scores) + avg_slope_changes = np.mean([m["slope_changes"] for m in all_metrics]) + avg_lag_error = np.mean([m["lag_error"] for m in all_metrics]) + avg_avg_error = np.mean([m["avg_error"] for m in all_metrics]) + avg_false_reversals = np.mean([m["false_reversals"] for m in all_metrics]) + avg_correlation = np.mean([m["correlation"] for m in all_metrics]) + avg_noise_reduction = np.mean([m["noise_reduction"] for m in all_metrics]) + avg_execution_time = np.mean([m["execution_time"] for m in all_metrics]) + success_rate = successful_runs / len(test_signals) + + # Calculate additional derived scores + smoothness_score = 1.0 / (1.0 + avg_slope_changes / 20.0) # Higher is better + responsiveness_score = 1.0 / (1.0 + avg_lag_error) # Higher is better + accuracy_score = max(0, avg_correlation) # 0-1, higher is better + efficiency_score = min(1.0, 1.0 / max(0.001, avg_execution_time)) # Speed bonus + + # Overall score combining multiple factors + overall_score = ( + 0.4 * avg_composite_score # Primary metric + + 0.2 * smoothness_score # Smoothness + + 0.2 * accuracy_score # Correlation with clean signal + + 0.1 * avg_noise_reduction # Noise reduction capability + + 0.1 * success_rate # Reliability + ) + + # Gate: zero out score if accuracy is too low + if accuracy_score < 0.1: + overall_score = 0.0 + + return { + "combined_score": safe_float(overall_score), # Primary selection metric for SkyDiscover + "composite_score": safe_float(avg_composite_score), + "overall_score": safe_float(overall_score), + "slope_changes": safe_float(avg_slope_changes), + "lag_error": safe_float(avg_lag_error), + "avg_error": 
safe_float(avg_avg_error), + "false_reversals": safe_float(avg_false_reversals), + "correlation": safe_float(avg_correlation), + "noise_reduction": safe_float(avg_noise_reduction), + "smoothness_score": safe_float(smoothness_score), + "responsiveness_score": safe_float(responsiveness_score), + "accuracy_score": safe_float(accuracy_score), + "efficiency_score": safe_float(efficiency_score), + "execution_time": safe_float(avg_execution_time), + "success_rate": safe_float(success_rate), + } + + except Exception as e: + print(f"Evaluation failed: {str(e)}") + print(traceback.format_exc()) + return {"combined_score": 0.0, "composite_score": 0.0, "overall_score": 0.0, "error": str(e)} + + +def evaluate_stage1(program_path): + """ + Stage 1 evaluation: Quick validation that the program runs without errors + """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "run_signal_processing"): + return {"runs_successfully": 0.0, "error": "Missing run_signal_processing function"} + + # Generate a small test signal (consistent with evaluate() API) + np.random.seed(42) + signal_length = 100 + window_size = 10 + t = np.linspace(0, 2, signal_length) + clean_signal = np.sin(2 * np.pi * 0.5 * t) + noisy_signal = clean_signal + np.random.normal(0, 0.3, signal_length) + + # Quick test with small signal + try: + result = run_with_timeout( + program.run_signal_processing, + kwargs={"noisy_signal": noisy_signal, "window_size": window_size}, + timeout_seconds=5, + ) + + if isinstance(result, dict) and "filtered_signal" in result: + filtered_signal = result["filtered_signal"] + if len(filtered_signal) > 0: + # Quick quality check + composite_score = 0.5 # Baseline score for working programs + + # Bonus for reasonable output length + expected_length = signal_length - window_size + 1 + if len(filtered_signal) == expected_length: + composite_score += 0.2 + + return { + "runs_successfully": 1.0, + "composite_score": composite_score, + "output_length": len(filtered_signal), + } + else: + return {"runs_successfully": 0.5, "error": "Empty filtered signal"} + else: + return {"runs_successfully": 0.3, "error": "Invalid result format"} + + except TimeoutError: + return {"runs_successfully": 0.0, "error": "Timeout in stage 1"} + except Exception as e: + return {"runs_successfully": 0.0, "error": f"Stage 1 error: {str(e)}"} + + except Exception as e: + return {"runs_successfully": 0.0, "error": f"Stage 1 failed: {str(e)}"} + + +def evaluate_stage2(program_path): + """ + Stage 2 evaluation: Full evaluation with all test signals + """ + return evaluate(program_path) + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/signal_processing/evaluator/requirements.txt b/benchmarks/math/signal_processing/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e8a3b7abecc7bf6cb941f155e91fb39a40afd6e --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/requirements.txt @@ -0,0 +1,4 @@ +# Requirements for Real-Time Signal Processing Example +numpy>=1.21.0 +scipy>=1.7.0 +PyWavelets diff --git a/benchmarks/math/signal_processing/evaluator/wrapper.py b/benchmarks/math/signal_processing/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
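+    # isinstance(v, bool) is checked before the generic numeric branch because
+    # bool is a subclass of int in Python; booleans are stored as 0.0 / 1.0.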
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/signal_processing/initial_program.py b/benchmarks/math/signal_processing/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fef3b2e848629915b1f76feb0d494c3309b60f --- /dev/null +++ b/benchmarks/math/signal_processing/initial_program.py @@ -0,0 +1,207 @@ +# EVOLVE-BLOCK-START +""" +Real-Time Adaptive Signal Processing Algorithm for Non-Stationary Time Series + +This algorithm implements a sliding window approach to filter volatile, non-stationary +time series data while minimizing noise and preserving signal dynamics. +""" +import numpy as np + + +def adaptive_filter(x, window_size=20): + """ + Adaptive signal processing algorithm using sliding window approach. + + Args: + x: Input signal (1D array of real-valued samples) + window_size: Size of the sliding window (W samples) + + Returns: + y: Filtered output signal with length = len(x) - window_size + 1 + """ + if len(x) < window_size: + raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") + + # Initialize output array + output_length = len(x) - window_size + 1 + y = np.zeros(output_length) + + # Simple moving average as baseline + for i in range(output_length): + window = x[i : i + window_size] + + # Basic moving average filter + y[i] = np.mean(window) + + return y + + +def enhanced_filter_with_trend_preservation(x, window_size=20): + """ + Enhanced version with trend preservation using weighted moving average. + + Args: + x: Input signal (1D array of real-valued samples) + window_size: Size of the sliding window + + Returns: + y: Filtered output signal + """ + if len(x) < window_size: + raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") + + output_length = len(x) - window_size + 1 + y = np.zeros(output_length) + + # Create weights that emphasize recent samples + weights = np.exp(np.linspace(-2, 0, window_size)) + weights = weights / np.sum(weights) + + for i in range(output_length): + window = x[i : i + window_size] + + # Weighted moving average with exponential weights + y[i] = np.sum(window * weights) + + return y + + +def process_signal(input_signal, window_size=20, algorithm_type="enhanced"): + """ + Main signal processing function that applies the selected algorithm. + + Args: + input_signal: Input time series data + window_size: Window size for processing + algorithm_type: Type of algorithm to use ("basic" or "enhanced") + + Returns: + Filtered signal + """ + if algorithm_type == "enhanced": + return enhanced_filter_with_trend_preservation(input_signal, window_size) + else: + return adaptive_filter(input_signal, window_size) + + +# EVOLVE-BLOCK-END + + +def generate_test_signal(length=1000, noise_level=0.3, seed=42): + """ + Generate synthetic test signal with known characteristics. 
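+
+    The clean signal mixes three sinusoids, a decaying oscillation, a slowly
+    varying trend, and a random-walk component; Gaussian noise is then added
+    on top to form the noisy observation.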
+ + Args: + length: Length of the signal + noise_level: Standard deviation of noise to add + seed: Random seed for reproducibility + + Returns: + Tuple of (noisy_signal, clean_signal) + """ + np.random.seed(seed) + t = np.linspace(0, 10, length) + + # Create a complex signal with multiple components + clean_signal = ( + 2 * np.sin(2 * np.pi * 0.5 * t) # Low frequency component + + 1.5 * np.sin(2 * np.pi * 2 * t) # Medium frequency component + + 0.5 * np.sin(2 * np.pi * 5 * t) # Higher frequency component + + 0.8 * np.exp(-t / 5) * np.sin(2 * np.pi * 1.5 * t) # Decaying oscillation + ) + + # Add non-stationary behavior + trend = 0.1 * t * np.sin(0.2 * t) # Slowly varying trend + clean_signal += trend + + # Add random walk component for non-stationarity + random_walk = np.cumsum(np.random.randn(length) * 0.05) + clean_signal += random_walk + + # Add noise + noise = np.random.normal(0, noise_level, length) + noisy_signal = clean_signal + noise + + return noisy_signal, clean_signal + + +def run_signal_processing(noisy_signal=None, signal_length=1000, noise_level=0.3, window_size=20): + """ + Run the signal processing algorithm on a test signal. + + Args: + noisy_signal: Input signal to filter (if provided, use this; otherwise generate) + signal_length: Length if generating signal (for backward compatibility) + noise_level: Noise level if generating signal (for backward compatibility) + window_size: Window size for processing + + Returns: + Dictionary containing results and metrics + """ + # Use provided signal or generate test signal (for backward compatibility) + if noisy_signal is not None: + # Filter the provided signal + filtered_signal = process_signal(noisy_signal, window_size, "enhanced") + clean_signal = None # Not available when using provided signal + else: + # Generate test signal (for __main__ and backward compatibility) + noisy_signal, clean_signal = generate_test_signal(signal_length, noise_level) + filtered_signal = process_signal(noisy_signal, window_size, "enhanced") + + # Calculate basic metrics (only if we have clean_signal from generation) + if len(filtered_signal) > 0 and clean_signal is not None: + # Align signals for comparison (account for processing delay) + delay = window_size - 1 + aligned_clean = clean_signal[delay:] + aligned_noisy = noisy_signal[delay:] + + # Ensure same length + min_length = min(len(filtered_signal), len(aligned_clean)) + filtered_signal = filtered_signal[:min_length] + aligned_clean = aligned_clean[:min_length] + aligned_noisy = aligned_noisy[:min_length] + + # Calculate correlation with clean signal + correlation = np.corrcoef(filtered_signal, aligned_clean)[0, 1] if min_length > 1 else 0 + + # Calculate noise reduction + noise_before = np.var(aligned_noisy - aligned_clean) + noise_after = np.var(filtered_signal - aligned_clean) + noise_reduction = (noise_before - noise_after) / noise_before if noise_before > 0 else 0 + + return { + "filtered_signal": filtered_signal, + "clean_signal": aligned_clean, + "noisy_signal": aligned_noisy, + "correlation": correlation, + "noise_reduction": noise_reduction, + "signal_length": min_length, + } + elif len(filtered_signal) > 0: + # When using provided signal (no clean_signal available), just return filtered signal + return { + "filtered_signal": filtered_signal, + "clean_signal": None, + "noisy_signal": None, + "correlation": 0, + "noise_reduction": 0, + "signal_length": len(filtered_signal), + } + else: + return { + "filtered_signal": [], + "clean_signal": [], + "noisy_signal": [], + "correlation": 0, + 
"noise_reduction": 0, + "signal_length": 0, + } + + +if __name__ == "__main__": + # Test the algorithm + results = run_signal_processing() + print("Signal processing completed!") + print(f"Correlation with clean signal: {results['correlation']:.3f}") + print(f"Noise reduction: {results['noise_reduction']:.3f}") + print(f"Processed signal length: {results['signal_length']}") diff --git a/benchmarks/math/sums_diffs_finite_sets/config.yaml b/benchmarks/math/sums_diffs_finite_sets/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72441284aae838b9cf735ccf640715eb05d62fce --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/config.yaml @@ -0,0 +1,41 @@ +# Math benchmark: sums_diffs_finite_sets +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in number theory, combinatorial optimization, and AI-driven mathematical discovery. + Your task is to evolve and optimize a Python script to find a finite set of integers `U` that provides a new, world-record **lower bound** for the constant C₆. + + PROBLEM CONTEXT: + Target: Find a finite set `U` of non-negative integers (containing 0) that **maximizes** the objective function: + C6(U) = 1 + log(|U-U| / |U+U|) / log(2*max(U) + 1) + + This maximum value provides a tight lower bound for the constant C6. + + Current best known lower bound: C6 ≥ 1.158417281556896 + Goal: Find a set `U` that results in a C6 value greater than 1.158417281556896. + + PERFORMANCE METRICS: + - c6_bound: Bound found. + - combined_score: c6_bound/1.158417281556896 (The primary objective is to MAXIMIZE this value - a value > 1 means a new record). + - set_size: size of the set U. + - max_val: max value in the set U. + - eval_time: evaluation time of the main program. + + VALIDATION FRAMEWORK: + - The evaluation script re-computes the C6 value using standard NumPy set operations and verifies the constraints on `U`. +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/Dockerfile b/benchmarks/math/sums_diffs_finite_sets/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. 
+ +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..04deb415b541c38754118112e88f742e77598404 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py @@ -0,0 +1,97 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the sums and differences of finite sets problem. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +BENCHMARK = 1.158417281556896 + + +def verify_c6_solution(u_set: np.ndarray, c6_achieved: float): + """Verifies the C6 lower bound solution.""" + + if not isinstance(u_set, np.ndarray) or u_set.ndim != 1: + raise ValueError("Solution U must be a 1D numpy array of integers.") + + # Verify constraints + if 0 not in u_set: + raise ValueError("Set U must contain 0.") + if np.any(u_set < 0): + raise ValueError("Set U must contain non-negative integers.") + + # Re-calculate the C6 bound using NumPy + u_plus_u = np.unique(u_set[:, None] + u_set[None, :]) + u_minus_u = np.unique(u_set[:, None] - u_set[None, :]) + + size_U_plus_U = len(u_plus_u) + size_U_minus_U = len(u_minus_u) + max_U = np.max(u_set) + + ratio = size_U_minus_U / size_U_plus_U + log_ratio = np.log(ratio) + log_denom = np.log(2 * max_U + 1) + + computed_c6 = 1 + log_ratio / log_denom + + # Check for consistency + if not np.isclose(computed_c6, c6_achieved): + raise ValueError(f"C6 mismatch: reported {c6_achieved:.6f}, computed {computed_c6:.6f}") + + print(f"C6 lower bound achieved: {c6_achieved:.6f}") + print(f"Known best bound (AlphaEvolve): {BENCHMARK}") + + if c6_achieved > BENCHMARK: + print("Successfully found a new, better lower bound!") + else: + print("Result is not better than the known lower bounds.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + u_set, c6_bound = program.run() + end_time = time.time() + eval_time = end_time - start_time + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_c6_solution(u_set, c6_bound) + + return { + "c6_bound": float(c6_bound), + "combined_score": float(c6_bound) / BENCHMARK, + "set_size": len(u_set), + "max_val": int(np.max(u_set)), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt b/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py b/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/sums_diffs_finite_sets/initial_program.py b/benchmarks/math/sums_diffs_finite_sets/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..d63997af4b28cd0838812006562c60d1cf710bed --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/initial_program.py @@ -0,0 +1,117 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +from dataclasses import dataclass +import numpy as np +import tqdm + + +@dataclass +class Hyperparameters: + max_integer: int = 250 + num_restarts: int = 5 + num_search_steps: int = 1000 + initial_temperature: float = 0.01 + + +class C6Searcher: + """ + Searches for a set U by running the search in pure Python for correctness. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.allowed_values = jnp.array((-1, 0, 1), dtype=jnp.int32) + + @staticmethod + def _objective_fn(u_mask: jnp.ndarray) -> jnp.ndarray: + """Calculates the C6 lower bound using jnp.unique""" + U = jnp.where(u_mask)[0] + + sums = U[:, None] + U[None, :] + diffs = U[:, None] - U[None, :] + + size_U_plus_U = jnp.unique(sums).shape[0] + size_U_minus_U = jnp.unique(diffs).shape[0] + max_U = jnp.max(U) + + # Handle the case where max_U is 0 to avoid log(1)=0 in denominator + if max_U == 0: + return -1.0 # Return a low value for trivial sets + + ratio = size_U_minus_U / size_U_plus_U + c6_bound = 1 + jnp.log(ratio) / jnp.log(2 * max_U + 1) + + return -c6_bound # Return negative for maximization + + def anneal_step(self, key, temp, current_mask, current_loss): + """Performs one step of Simulated Annealing (not JIT-compiled).""" + # Propose a random mutation + idx_to_flip = jax.random.randint(key, (), 1, len(current_mask)) + neighbor_mask = current_mask.at[idx_to_flip].set(1 - current_mask[idx_to_flip]) + + neighbor_loss = self._objective_fn(neighbor_mask) + delta_loss = neighbor_loss - current_loss + + # Metropolis acceptance criterion + should_accept = False + if delta_loss < 0: + should_accept = True + else: + accept_prob = jnp.exp(-delta_loss / temp) + if jax.random.uniform(key) < accept_prob: + should_accept = True + + if should_accept: + return neighbor_mask, neighbor_loss + else: + return current_mask, current_loss + + +def run(): + hypers = Hyperparameters() + main_key = jax.random.PRNGKey(42) + + best_loss = float("inf") + best_set_np = None + + for i in range(hypers.num_restarts): + print(f"\n{'='*20} Restart {i+1}/{hypers.num_restarts} {'='*20}") + restart_key, main_key = jax.random.split(main_key) + loss, u_set_np = run_single_trial(hypers, restart_key) + + if loss < best_loss: + print(f"New best C6 bound found: {-loss:.8f}") + best_loss = loss + best_set_np = u_set_np + + c6_bound = -best_loss + print(f"\nSearch complete. 
Best C6 lower bound found: {c6_bound:.8f}") + return best_set_np, c6_bound + + +def run_single_trial(hypers, key): + # Initialize a random sparse set, ensuring 0 is included + key, subkey = jax.random.split(key) + sparsity = 0.95 + u_mask = jax.random.bernoulli(subkey, p=(1 - sparsity), shape=(hypers.max_integer + 1,)) + u_mask = u_mask.at[0].set(True) + + searcher = C6Searcher(hypers) + current_loss = searcher._objective_fn(u_mask) + + print(f"Starting SA search. Initial C6 bound: {-current_loss:.6f}") + + current_mask = u_mask + for step in tqdm.tqdm(range(hypers.num_search_steps), desc="Annealing Progress"): + key, subkey = jax.random.split(key) + current_temp = hypers.initial_temperature * (1 - step / hypers.num_search_steps) + current_mask, current_loss = searcher.anneal_step( + subkey, jnp.maximum(current_temp, 1e-6), current_mask, current_loss + ) + + final_set = np.where(current_mask)[0] + return current_loss, final_set + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/third_autocorr_ineq/config.yaml b/benchmarks/math/third_autocorr_ineq/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6abd6d4eb665b10f22973c78496b7d64c5d90ce8 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/config.yaml @@ -0,0 +1,66 @@ +# Math benchmark: third_autocorr_ineq +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: 'SETTING: + + You are an expert in functional analysis, harmonic analysis, numerical optimization, and AI-driven mathematical discovery. + + Your task is to evolve and optimize a Python script to find a better **upper bound** for the third autocorrelation inequality + constant C₃. + + + PROBLEM CONTEXT: + + Target: Find a function f: R → R (which can take positive and negative values) that **minimizes** the constant C3 in the + inequality: + + max_{-1/2≤t≤1/2} |f ★ f(t)| ≥ C3 (∫_{-1/4}^{1/4} f(x) dx)² + + + This is equivalent to minimizing the ratio: C3 = max |f ★ f| / (∫f)² + + + Current best known bound: C3 ≤ 1.45810 + + Goal: Beat the AlphaEvolve upper bound of 1.4556427953745406. + + + Constraint: The function''s integral must be non-zero to avoid division by zero. + + + PERFORMANCE METRICS: + + - c3: The C3 constant achieved by the discovered function. + + - combined_score: 1.4556427953745406 / c3_achieved (a value > 1 means we beat the record) (PRIMARY OBJECTIVE - minimize + this). + + - loss: loss value returned by the loss function. + + - n_points: number of points used to the discretization of the interval. + + - eval_time: evaluation time to run the solution script. + + + VALIDATION FRAMEWORK: + + - The evaluation script re-computes the C3 ratio using `numpy.convolve` and `numpy.abs` to verify the value from the optimizer. + + - It checks that the function''s integral is not close to zero. + + ' +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile b/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . 
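+# Copying requirements.txt before the rest of the context lets Docker cache
+# the pip-install layer when only the evaluator code changes.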
+RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh b/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py b/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..6481d1e4b849d28e248dc3bf8615696568903abe --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py @@ -0,0 +1,91 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the third autocorrelation inequality problem. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +# Known bounds +BENCHMARK = 1.4556427953745406 +# Note: This is a non-convex optimization problem with multiple local optima. +# Observed performance depends on search space coverage and exploration thoroughness. +# Computational budget can be allocated toward improving solution quality. + + +def verify_c3_solution(f_values: np.ndarray, c3_achieved: float, n_points: int): + """Verify the solution for the C3 UPPER BOUND optimization.""" + + if f_values.shape != (n_points,): + raise ValueError(f"Expected function values shape {(n_points,)}. 
Got {f_values.shape}.") + + # Recompute C3 using NumPy to verify + dx = 0.5 / n_points + + # Squared integral of f + integral_f_sq = (np.sum(f_values) * dx) ** 2 + + if integral_f_sq < 1e-9: + raise ValueError("Function integral is close to zero, ratio is unstable.") + + # Max absolute value of the scaled autoconvolution + conv = np.convolve(f_values, f_values, mode="full") + scaled_conv = conv * dx + max_abs_conv = np.max(np.abs(scaled_conv)) + + computed_c3 = max_abs_conv / integral_f_sq + + delta = abs(computed_c3 - c3_achieved) + if delta > 1e-3: + raise ValueError( + f"C3 mismatch: reported {c3_achieved:.6f}, computed {computed_c3:.6f}, delta: {delta:.6f}" + ) + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + f_values, c3_achieved, loss, n_points = program.run() + end_time = time.time() + eval_time = end_time - start_time + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_c3_solution(f_values, c3_achieved, n_points) + + return { + "c3": float(c3_achieved), + "combined_score": BENCHMARK / float(c3_achieved), + "loss": float(loss), + "n_points": int(n_points), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt b/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py b/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/third_autocorr_ineq/initial_program.py b/benchmarks/math/third_autocorr_ineq/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8bbfb40b60285d3d9b6ea841fbfe1045735235 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/initial_program.py @@ -0,0 +1,107 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass + + +@dataclass +class Hyperparameters: + """Hyperparameters for the optimization process.""" + + num_intervals: int = 400 + learning_rate: float = 0.005 + num_steps: int = 20000 + warmup_steps: int = 2000 + + +class C3Optimizer: + """ + Optimizes a function f (with positive and negative values) to find an + upper bound for the C3 constant. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.domain_width = 0.5 + self.dx = self.domain_width / self.hypers.num_intervals + + def _objective_fn(self, f_values: jnp.ndarray) -> jnp.ndarray: + """ + Computes the C3 ratio. The goal is to minimize this value. + """ + # The squared integral of f. + integral_f = jnp.sum(f_values) * self.dx + eps = 1e-9 + integral_f_sq_safe = jnp.maximum(integral_f**2, eps) + + # The max of the absolute value of the autoconvolution. + N = self.hypers.num_intervals + padded_f = jnp.pad(f_values, (0, N)) + + fft_f = jnp.fft.fft(padded_f) + conv_f_f = jnp.fft.ifft(fft_f * fft_f).real + + # Scale the unscaled convolution sum by dx to approximate the integral. + scaled_conv_f_f = conv_f_f * self.dx + + # Take the maximum of the absolute value. + max_abs_conv = jnp.max(jnp.abs(scaled_conv_f_f)) + + c3_ratio = max_abs_conv / integral_f_sq_safe + + # We want to MINIMIZE the ratio. 
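+        # Note: because f is zero-padded to length 2N, the circular FFT
+        # convolution above equals the full linear autoconvolution
+        # np.convolve(f_values, f_values, mode="full") (plus one trailing
+        # zero), so it matches the evaluator's independent recomputation.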
+ return c3_ratio + + def train_step(self, f_values: jnp.ndarray, opt_state: optax.OptState) -> tuple: + """Performs a single training step.""" + loss, grads = jax.value_and_grad(self._objective_fn)(f_values) + updates, opt_state = self.optimizer.update(grads, opt_state, f_values) + f_values = optax.apply_updates(f_values, updates) + return f_values, opt_state, loss + + def run_optimization(self): + """Sets up and runs the full optimization process.""" + schedule = optax.warmup_cosine_decay_schedule( + init_value=0.0, + peak_value=self.hypers.learning_rate, + warmup_steps=self.hypers.warmup_steps, + decay_steps=self.hypers.num_steps - self.hypers.warmup_steps, + end_value=self.hypers.learning_rate * 1e-4, + ) + self.optimizer = optax.adam(learning_rate=schedule) + + key = jax.random.PRNGKey(42) + f_values = jax.random.normal(key, (self.hypers.num_intervals,)) + + opt_state = self.optimizer.init(f_values) + print( + f"Number of intervals (N): {self.hypers.num_intervals}, Steps: {self.hypers.num_steps}" + ) + train_step_jit = jax.jit(self.train_step) + + loss = jnp.inf + for step in range(self.hypers.num_steps): + f_values, opt_state, loss = train_step_jit(f_values, opt_state) + if step % 1000 == 0 or step == self.hypers.num_steps - 1: + print(f"Step {step:5d} | C3 ≈ {loss:.8f}") + + final_c3 = loss + print(f"Final C3 upper bound found: {final_c3:.8f}") + return f_values, final_c3 + + +def run(): + """Entry point for running the optimization.""" + hypers = Hyperparameters() + optimizer = C3Optimizer(hypers) + optimized_f, final_c3_val = optimizer.run_optimization() + + loss_val = final_c3_val + f_values_np = np.array(optimized_f) + + return f_values_np, float(final_c3_val), float(loss_val), hypers.num_intervals + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/uncertainty_ineq/config.yaml b/benchmarks/math/uncertainty_ineq/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..91a826b02896ef79c4d2e7d60e3b28ab8732a28b --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/config.yaml @@ -0,0 +1,45 @@ +# Math benchmark: uncertainty_ineq +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery. + Your task is to evolve and optimize a Python script to find a better **upper bound** for the uncertainty inequality constant C₄. + + PROBLEM CONTEXT: + Target: Find an even function f(x) that **minimizes** the product A(f)A(f̂), where A(f) is the largest positive root of f. + This minimal product provides a tight upper bound for the constant C₄. + + Current best known upper bound: C₄ ≤ 0.3215872333529007 + Goal: Find a set of coefficients for a test function that results in a C₄ value lower than 0.3215872333529007. + + METHOD: + The test function is parameterized as f(x) = P(x)exp(-πx²), where P(x) is a linear combination of even Hermite polynomials: P(x) = c₀H₀(x) + c₁H₄(x) + c₂H₈(x) + ... + The problem simplifies to finding coefficients [c₀, c₁, c₂, ...] that minimize the largest positive root of P(x), subject to the constraint P(0) = 0. + The final C₄ bound is the square of this minimal root, (r_max)². + + PERFORMANCE METRICS: + - c4_bound: bound found by the algorithm. 
+ - combined_score: 0.3215872333529007 / c4_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record). + - r_max: largest positive root found. + - coeffs: coefficients found in the optimization. + - eval_time: evaluation time of the script. + + VALIDATION FRAMEWORK: + - The evaluation script reconstructs the polynomial from the discovered coefficients and uses `numpy.roots` to independently verify the largest positive root and the C4 bound. + - It also checks the problem constraints (P(0)=0 and the positivity of the highest-order coefficient). +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile b/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh b/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py b/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..4e7ea5dda4e60ee3184679d9e1fe726d73a2c9c3 --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py @@ -0,0 +1,137 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the uncertainty inequality problem. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys, os, time, numpy as np +import sympy as sp + +BENCHMARK = 0.3215872333529007 + +x = sp.symbols("x") + + +def _hermite_4k_polys(m: int): + degrees = [4 * k for k in range(m)] + Hs = [sp.polys.orthopolys.hermite_poly(n=d, x=x, polys=False) for d in degrees] + return Hs, degrees + + +def _construct_P_with_forced_zero(coeffs: np.ndarray) -> sp.Expr: + """ + Given m input coeffs (c0..c_{m-1}), build the Hermite combo + c0*H0 + ... + c_{m-1}*H_{4(m-1)} + c_last*H_{4m} + where c_last is chosen so that P(0) = 0. + Also flip sign if limit at +inf is negative. 
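+    Negating P leaves its roots, and hence r_max, unchanged. As an
+    illustration, with m = 1 the forced coefficient is
+    c_last = -c0*H0(0)/H4(0) = -c0/12.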
+ """ + m = len(coeffs) + Hs, _ = _hermite_4k_polys(m + 1) # include the (m)-th term for solving P(0)=0 + rc = [sp.Rational(c) for c in coeffs] + + partial = sum(rc[i] * Hs[i] for i in range(m)) + a = Hs[m].subs(x, 0) + b = -partial.subs(x, 0) + c_last = sp.Rational(b) / sp.Rational(a) + + P = partial + c_last * Hs[m] + + # Ensure positivity at +inf (all degrees are multiples of 4, so sign is well-defined) + if sp.limit(P, x, sp.oo) < 0: + P = -P + + return sp.simplify(P) + + +def _largest_positive_root_of_P_over_x2(P: sp.Expr) -> float: + # P is even and has P(0)=0; divide by x^2 and find the largest positive real root. + gq = sp.exquo(P, x**2) # exact division (should divide cleanly if multiplicity >= 2) + roots = sp.real_roots(gq, x) + if not roots: + raise ValueError("No real roots for P(x)/x^2.") + + # Validate sign change around each candidate + best = None + for r in roots: + r_approx = r.eval_rational(n=200) + eps = sp.Rational(1, 10**198) + left = gq.subs(x, r_approx - eps) + right = gq.subs(x, r_approx + eps) + if (left > 0 and right < 0) or (left < 0 and right > 0): + if best is None or r_approx > best: + best = r_approx + + if best is None: + raise ValueError("No root with a verified sign change for P(x)/x^2.") + return float(best) + + +def compute_c4_and_rmax(input_coeffs: np.ndarray): + P = _construct_P_with_forced_zero(input_coeffs) + # Quick sanity checks + assert P.subs(x, 0) == 0, "P(0) != 0 after forcing." + assert sp.limit(P, x, sp.oo) > 0, "Limit at +inf is not positive." + + rmax = _largest_positive_root_of_P_over_x2(P) + c4 = (rmax**2) / (2.0 * np.pi) + return c4, rmax + + +def verify_c4_solution_strict( + user_coeffs: np.ndarray, reported_c4: float, reported_rmax: float, atol=1e-9, rtol=1e-9 +): + c4, rmax = compute_c4_and_rmax(np.asarray(user_coeffs, dtype=float)) + + if not np.isclose(c4, reported_c4, rtol=rtol, atol=atol): + raise ValueError(f"C4 mismatch: reported {reported_c4:.12f}, recomputed {c4:.12f}") + + if not np.isclose(rmax, reported_rmax, rtol=rtol, atol=atol): + raise ValueError(f"r_max mismatch: reported {reported_rmax:.12f}, recomputed {rmax:.12f}") + + return c4, rmax + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + t0 = time.time() + coeffs, c4_bound, r_max = program.run() + t1 = time.time() + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + coeffs = np.asarray(coeffs, dtype=float) + + c4, rmax = verify_c4_solution_strict(coeffs, float(c4_bound), float(r_max)) + + return { + "c4_bound": float(c4), + "combined_score": float(BENCHMARK / c4), + "r_max": float(rmax), + "coeffs": coeffs.tolist(), + "eval_time": float(t1 - t0), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt b/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..48d8952e69a26a1327dd3fecc28f7b45d92bb41f --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt @@ -0,0 +1,4 @@ +numpy +sympy +scipy +tqdm \ No newline at end of file diff --git a/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py b/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/uncertainty_ineq/initial_program.py b/benchmarks/math/uncertainty_ineq/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..0da75dd3ed271fdd6c55306081cd3ad83dd7fc83 --- /dev/null +++ b/benchmarks/math/uncertainty_ineq/initial_program.py @@ -0,0 +1,211 @@ +# Disable progress bar for cleaner output logs +import os + +os.environ["TQDM_DISABLE"] = "1" + +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass +from scipy.special import hermite +import tqdm + + +@dataclass +class Hyperparameters: + learning_rate: float = 0.001 + num_steps: int = 100000 + num_restarts: int = 20 + num_hermite_coeffs: int = 4 # uses H0, H4, H8, H12 + + +class UncertaintyOptimizer: + """ + Finds coefficients for a generalized Hermite polynomial P(x) that minimize + the largest positive root, providing an upper bound for C4. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.degrees = [4 * k for k in range(hypers.num_hermite_coeffs)] + max_degree = self.degrees[-1] + hermite_polys = [hermite(d) for d in self.degrees] + + basis = [] + for poly in hermite_polys: + pad_amount = max_degree - poly.order + basis.append(jnp.array(np.pad(poly.coef, (pad_amount, 0)))) + self.hermite_basis = jnp.stack(basis) + + self.H_vals_at_zero = jnp.array([p(0) for p in hermite_polys]) + self.x_grid = jnp.linspace(0.0, 10.0, 3000) + self.optimizer = optax.adam(self.hypers.learning_rate) + + @staticmethod + def _objective_fn(params: jnp.ndarray, hermite_basis, H_vals_at_zero, x_grid): + """Penalize negative values of P(x) on [0, Xmax]; mild weighting to emphasize larger x.""" + c_others, log_c_last = params[:-1], params[-1] + c_last = jnp.exp(log_c_last) + + # Enforce P(0) = 0 + c0 = ( + -(jnp.sum(c_others * H_vals_at_zero[1:-1]) + c_last * H_vals_at_zero[-1]) + / H_vals_at_zero[0] + ) + hermite_coeffs = jnp.concatenate([jnp.array([c0]), c_others, jnp.array([c_last])]) + + poly_coeffs_std = jnp.sum(hermite_coeffs[:, None] * hermite_basis, axis=0) + p_values = jnp.polyval(poly_coeffs_std, x_grid) + + # Slightly increasing weight toward the right end of the interval + weights = 1.0 + (x_grid / (x_grid[-1] + 1e-12)) + loss = jnp.sum(weights * jax.nn.relu(-p_values)) + return loss + + @staticmethod + def train_step( + params: jnp.ndarray, + opt_state: optax.OptState, + optimizer, + hermite_basis, + H_vals_at_zero, + x_grid, + ): + loss, grads = jax.value_and_grad(UncertaintyOptimizer._objective_fn)( + params, hermite_basis, H_vals_at_zero, x_grid + ) + updates, opt_state = optimizer.update(grads, opt_state, params) + params = optax.apply_updates(params, updates) + return params, opt_state, loss + + +def run_single_trial(optimizer: UncertaintyOptimizer, key: jax.random.PRNGKey): + """Runs one full optimization from a near-good starting point with small noise.""" + num_params_to_opt = 
optimizer.hypers.num_hermite_coeffs - 1 # = 3 when using H0,H4,H8,H12 + assert num_params_to_opt == 3, "This initialization assumes num_hermite_coeffs == 4." + + base_c1 = -0.01158510802599293 + base_c2 = -8.921606035407065e-05 + base_log_c_last = np.log(1e-6) + + base = jnp.array([base_c1, base_c2, base_log_c_last], dtype=jnp.float32) + noise = jax.random.normal(key, (num_params_to_opt,)) * 1e-3 + params = base + noise + + opt_state = optimizer.optimizer.init(params) + jit_train_step = jax.jit(UncertaintyOptimizer.train_step, static_argnums=(2,)) + + for _ in range(optimizer.hypers.num_steps): + params, opt_state, _ = jit_train_step( + params, + opt_state, + optimizer.optimizer, + optimizer.hermite_basis, + optimizer.H_vals_at_zero, + optimizer.x_grid, + ) + return params + + +def _build_P_from_hermite_coeffs(hermite_coeffs: np.ndarray, degrees: list[int]) -> np.poly1d: + """Build monomial-basis polynomial P from Hermite-basis coefficients.""" + max_degree = degrees[-1] + hermite_polys = [hermite(d) for d in degrees] + + P_poly_coeffs = np.zeros(max_degree + 1) + for i, c in enumerate(hermite_coeffs): + poly = hermite_polys[i] + pad_amount = max_degree - poly.order + P_poly_coeffs[pad_amount:] += c * poly.coef + + if P_poly_coeffs[0] < 0: + P_poly_coeffs = -P_poly_coeffs + hermite_coeffs[:] = -hermite_coeffs + return np.poly1d(P_poly_coeffs) + + +def _c4_from_hermite_coeffs(hermite_coeffs: np.ndarray, num_hermite_coeffs: int): + """Compute r_max and C4 from full Hermite coefficient vector.""" + degrees = [4 * k for k in range(num_hermite_coeffs)] + P = _build_P_from_hermite_coeffs(hermite_coeffs.copy(), degrees) + + # Divide by x^2 + Q, R = np.polydiv(P, np.poly1d([1.0, 0.0, 0.0])) + if np.max(np.abs(R.c)) > 1e-10: + return None, None + + roots = Q.r + real_pos = roots[(np.isreal(roots)) & (roots.real > 0)].real + if real_pos.size == 0: + return None, None + + # Tiny sign-change check around candidates + r_candidates = np.sort(real_pos) + r_max = None + for r in r_candidates: + eps = 1e-10 * max(1.0, abs(r)) + left = np.polyval(Q, r - eps) + right = np.polyval(Q, r + eps) + if left * right < 0: + r_max = float(r) + if r_max is None: + r_max = float(r_candidates[-1]) + + c4 = (r_max**2) / (2 * np.pi) + return c4, r_max + + +def get_c4_from_params(params: np.ndarray, hypers: Hyperparameters): + """Calculates the precise C4 bound from a final set of parameters.""" + c_others, log_c_last = params[:-1], params[-1] + c_last = np.exp(log_c_last) + + degrees = [4 * k for k in range(hypers.num_hermite_coeffs)] + hermite_polys = [hermite(d) for d in degrees] + H_vals_at_zero = np.array([p(0) for p in hermite_polys]) + + # Enforce P(0) = 0 + c0 = ( + -(np.sum(c_others * H_vals_at_zero[1:-1]) + c_last * H_vals_at_zero[-1]) / H_vals_at_zero[0] + ) + hermite_coeffs = np.concatenate([[c0], np.array(c_others), [c_last]]) + + c4, rmax = _c4_from_hermite_coeffs(hermite_coeffs, hypers.num_hermite_coeffs) + if c4 is None: + return None, None, None + return hermite_coeffs, c4, rmax + + +def run(): + hypers = Hyperparameters() + optimizer = UncertaintyOptimizer(hypers) + main_key = jax.random.PRNGKey(42) + best_c4_bound = float("inf") + best_coeffs, best_r_max = None, None + + print(f"Running {hypers.num_restarts} trials to find the best C4 upper bound...") + for _ in tqdm.tqdm(range(hypers.num_restarts), desc="Searching", disable=True): + main_key, restart_key = jax.random.split(main_key) + final_params = run_single_trial(optimizer, restart_key) + + coeffs, c4_bound, r_max = 
get_c4_from_params(np.array(final_params), hypers) + + if c4_bound is not None and c4_bound < best_c4_bound: + best_c4_bound = c4_bound + best_coeffs = coeffs + best_r_max = r_max + + if best_coeffs is None: + raise RuntimeError("Failed to find a valid solution in any restart.") + + print("\nSearch complete.") + print(f"Best Hermite coeffs: {best_coeffs}") + print(f"Best largest positive root r_max: {best_r_max:.8f}") + print(f"Resulting best C4 upper bound: {best_c4_bound:.8f}") + + return best_coeffs, best_c4_bound, best_r_max + + +# EVOLVE-BLOCK-END diff --git a/skydiscover/context_builder/README.md b/skydiscover/context_builder/README.md new file mode 100644 index 0000000000000000000000000000000000000000..83df45c3529b16d64a8f49d9285927c5ed13e726 --- /dev/null +++ b/skydiscover/context_builder/README.md @@ -0,0 +1,104 @@ +# Context Builder + +The context builder turns program state into LLM prompts. It is called once per iteration and returns `{"system": ..., "user": ...}`. + +For most tasks you do not need to touch this. Just set the system prompt in `config.yaml`: + +```yaml +prompt: + system_message: |- + You are an expert at optimizing load balancing algorithms. +``` + +Only write a custom builder if your algorithm has search-state data (tree path, island ID, rejection history) that the LLM should see. + +--- + +## Structure + +``` +context_builder/ + base.py ContextBuilder ABC (one method: build_prompt) + utils.py TemplateManager (loads .txt templates from directories) + human_feedback.py File-based human feedback injection + default/ + builder.py DefaultContextBuilder (handles diff / rewrite / image / prompt modes) + templates/ .txt prompt templates + evox/ + builder.py EvoxContextBuilder (extends Default, adds LLM-generated summaries) + templates/ .txt templates (override default ones with the same filename) +``` + +Each builder owns its own `TemplateManager`. Later directories passed to `TemplateManager` override earlier ones on filename conflicts. + +--- + +## Default templates + +| File | Role | When used | +|------|------|-----------| +| `system_message.txt` | system | Default system prompt (overridden by config) | +| `diff_user_message.txt` | user | Diff-based generation (default mode) | +| `full_rewrite_user_message.txt` | user | Full rewrite mode | +| `full_rewrite_prompt_opt_user_message.txt` | user | Prompt optimization tasks | +| `image_user_message.txt` | user | Image generation mode | +| `evaluator_system_message.txt` | system | LLM judge (only with llm_as_judge) | +| `evaluator_user_message.txt` | user | LLM judge user message | + +--- + +## Writing a custom builder + +The most common pattern is extending `DefaultContextBuilder` and injecting extra guidance via the `{search_guidance}` placeholder. The default templates already include this slot; an empty string makes it disappear cleanly. 
+ +```python +from pathlib import Path +from skydiscover.context_builder.default import DefaultContextBuilder +from skydiscover.context_builder.utils import TemplateManager + +class MyContextBuilder(DefaultContextBuilder): + + def __init__(self, config): + super().__init__(config) + # load your templates on top of the defaults + default_templates = str(Path(__file__).parent.parent / "default" / "templates") + my_templates = str(Path(__file__).parent / "templates") + self.template_manager = TemplateManager(default_templates, my_templates) + + def build_prompt(self, current_program, context=None, **kwargs): + context = context or {} + # format whatever the manager put into the context dict + guidance = self._format_guidance(context.get("my_key")) + return super().build_prompt(current_program, context, search_guidance=guidance, **kwargs) + + def _format_guidance(self, data): + if not data: + return "" + return f"## CONTEXT\n{data}" +``` + +The manager populates `context["my_key"]` before calling `build_prompt()`, and sets the builder in its `__init__`: + +```python +self.context_builder = MyContextBuilder(self.config) +``` + +Example to copy: `adaevolve/builder.py` (adds evaluator feedback, paradigm guidance, and sibling context). + +--- + +## Registration + +To make a builder available via config instead of hardcoding it in a manager, add it to `_init_context_builder()` in `search/default_discovery_controller.py`: + +```python +elif template == "my_builder": + self.context_builder = MyContextBuilder(self.config) +``` + +Then activate with: + +```yaml +prompt: + template: my_builder +``` diff --git a/skydiscover/context_builder/__init__.py b/skydiscover/context_builder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb80b7b8d3c0f6bb97fd7ef712f20d6de45b0eb --- /dev/null +++ b/skydiscover/context_builder/__init__.py @@ -0,0 +1,15 @@ +"""Context builder module.""" + +from skydiscover.context_builder.base import ContextBuilder +from skydiscover.context_builder.default import DefaultContextBuilder +from skydiscover.context_builder.evox import EvoxContextBuilder +from skydiscover.context_builder.gepa_native import GEPANativeContextBuilder +from skydiscover.context_builder.human_feedback import HumanFeedbackReader + +__all__ = [ + "ContextBuilder", + "DefaultContextBuilder", + "EvoxContextBuilder", + "GEPANativeContextBuilder", + "HumanFeedbackReader", +] diff --git a/skydiscover/context_builder/default/__init__.py b/skydiscover/context_builder/default/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a799f05a564b7bfecd7316cc25deb57377a14229 --- /dev/null +++ b/skydiscover/context_builder/default/__init__.py @@ -0,0 +1,5 @@ +"""Default context builder module.""" + +from .builder import DefaultContextBuilder + +__all__ = ["DefaultContextBuilder"] diff --git a/skydiscover/context_builder/default/templates/diff_user_message.txt b/skydiscover/context_builder/default/templates/diff_user_message.txt new file mode 100644 index 0000000000000000000000000000000000000000..c83ee9a912a1d1bf0b2c3911c79203c9d299d9ee --- /dev/null +++ b/skydiscover/context_builder/default/templates/diff_user_message.txt @@ -0,0 +1,49 @@ +# Current Solution Information +- Main Metrics: +{metrics} +- Focus areas: {improvement_areas} + +# Program Generation History +## Previous Attempts + +{previous_attempts} + +{other_context_programs} + +{current_program} + +# Task +Suggest improvements to the program that will improve its COMBINED_SCORE. 
+The system maintains diversity across these dimensions: score, complexity. +Different solutions with similar combined_score but different features are valuable. + +You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: + +<<<<<<< SEARCH +# Original code to find and replace (must match exactly) +======= +# New replacement code +>>>>>>> REPLACE + +Example of valid diff format: +<<<<<<< SEARCH +for i in range(m): + for j in range(p): + for k in range(n): + C[i, j] += A[i, k] * B[k, j] +======= +# Reorder loops for better memory access pattern +for i in range(m): + for k in range(n): + for j in range(p): + C[i, j] += A[i, k] * B[k, j] +>>>>>>> REPLACE + +**CRITICAL**: You can suggest multiple changes. Each SEARCH section must EXACTLY match code in "# Current Solution" - copy it character-for-character, preserving all whitespace and indentation. Do NOT paraphrase or reformat. +Be thoughtful about your changes and explain your reasoning thoroughly. +Include a concise docstring at the start of functions describing the exact approach taken. + +IMPORTANT: If an instruction header of "## IMPORTANT: ..." is given below the "# Current Solution", you MUST follow it. Otherwise, +focus on targeted improvements of the program. + +{timeout_warning} \ No newline at end of file diff --git a/skydiscover/context_builder/default/templates/full_rewrite_user_message.txt b/skydiscover/context_builder/default/templates/full_rewrite_user_message.txt new file mode 100644 index 0000000000000000000000000000000000000000..38c6c275a25c6dddbd2307ddbe3e163844b090ce --- /dev/null +++ b/skydiscover/context_builder/default/templates/full_rewrite_user_message.txt @@ -0,0 +1,35 @@ +# Current Solution Information +- Main Metrics: +{metrics} +- Focus areas: {improvement_areas} + +# Program Generation History +## Previous Attempts + +{previous_attempts} + +{other_context_programs} + +{current_program} + +# Task +Suggest improvements to the program that will improve its COMBINED_SCORE. +The system maintains diversity across these dimensions: score, complexity. +Different solutions with similar combined_score but different features are valuable. + +Provide the complete new program solution. + +IMPORTANT: Make sure your rewritten program maintains the same inputs and outputs +as the original program, but with improved internal implementation. + +```{language} +# Your rewritten program here +``` + +**CRITICAL**: Be thoughtful about your changes and explain your reasoning thoroughly. +Include a concise docstring at the start of functions describing the exact approach taken. + +IMPORTANT: If an instruction header of "## IMPORTANT: ..." is given below the "# Current Solution", you MUST follow it. Otherwise, +focus on targeted improvements of the program. 
+ +{timeout_warning} \ No newline at end of file diff --git a/skydiscover/context_builder/evox/__init__.py b/skydiscover/context_builder/evox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f996ffef5dd8a1c34e3d846be9e6966162b4e370 --- /dev/null +++ b/skydiscover/context_builder/evox/__init__.py @@ -0,0 +1,5 @@ +"""Evox context builder: co-evolves search algorithms and solutions.""" + +from .builder import EvoxContextBuilder + +__all__ = ["EvoxContextBuilder"] diff --git a/skydiscover/context_builder/evox/formatters.py b/skydiscover/context_builder/evox/formatters.py new file mode 100644 index 0000000000000000000000000000000000000000..b13d5e5d9edebb9c36eb21b60fd137483fb78c31 --- /dev/null +++ b/skydiscover/context_builder/evox/formatters.py @@ -0,0 +1,626 @@ +""" +Pure formatting functions for evox prompt generation. +All functions are stateless — no class or LLM dependencies. +""" + +import logging +import os +from typing import Any, Dict, List, Optional, Union + +from skydiscover.context_builder.utils import format_artifacts, prog_attr +from skydiscover.search.base_database import Program + +logger = logging.getLogger(__name__) + + +def filter_db_stats_by_horizon(db_stats: Dict[str, Any], horizon: int) -> Dict[str, Any]: + """Filter db_stats to only include the last 'horizon' entries for trajectory fields.""" + if not db_stats or horizon <= 0: + return db_stats + + filtered = dict(db_stats) + if recent := db_stats.get("recent_solution_stats"): + filtered_recent = dict(recent) + for key in ["execution_trace", "score_trajectory", "parent_scores"]: + if (val := recent.get(key)) and len(val) > horizon: + filtered_recent[key] = val[-horizon:] + filtered_recent["num_recent_iterations"] = min( + horizon, recent.get("num_recent_iterations", 0) + ) + filtered["recent_solution_stats"] = filtered_recent + return filtered + + +def format_execution_trace(execution_trace: list, window_start_score: float = None) -> str: + """Format execution trace with program/parent/context tuples.""" + if not execution_trace: + return "" + + def fmt_id(pid): + return pid[:8] if pid and len(pid) > 8 else (pid or "None") + + def fmt_score(s): + return f"{s:.4f}" if s is not None else "N/A" + + def unpack_tuple(t): + if not t: + return None, None, None + if len(t) >= 3: + return t[0], t[1], t[2] + return None, t[0], t[1] + + def fmt_program_ref(t, prefix=""): + label, pid, score = unpack_tuple(t) + if pid is None: + return f"{prefix}=None (seed program)" if prefix else "None" + label_str = f'label="{label}", ' if label else "" + return ( + f"{prefix} ({label_str}id={fmt_id(pid)}, score={fmt_score(score)})" + if prefix + else f"({label_str}id={fmt_id(pid)}, score={fmt_score(score)})" + ) + + lines = [] + best = window_start_score + + for entry in execution_trace: + prog_tuple = entry.get("program") + if prog_tuple is None: + continue + + _, _, prog_score = unpack_tuple(prog_tuple) + _, _, parent_score = unpack_tuple(entry.get("parent")) + + parent_str = fmt_program_ref(entry.get("parent"), "Parent") + ctx = entry.get("context") or [] + context_str = f"Context=[{', '.join(fmt_program_ref(c) for c in ctx)}]" + + if prog_score is not None: + prog_score, parent_score = round(prog_score, 4), ( + round(parent_score, 4) if parent_score is not None else None + ) + if best is None: + best, outcome = prog_score, "first program" + elif prog_score > best: + outcome, best = f"⭐ NEW BEST (was {best:.4f})", prog_score + elif parent_score is not None and prog_score > parent_score: + outcome = f"above 
parent, best still {best:.4f}" + elif parent_score is not None and prog_score < parent_score: + outcome = f"regression, best still {best:.4f}" + else: + outcome = f"no change, best still {best:.4f}" + else: + outcome = "N/A" + + lines.extend( + [ + f"Iter {entry.get('iteration', '?')}: {parent_str}, {context_str}", + f" -> Generated child score={fmt_score(prog_score)} ({outcome})", + "", + ] + ) + + return "\n".join(lines[:-1]) if lines else "" + + +def format_db_stats_diff( + start_stats: Dict[str, Any], end_stats: Dict[str, Any], horizon: Optional[int] = None +) -> str: + """Format start -> end db_stats comparison for a search algorithm's window.""" + if not start_stats or not end_stats: + return "" + + lines = ["Population Statistics Change (Start -> End of Search Window):"] + + start_pop = start_stats.get("population_size", "?") + end_pop = end_stats.get("population_size", "?") + lines.append(f"- population_size: {start_pop} -> {end_pop}") + + start_summary = start_stats.get("solution_score_summary", {}) + end_summary = end_stats.get("solution_score_summary", {}) + if start_summary and end_summary: + parts = [] + key_names = [ + ("best", "current_best"), + ("q75", "75th_pct"), + ("q50", "50th_pct (median)"), + ("q25", "25th_pct"), + ("worst", "worst"), + ] + for key, display_name in key_names: + s = start_summary.get(key) + e = end_summary.get(key) + if s is not None and e is not None: + diff = e - s + sign = "+" if diff >= 0 else "" + parts.append(f"{display_name}: {s:.4f} -> {e:.4f} ({sign}{diff:.4f})") + if parts: + lines.append(f"- {', '.join(parts)}") + + start_top = start_stats.get("top_solution_scores", []) + end_top = end_stats.get("top_solution_scores", []) + if start_top and end_top: + k = min(len(start_top), len(end_top)) + start_fmt = [f"{s:.4f}" for s in start_top[:k]] + end_fmt = [f"{s:.4f}" for s in end_top[:k]] + lines.append(f"- top_{k}_solution_scores: {start_fmt} -> {end_fmt}") + + start_avg = start_stats.get("avg_solutions_per_parent") + end_avg = end_stats.get("avg_solutions_per_parent") + if start_avg is not None and end_avg is not None and start_pop and end_pop: + start_pct = (start_avg / start_pop * 100) if start_pop != "?" else 0 + end_pct = (end_avg / end_pop * 100) if end_pop != "?" 
else 0 + lines.append( + f"- % of solutions share the same parent on average: {start_pct:.1f}% -> {end_pct:.1f}%" + ) + + sota = end_stats.get("SOTA_score") + if sota is not None and start_summary and end_summary: + start_best = start_summary.get("best") + end_best = end_summary.get("best") + if start_best is not None and end_best is not None: + start_gap = sota - start_best + end_gap = sota - end_best + gap_diff = end_gap - start_gap + sign = "+" if gap_diff >= 0 else "" + lines.append( + f"- gap_to_SOTA (lower is better): {start_gap:.4f} -> {end_gap:.4f} ({sign}{gap_diff:.4f})" + ) + + start_tiers = start_summary.get("score_tiers") if start_summary else None + end_tiers = end_summary.get("score_tiers") if end_summary else None + if start_tiers and end_tiers: + tier_parts = [] + for tier_name in end_tiers.keys(): + start_data = start_tiers.get(tier_name, {}) + end_data = end_tiers.get(tier_name, {}) + start_pct = start_data.get("pct_programs", 0) + end_pct = end_data.get("pct_programs", 0) + start_threshold = start_data.get("threshold", "") + end_threshold = end_data.get("threshold", "") + diff = end_pct - start_pct + sign = "+" if diff >= 0 else "" + tier_parts.append( + f"\n {tier_name}: [{start_threshold}] {start_pct:.0f}% -> [{end_threshold}] {end_pct:.0f}% ({sign}{diff:.0f}%)" + ) + lines.append(f"- programs_by_score_tier:{','.join(tier_parts)}") + + end_recent = end_stats.get("recent_solution_stats", {}) + if end_recent: + iters_no_improve = end_recent.get("iterations_without_improvement") + threshold = end_recent.get("improvement_threshold", 0.0) + if iters_no_improve is not None: + if threshold > 0: + lines.append( + f"- iterations_without_improvement (improvement <= {threshold:.4f}): {iters_no_improve}" + ) + else: + lines.append(f"- iterations_without_improvement: {iters_no_improve}") + + execution_trace = end_recent.get("execution_trace") + if execution_trace: + if horizon: + execution_trace = execution_trace[-horizon:] + + first_iter = execution_trace[0].get("iteration", "?") + last_iter = execution_trace[-1].get("iteration", "?") + lines.append(f"\n### Execution Trace (iterations {first_iter}-{last_iter})") + window_start_score = start_summary.get("best") if start_summary else None + lines.append( + format_execution_trace(execution_trace, window_start_score=window_start_score) + ) + else: + + def fmt_scores(scores): + return [f"{s:.4f}" if s is not None else "N/A" for s in scores] + + if score_trajectory := end_recent.get("score_trajectory"): + lines.append( + f"- recent_score_trajectory (last {len(score_trajectory)}): {fmt_scores(score_trajectory)}" + ) + if parent_scores := end_recent.get("parent_scores"): + lines.append(f"- recent_parent_scores: {fmt_scores(parent_scores)}") + + return "\n".join(lines) + + +def format_population_state(db_stats: Dict[str, Any]) -> str: + """Format the population state from db_stats into clean, actionable lines.""" + if not db_stats: + return "" + + def fmt_scores(scores): + return [f"{s:.4f}" if s is not None else "N/A" for s in scores] + + lines = [] + pop_size = db_stats.get("population_size") + lines.append(f"- population_size: {pop_size}") + + score_summary = db_stats.get("solution_score_summary") or {} + sota = db_stats.get("SOTA_score") + best = score_summary.get("best") + q75, q50, q25 = ( + score_summary.get("q75"), + score_summary.get("q50") or score_summary.get("median"), + score_summary.get("q25"), + ) + worst = score_summary.get("worst") + + if best is not None: + pct = lambda v: (v / best * 100) if best > 0 and v is not None 
else 0 + + dist_parts = [f"current_best={best:.4f}"] + for name, val in [("75th_pct", q75), ("50th_pct", q50), ("25th_pct", q25)]: + if val is not None: + dist_parts.append(f"{name}={val:.4f} ({pct(val):.0f}%)") + if worst is not None: + dist_parts.append(f"worst={worst:.4f}") + + lines.append(f"- score_distribution: {', '.join(dist_parts)}") + if sota is not None: + lines.append(f"- gap_to_SOTA: SOTA={sota:.4f}, gap={sota - best:.4f}") + + if tiers := score_summary.get("score_tiers"): + tier_parts = [ + f"{n} ({d.get('threshold','')}): {d.get('pct_programs',0):.0f}%" + for n, d in tiers.items() + ] + lines.append(f"- programs_by_score_tier: {', '.join(tier_parts)}") + + if (unique := score_summary.get("unique_scores")) is not None: + lines.append(f"- unique_score_values: {unique}") + + if (avg := db_stats.get("avg_solutions_per_parent")) is not None and pop_size: + lines.append(f"- {avg / pop_size * 100:.1f}% of solutions share the same parent on average") + + if top_scores := db_stats.get("top_solution_scores"): + best_score = top_scores[0] + best_count = ( + sum( + 1 + for s in top_scores + if isinstance(s, (int, float)) and round(s, 4) == round(best_score, 4) + ) + if isinstance(best_score, (int, float)) + else 0 + ) + lines.append(f"- top_{len(top_scores)}_scores: {fmt_scores(top_scores)}") + if best_count > 1: + lines.append(f" - Top score ({best_score:.4f}) repeated {best_count}x") + if best_count == len(top_scores): + lines.append(f" (⚠️ ALL {best_count} identical)") + + if recent := db_stats.get("recent_solution_stats"): + if (iters := recent.get("iterations_without_improvement")) and iters > 0: + thresh = recent.get("improvement_threshold", 0.0) + thresh_str = f" by more than {thresh:.4f}" if thresh > 0 else "" + lines.append(f"- No improvement{thresh_str} for {iters} iterations") + + def score_bucket(score): + if score is None or best is None: + return None + if score >= best: + return "at best" + if q75 and score >= q75: + return "75-100th" + if q50 and score >= q50: + return "50-75th" + if q25 and score >= q25: + return "25-50th" + return "0-25th" + + for key, label in [("most_reused_parent", "parent"), ("most_reused_context", "context")]: + if (ratio := recent.get(f"{key}_ratio")) and ratio > 0: + bucket = score_bucket(recent.get(f"{key}_score")) + score_str = f", score {bucket}" if bucket else "" + lines.append(f"- {label}: {ratio*100:.0f}% reuse rate{score_str}") + + if traj := recent.get("score_trajectory"): + lines.append(f"- recent_scores (last {len(traj)}): {fmt_scores(traj)}") + if parent := recent.get("parent_scores"): + lines.append(f"- recent_parent_scores: {fmt_scores(parent)}") + + return "\n".join(lines) + + +def format_current_program( + current_program: Union[Program, Dict[str, Program]], + language: str, + improvement_areas: Optional[str] = None, +) -> str: + """Format current program with metrics and solution.""" + if not current_program: + return "" + + if isinstance(current_program, dict) and current_program: + label = list(current_program.keys())[0] or "Current Search Program" + program = list(current_program.values())[0] + else: + label = "Current Search Program" + program = current_program + solution = prog_attr(program, "solution") + metrics = prog_attr(program, "metrics", {}) + + window_start = int(metrics.get("window_start_iteration", 0)) + horizon = int(metrics.get("search_window_horizon") or 0) + window_end = window_start + horizon + start_score = metrics.get("search_window_start_score", 0.0) + end_score = metrics.get("search_window_end_score", 0.0) 
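+    # The window start/end scores record the downstream solution's
+    # combined_score before and after this search algorithm's window.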
+ combined_score = metrics.get("combined_score", 0.0) + improvement = end_score - start_score + + lines = [f"## {label}\n", "### Metrics"] + if improvement_areas: + lines.append(f"Focus areas:\n{improvement_areas}\n") + lines.append(f"Search Algorithm Score = {combined_score:.4f}") + lines.append( + f"This search algorithm ran from iteration {window_start} to {window_end} ({horizon} iterations)" + ) + lines.append( + f"This search algorithm changed the downstream solution combined_score by: {start_score:.4f} -> {end_score:.4f} (+{improvement:.4f})" + ) + lines.append(f"\n### Solution\n```{language}") + lines.append(solution) + lines.append("```\n") + + artifact_section = format_artifacts(program, heading="###") + if artifact_section: + lines.append(artifact_section) + + return "\n".join(lines) + + +def identify_search_improvement_areas( + current_program: Program, + metrics: Dict[str, float], + previous_programs: List[Program], + simplification_threshold: Optional[int] = None, +) -> str: + """Identify improvement areas for search algorithms based on combined_score.""" + + def safe_float(val): + if val is None: + return 0.0 + try: + return float(val) + except (ValueError, TypeError): + return 0.0 + + improvement_areas = [] + current_score = safe_float(metrics.get("combined_score")) + + if previous_programs: + prev_program = previous_programs[-1] + prev_metrics = prog_attr(prev_program, "metrics", {}) or {} + prev_score = safe_float(prev_metrics.get("combined_score")) + + if current_score > prev_score: + improvement_areas.append( + f"Search algorithm score improved: {prev_score:.4f} → {current_score:.4f}" + ) + elif current_score < prev_score: + improvement_areas.append( + f"Search algorithm score declined: {prev_score:.4f} → {current_score:.4f}. Consider revising." + ) + else: + improvement_areas.append(f"Search algorithm score unchanged at {current_score:.4f}") + + if not improvement_areas: + improvement_areas.append("Focus on improving the search algorithm score (combined_score)") + + if simplification_threshold: + code_length = len(prog_attr(current_program, "solution")) + if code_length > simplification_threshold: + improvement_areas.append( + f"Consider simplifying - solution length exceeds {simplification_threshold} characters" + ) + + return "\n".join(f"- {area}" for area in improvement_areas) + + +def format_search_window_context(context: Dict[str, Any]) -> str: + """Format the current search window context from context['search_stats'].""" + stats = context.get("search_stats") or {} + window_start = int(stats.get("window_start_iteration") or 0) + total = int(stats.get("total_iterations") or 100) + horizon = int(stats.get("search_window_horizon", 0)) + improvement_threshold = float(stats.get("improvement_threshold") or 0.0) + + lines = [] + + window_line = f"- Your newly designed search algorithm will start at iteration {window_start} out of {total}." + if horizon > 0: + if improvement_threshold > 0: + window_line += f" It will run for at least {horizon} iterations (potentially more if improving), but will be cut to just {horizon} iterations if it fails to improve the solution score by more than {improvement_threshold:.4f}." + else: + window_line += f" It will run for at least {horizon} iterations (potentially more if improving), but will be cut to just {horizon} iterations if it fails to improve the solution score." 
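+    # The assembled window description leads the guidance; the replacement
+    # policy, goal, and population note are appended below.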
+ lines.append(window_line) + + if improvement_threshold > 0: + lines.append( + f"- If your algorithm fails to improve the solution score by more than {improvement_threshold:.4f} during this window, it will be replaced." + ) + else: + lines.append( + "- If your algorithm fails to improve the solution score during this window, it will be replaced." + ) + + lines.append( + "- Goal: Design a better search strategy (e.g. how to select and manage solution programs) to improve the downstream solution score." + ) + lines.append( + "- NOTE: Exactly one program is generated per iteration. Keep the population size in mind when designing your search algorithm." + ) + + return "\n".join(lines) + + +def format_problem_description(problem_config: Any) -> str: + """Format the problem description from the prompt config.""" + if problem_config is None: + return "(No problem description provided)" + if isinstance(problem_config, str): + return problem_config + if hasattr(problem_config, "system_message") and problem_config.system_message: + return str(problem_config.system_message) + return str(problem_config) if problem_config else "(No problem description provided)" + + +def format_evaluator_context(evaluator_path: Any) -> str: + """Format the evaluator context by reading the evaluator file.""" + if evaluator_path is None: + return "(No evaluator context provided)" + + if isinstance(evaluator_path, str): + if not evaluator_path.endswith(".py"): + if evaluator_path.strip().startswith("```"): + return evaluator_path + return f"```python\n{evaluator_path}\n```" + try: + if os.path.isfile(evaluator_path): + with open(evaluator_path, "r") as f: + return f"```python\n{f.read()}\n```" + except Exception as e: + logger.warning(f"Failed to read evaluator file {evaluator_path}: {e}") + + return f"Evaluator file: {evaluator_path}" + + +def prepare_search_algorithms_data( + other_context_programs: Union[List[Program], Dict[str, List[Program]]], + format_stats_diff=format_db_stats_diff, + filter_by_horizon=filter_db_stats_by_horizon, +) -> List[Dict[str, Any]]: + """Prepare data for batch summarization of context programs.""" + if not other_context_programs: + return [] + + if isinstance(other_context_programs, dict): + flat_programs = [] + for programs in other_context_programs.values(): + if programs: + flat_programs.extend(programs) + programs_list = flat_programs + else: + programs_list = other_context_programs + + all_programs_data = [] + + for idx, program in enumerate(programs_list, start=1): + solution = prog_attr(program, "solution") + metrics = prog_attr(program, "metrics", {}) + metadata = prog_attr(program, "metadata", {}) + + start_db_stats = metadata.get("start_db_stats") + end_db_stats = metadata.get("end_db_stats") + horizon = int(metrics.get("search_window_horizon", 0)) + + if start_db_stats and end_db_stats: + start_db_stats = filter_by_horizon(start_db_stats, horizon) + end_db_stats = filter_by_horizon(end_db_stats, horizon) + + if start_db_stats and end_db_stats: + db_stats_text = format_stats_diff(start_db_stats, end_db_stats, horizon=horizon) + all_programs_data.append( + { + "program_num": idx, + "solution": solution, + "db_stats_text": db_stats_text, + "combined_score": metrics.get("combined_score", 0.0), + "improvement": metrics.get("search_window_end_score", 0.0) + - metrics.get("search_window_start_score", 0.0), + } + ) + + return all_programs_data + + +def format_single_program_section( + program: Program, idx: int, language: str, summaries_by_num: Dict[int, str] +) -> List[str]: + 
"""Format a single program with metrics and solution/summary.""" + solution = prog_attr(program, "solution") + metrics = prog_attr(program, "metrics", {}) + + window_start = int(metrics.get("window_start_iteration", 0)) + horizon = int(metrics.get("search_window_horizon", 0)) + start_score = metrics.get("search_window_start_score", 0.0) + end_score = metrics.get("search_window_end_score", 0.0) + combined_score = metrics.get("combined_score", 0.0) + + lines = [ + f"### Program {idx}\n", + "#### Metrics", + f"Search Algorithm Score = {combined_score:.4f}", + f"Ran iterations {window_start} to {window_start + horizon} ({horizon} iterations)", + f"Score changed: {start_score:.4f} -> {end_score:.4f} (+{end_score - start_score:.4f})", + ] + + if idx in summaries_by_num: + lines.append(f"\n#### Summary\n{summaries_by_num[idx]}\n") + else: + lines.extend(["\n#### Solution", f"```{language}", solution, "```\n"]) + + artifact_section = format_artifacts(program, heading="####") + if artifact_section: + lines.append(artifact_section) + + return lines + + +def format_search_algorithms( + other_context_programs: Union[List[Program], Dict[str, List[Program]]], + language: str, + summaries_by_num: Optional[Dict[int, str]] = None, +) -> str: + """Format previous search algorithms with window context.""" + if not other_context_programs: + return "" + + summaries_by_num = summaries_by_num or {} + lines = [] + + if isinstance(other_context_programs, dict): + global_idx = 0 + for label, programs in other_context_programs.items(): + display_label = label or "Other Reference Programs" + lines.extend( + [f"\n## {display_label}\n", "Diverse search programs that may inspire new ideas:\n"] + ) + for program in programs or []: + global_idx += 1 + lines.extend( + format_single_program_section(program, global_idx, language, summaries_by_num) + ) + else: + lines.append("## Other Reference Programs\n") + for idx, program in enumerate(other_context_programs, start=1): + lines.extend(format_single_program_section(program, idx, language, summaries_by_num)) + + return "\n".join(lines) + + +def parse_batch_summaries(response: str, programs_data: List[Dict]) -> Dict[int, str]: + """Parse batch summary response into individual summaries by program number.""" + summaries = {} + if not response or not programs_data: + return summaries + + for prog in programs_data: + num = prog["program_num"] + marker = f"[PROGRAM {num}]" + if marker in response: + start_idx = response.find(marker) + len(marker) + next_idx = len(response) + for other in programs_data: + if other["program_num"] != num: + other_marker = f"[PROGRAM {other['program_num']}]" + if other_marker in response: + idx = response.find(other_marker) + if start_idx < idx < next_idx: + next_idx = idx + summaries[num] = response[start_idx:next_idx].strip() + + if not summaries and response: + summaries[programs_data[0]["program_num"]] = response + return summaries diff --git a/skydiscover/context_builder/evox/templates/problem_context_summary_system_message.txt b/skydiscover/context_builder/evox/templates/problem_context_summary_system_message.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb603ae1b4c35b9267291337cb8d654f522a217 --- /dev/null +++ b/skydiscover/context_builder/evox/templates/problem_context_summary_system_message.txt @@ -0,0 +1,18 @@ +Summarize the downstream problem context for a search algorithm designer. + +Provide a concise summary (under 100 words) covering: +- [Read ## Problem Description] What problem is being solved. 
Be accurate and detailed; describe the problem in a way that is easy to understand at a high level. +- [Read ## Problem Evaluator] How solutions are evaluated/scored + +INSTRUCTIONS FOR SCORING FORMULA: +1. Find the FINAL return statement in the evaluate() function - this is the ONLY source of truth for metric names. +2. Trace how `combined_score` is calculated using ONLY the metric names that appear as KEYS in that final return dict. +3. DO NOT expose internal/intermediate variable names (e.g., internal variable names within the function) that don't appear in the return statement. +4. In your answer, highlight the metric names in the return statement by `` + +OUTPUT FORMAT: +**Task:** + +**Scoring:** combined_score = + +**Goal:** Maximize `combined_score` \ No newline at end of file diff --git a/skydiscover/context_builder/evox/templates/problem_template.txt b/skydiscover/context_builder/evox/templates/problem_template.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f841919f981cf23d81c39890d1eb1022dd0757b --- /dev/null +++ b/skydiscover/context_builder/evox/templates/problem_template.txt @@ -0,0 +1,5 @@ +## Problem Description +{problem_description} + +## Problem Evaluator +{evaluator_context} \ No newline at end of file diff --git a/skydiscover/context_builder/evox/templates/stats_insight_system_message.txt b/skydiscover/context_builder/evox/templates/stats_insight_system_message.txt new file mode 100644 index 0000000000000000000000000000000000000000..dacc138d5f00c32ad4b202b4620922625ba7a099 --- /dev/null +++ b/skydiscover/context_builder/evox/templates/stats_insight_system_message.txt @@ -0,0 +1,26 @@ +Summarize the population state with NUMBER-BACKED observations. + +OUTPUT FORMAT: + +📊 **State:** [One-sentence description based on numbers] + +**Key Numbers:** [3-4 bullet points] +• Report key metrics from the stats (score range, spread, trajectory) + +**Patterns Observed:** [2-3 bullet points] +• Describe factual patterns in the data (gaps, trends, anything) +• State what the numbers show, not what they mean +• Show a few numbers in the text; do not repeat the full list of numbers. + +Example: +• Parent selection: what parents were typically chosen recently? +• Context selection: what context programs were used recently? +• Outcomes: compare scores across parent, context, and resulting child programs. +• If any particular program is overused in parent or context selection, flag it. +• Label usage: when self.DIVERGE_LABEL / self.REFINE_LABEL was used (if any). + +RULES: +- Every statement MUST cite a specific number FROM THE STATS PROVIDED +- Report observations only—no recommendations or interpretations +- NO made-up numbers +- NO summary paragraph at the end \ No newline at end of file diff --git a/skydiscover/context_builder/gepa_native/templates/diff_user_message.txt b/skydiscover/context_builder/gepa_native/templates/diff_user_message.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef3f57b4f8b73842180af3adbd95dcd802a845dc --- /dev/null +++ b/skydiscover/context_builder/gepa_native/templates/diff_user_message.txt @@ -0,0 +1,52 @@ +# Current Solution Information +- Main Metrics: +{metrics} +- Focus areas: {improvement_areas} + +# Program Generation History +## Previous Attempts + +{previous_attempts} + +{other_context_programs} + +# Current Solution +{current_program} + +{search_guidance} + +# Task +Suggest improvements to the program that will improve its COMBINED_SCORE. 
+The system maintains diversity across these dimensions: score, complexity. +Different solutions with similar combined_score but different features are valuable. + +You MUST use the exact SEARCH/REPLACE diff format shown below to indicate changes: + +<<<<<<< SEARCH +# Original code to find and replace (must match exactly) +======= +# New replacement code +>>>>>>> REPLACE + +Example of valid diff format: +<<<<<<< SEARCH +for i in range(m): + for j in range(p): + for k in range(n): + C[i, j] += A[i, k] * B[k, j] +======= +# Reorder loops for better memory access pattern +for i in range(m): + for k in range(n): + for j in range(p): + C[i, j] += A[i, k] * B[k, j] +>>>>>>> REPLACE + +**CRITICAL**: You can suggest multiple changes. Each SEARCH section must EXACTLY match code in "# Current Solution" - copy it character-for-character, preserving all whitespace and indentation. Do NOT paraphrase or reformat. +Be thoughtful about your changes and explain your reasoning thoroughly. +Include a concise docstring at the start of functions describing the exact approach taken. + +IMPORTANT: If an instruction header of "## IMPORTANT: ..." is given below the "# Current Solution", you MUST follow it. Otherwise, +focus on targeted improvements of the program. + +{timeout_warning} \ No newline at end of file diff --git a/skydiscover/context_builder/human_feedback.py b/skydiscover/context_builder/human_feedback.py new file mode 100644 index 0000000000000000000000000000000000000000..1dc5fde8a5e885472437bb94ef3df2fc3a35de8a --- /dev/null +++ b/skydiscover/context_builder/human_feedback.py @@ -0,0 +1,161 @@ +""" +File-based human feedback reader for human feedback during discovery process. + +The human edits a markdown file via the dashboard or any text editor. +The discovery loop reads it each iteration -- if it has content, +that content is appended to (or replaces) the LLM system message. +""" + +import logging +import os +import time as _time + +logger = logging.getLogger(__name__) + +_INITIAL_TEMPLATE = """\ +# Human Feedback for SkyDiscover +# Edit this file to guide the discovery process. +# Your text will be APPENDED to the LLM system message at the next iteration. +# Toggle between Append and Replace mode in the dashboard. +# Clear this file (or delete all non-comment lines) to revert to the default. +# Lines starting with # are ignored. +# +# Examples: +# Focus on hexagonal packing and computational geometry approaches. +# Use numpy vectorization, avoid loops. Prioritize cache-friendly access patterns. +""" + +MAX_FEEDBACK_CHARS = 4000 + + +class HumanFeedbackReader: + """ + Reads human feedback from a markdown file on disk. + + The dashboard writes via write_from_dashboard(); the discovery loop + reads via read(). External editors can also modify the file directly. 
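+    Feedback longer than MAX_FEEDBACK_CHARS is truncated when read.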
+ + Supports two modes: + - "append" (default): feedback is appended to the system message + - "replace": feedback replaces the system message entirely + """ + + def __init__(self, feedback_file_path: str, mode: str = "append"): + self.path = os.path.abspath(feedback_file_path) + self.mode = mode if mode in ("append", "replace") else "append" + self._last_content: str = "" + self._current_system_prompt: str = "" + self._history: list = [] + self._create_initial_file() + + def _create_initial_file(self) -> None: + """Create the feedback file with instructions if it doesn't exist.""" + if not os.path.exists(self.path): + os.makedirs(os.path.dirname(self.path), exist_ok=True) + with open(self.path, "w") as f: + f.write(_INITIAL_TEMPLATE) + logger.info(f"Created human feedback file: {self.path}") + + def read(self) -> str: + """ + Read current feedback, stripping comment lines. + Returns empty string if file is empty, missing, or only has comments. + """ + try: + with open(self.path, "r") as f: + raw = f.read() + except (FileNotFoundError, PermissionError): + return "" + + lines = [] + for line in raw.splitlines(): + stripped = line.strip() + if stripped and not stripped.startswith("#"): + lines.append(line) + + content = "\n".join(lines).strip() + if len(content) > MAX_FEEDBACK_CHARS: + content = content[:MAX_FEEDBACK_CHARS] + + if content != self._last_content: + if content: + logger.info(f"Human feedback updated ({len(content)} chars)") + elif self._last_content: + logger.info("Human feedback cleared") + self._last_content = content + + return content + + def write_from_dashboard(self, text: str) -> None: + """ + Write feedback from the dashboard UI. + Pass empty string to clear feedback. + """ + self._write_feedback(text) + + def set_mode(self, mode: str) -> None: + """Set feedback mode: 'append' or 'replace'.""" + if mode not in ("append", "replace"): + logger.warning(f"Invalid human feedback mode '{mode}', ignoring") + return + self.mode = mode + logger.info(f"Human feedback mode set to: {mode}") + + def apply_feedback(self, prompt: dict) -> dict: + """Apply current feedback to a prompt dict. + + In append mode, feedback is added after the system message. + In replace mode, feedback replaces the system message entirely. + Returns the modified prompt. 
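+        In replace mode the original system message is discarded for that call.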
+ """ + feedback = self.read() + if not feedback: + return prompt + + if self.mode == "replace": + prompt["system"] = feedback + else: + prompt["system"] = prompt["system"] + "\n\n## Human Guidance\n" + feedback + return prompt + + def set_current_prompt(self, system_prompt: str) -> None: + """Store the current system prompt for dashboard visibility.""" + self._current_system_prompt = system_prompt + + def get_current_prompt(self) -> str: + """Return the current system prompt.""" + return self._current_system_prompt + + def log_usage(self, iteration: int, feedback_text: str, mode: str) -> None: + """Record that feedback was applied at a given iteration.""" + entry = { + "iteration": iteration, + "timestamp": _time.time(), + "text": feedback_text, + "mode": mode, + } + self._history.append(entry) + logger.info( + f"Human feedback logged: iteration={iteration}, mode={mode}, " + f"chars={len(feedback_text)}" + ) + + def get_history(self) -> list: + """Return the full feedback usage history.""" + return list(self._history) + + def to_serializable(self) -> dict: + """Return current state for pickling to Island workers.""" + return { + "feedback_text": self._last_content, + "mode": self.mode, + "current_prompt": self._current_system_prompt, + } + + def _write_feedback(self, text: str) -> None: + """Write feedback text to the file, preserving the comment header.""" + with open(self.path, "w") as f: + if text: + f.write(_INITIAL_TEMPLATE + "\n" + text + "\n") + else: + f.write(_INITIAL_TEMPLATE) diff --git a/skydiscover/context_builder/utils.py b/skydiscover/context_builder/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3b175bca715538323ffcd5fe83d44bcc2beab883 --- /dev/null +++ b/skydiscover/context_builder/utils.py @@ -0,0 +1,65 @@ +"""Shared utilities for context builders.""" + +from pathlib import Path +from typing import Any, Optional + + +class TemplateManager: + """Loads .txt templates from one or more directories. + + Directories are processed in order; later directories override + templates with the same name from earlier ones. + """ + + def __init__(self, *directories: Optional[str]): + """ + Initializes the TemplateManager with the given directories. + If there are multiple directories, the templates from the later directories will override + the templates from the earlier directories. + """ + self.templates: dict[str, str] = {} + for d in directories: + if d: + path = Path(d) + if path.exists(): + self._load_from_directory(path) + + def _load_from_directory(self, directory: Path) -> None: + for txt_file in directory.glob("*.txt"): + with open(txt_file, "r") as f: + self.templates[txt_file.stem] = f.read() + + def get_template(self, name: str) -> str: + if name not in self.templates: + raise ValueError(f"Template '{name}' not found") + return self.templates[name] + + +def prog_attr(program: Any, key: str, default: Any = "") -> Any: + """Read an attribute from a Program object or a plain dict.""" + if hasattr(program, key): + return getattr(program, key) + if isinstance(program, dict): + return program.get(key, default) + return default + + +def format_artifacts(program: Any, heading: str = "##", max_len: int = 2000) -> str: + """Format evaluator artifacts (e.g. feedback) into markdown sections.""" + artifacts = prog_attr(program, "artifacts", None) + if not artifacts: + return "" + sections = [] + for key, value in artifacts.items(): + if value is None: + continue + text = str(value) + if len(text) > max_len: + text = text[:max_len] + "\n... 
(truncated)" + if key == "feedback": + sections.append(f"{heading} Evaluator Feedback\n{text}") + else: + sections.append(f"{heading} {key}\n{text}") + if not sections: + return "" + return "\n" + "\n\n".join(sections) + "\n" diff --git a/skydiscover/llm/agentic_generator.py b/skydiscover/llm/agentic_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..2f7468d47f31e713f6193b0d0105482fa5bfd8dc --- /dev/null +++ b/skydiscover/llm/agentic_generator.py @@ -0,0 +1,513 @@ +"""Agentic code generator -- multi-turn tool-calling loop with read_file and search.""" + +import asyncio +import concurrent.futures +import fnmatch +import json +import logging +import os +import re +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from skydiscover.llm.openai import is_openai_reasoning_model +from skydiscover.llm.responses_utils import ( + convert_messages_to_responses_input, + extract_responses_output, +) +from skydiscover.utils.code_utils import build_repo_map + +logger = logging.getLogger(__name__) + +_TOOL_SCHEMAS_PATH = Path(__file__).parent / "tool_schemas" / "agentic_tools.json" +with open(_TOOL_SCHEMAS_PATH, "r") as _f: + TOOL_SCHEMAS = json.load(_f) + +# Responses API uses a flattened tool format (name/description/parameters at top level) +TOOL_SCHEMAS_RESPONSES = [ + { + "type": "function", + "name": t["function"]["name"], + "description": t["function"]["description"], + "parameters": t["function"]["parameters"], + } + for t in TOOL_SCHEMAS +] + +_AGENTIC_PROMPT_PATH = ( + Path(__file__).parent.parent + / "context_builder" + / "default" + / "templates" + / "agentic_system_message.txt" +) +with open(_AGENTIC_PROMPT_PATH, "r") as _f: + _AGENTIC_SYSTEM_PROMPT = _f.read() + + +class AgenticGenerator: + """ + V0 [simple version]: Multi-turn tool-calling agent that explores a codebase before generating code. + + Tools: read_file, search. When it stops calling tools, its text output + is the final answer. Returns None if no output is produced (caller falls + back to direct generation). + """ + + def __init__(self, llm_pool, config): + self.llm_pool = llm_pool + self.config = config + + async def generate(self, system_message: str, user_message: str) -> Optional[str]: + """Run the agent loop. Returns generated text, or None on failure.""" + cfg = self.config + files_read: set = set() + conversation: List[Dict[str, Any]] = [] + t0 = time.time() + + sys_prompt = f"{system_message}\n\n{_AGENTIC_SYSTEM_PROMPT}" + repo_map = build_repo_map( + cfg.codebase_root, + max_depth=cfg.repo_map_max_depth, + allowed_extensions=cfg.allowed_extensions, + excluded_dirs=cfg.excluded_dirs, + ) + + user_parts = [user_message] + if repo_map: + user_parts.append(f"\n## Project structure\n```\n{repo_map}\n```") + conversation.append({"role": "user", "content": "\n".join(user_parts)}) + + for step in range(cfg.max_steps): + if time.time() - t0 > cfg.overall_timeout: + logger.warning("Agent timed out at step %d", step) + break + + if _context_chars(sys_prompt, conversation) > cfg.max_context_chars: + conversation.append( + { + "role": "user", + "content": "Context limit reached. Output your improved program now.", + } + ) + + try: + assistant_msg = await asyncio.wait_for( + self._call_llm(sys_prompt, conversation), + timeout=cfg.per_step_timeout, + ) + except asyncio.TimeoutError: + logger.warning("Step %d: LLM timed out", step) + conversation.append( + { + "role": "user", + "content": "Timed out. 
Output your solution or try a simpler action.", + } + ) + continue + except Exception as e: + logger.error("Step %d: LLM error: %s", step, e) + break + + tool_calls = assistant_msg.get("tool_calls", []) + text_content = assistant_msg.get("content", "").strip() + conversation.append(assistant_msg) + + if not tool_calls: + if text_content: + logger.info( + "Agent produced text at step %d (%d files read)", step, len(files_read) + ) + return text_content + conversation.append( + { + "role": "user", + "content": "Use a tool to explore, or output your improved program.", + } + ) + continue + + for tc in tool_calls: + fn = tc.get("function", {}) + name, raw, tc_id = fn.get("name", ""), fn.get("arguments", "{}"), tc.get("id", "") + + try: + args = json.loads(raw) + except (json.JSONDecodeError, TypeError) as e: + conversation.append( + {"role": "tool", "tool_call_id": tc_id, "content": f"Bad JSON: {e}"} + ) + continue + + logger.info( + "Step %d: tool=%s args=%s", + step, + name, + { + k: (v[:80] + "...") if isinstance(v, str) and len(v) > 80 else v + for k, v in args.items() + }, + ) + + result = self._run_tool(name, args, files_read) + conversation.append( + {"role": "tool", "tool_call_id": tc_id, "content": result["content"]} + ) + + logger.warning("Agent loop ended without producing code") + return None + + async def _call_llm( + self, system_message: str, conversation: List[Dict[str, Any]] + ) -> Dict[str, Any]: + """Call a sampled LLM with tool schemas. + + Tries Chat Completions first; falls back to Responses API if the + deployment does not support Chat Completions (common on Azure). + """ + model = self.llm_pool.models[ + self.llm_pool.random_state.choices( + range(len(self.llm_pool.models)), weights=self.llm_pool.weights, k=1 + )[0] + ] + + if not hasattr(model, "client"): + raise RuntimeError( + f"Agentic mode requires an OpenAI-compatible LLM ({type(model).__name__} has no .client)" + ) + + # If we already know this model needs the Responses API, skip Chat Completions + if getattr(model, "_use_responses_api", False): + return await self._call_llm_responses(model, system_message, conversation) + + messages = [{"role": "system", "content": system_message}] + conversation + is_reasoning = is_openai_reasoning_model(model.model, getattr(model, "api_base", "") or "") + + params: Dict[str, Any] = { + "model": model.model, + "messages": messages, + "tools": TOOL_SCHEMAS, + "tool_choice": "auto", + } + if is_reasoning: + if model.max_tokens: + params["max_completion_tokens"] = model.max_tokens + if getattr(model, "reasoning_effort", None): + params["reasoning_effort"] = model.reasoning_effort + else: + if model.temperature is not None: + params["temperature"] = model.temperature + if model.top_p is not None: + params["top_p"] = model.top_p + if model.max_tokens is not None: + params["max_tokens"] = model.max_tokens + + loop = asyncio.get_running_loop() + try: + resp = await loop.run_in_executor( + None, lambda: model.client.chat.completions.create(**params) + ) + except Exception as exc: + if "unsupported" not in str(exc).lower() and "not found" not in str(exc).lower(): + raise + logger.info("Chat Completions unsupported for agentic; falling back to Responses API") + model._use_responses_api = True + return await self._call_llm_responses(model, system_message, conversation) + + msg = resp.choices[0].message + out: Dict[str, Any] = {"role": "assistant", "content": msg.content or ""} + if msg.tool_calls: + out["tool_calls"] = [ + { + "id": tc.id, + "type": "function", + "function": {"name": 
tc.function.name, "arguments": tc.function.arguments}, + } + for tc in msg.tool_calls + ] + return out + + async def _call_llm_responses( + self, + model, + system_message: str, + conversation: List[Dict[str, Any]], + ) -> Dict[str, Any]: + """Call the LLM via the Responses API (Azure-compatible) with tool support.""" + is_reasoning = is_openai_reasoning_model(model.model, getattr(model, "api_base", "") or "") + + input_items = convert_messages_to_responses_input(conversation) + + resp_params: Dict[str, Any] = { + "model": model.model, + "input": input_items, + "instructions": system_message, + "tools": TOOL_SCHEMAS_RESPONSES, + "tool_choice": "auto", + } + if is_reasoning: + if model.max_tokens: + resp_params["max_output_tokens"] = model.max_tokens + if getattr(model, "reasoning_effort", None): + resp_params["reasoning"] = {"effort": model.reasoning_effort} + else: + if model.temperature is not None: + resp_params["temperature"] = model.temperature + if model.max_tokens is not None: + resp_params["max_output_tokens"] = model.max_tokens + + loop = asyncio.get_running_loop() + resp = await loop.run_in_executor( + None, lambda: model.client.responses.create(**resp_params) + ) + + text, _, tool_calls = extract_responses_output(resp) + out: Dict[str, Any] = {"role": "assistant", "content": text} + if tool_calls: + out["tool_calls"] = tool_calls + return out + + # ------------------------------------------------------------------ + # Tools + # ------------------------------------------------------------------ + + def _run_tool(self, name: str, args: Dict[str, Any], files_read: set) -> Dict[str, Any]: + try: + if name == "read_file": + return self._tool_read_file(args, files_read) + elif name == "search": + return self._tool_search(args) + return _err(f"Unknown tool '{name}'. Available: read_file, search.") + except Exception as e: + return _err(f"Tool '{name}' error: {e}") + + def _tool_read_file(self, args: Dict[str, Any], files_read: set) -> Dict[str, Any]: + path = args.get("path", "") + if not path: + return _err("'path' is required.") + + root = self.config.codebase_root + if not root: + return _err("codebase_root not configured.") + full = os.path.join(root, path) if not os.path.isabs(path) else path + + ok, resolved, err = _validate_path( + full, root, self.config.allowed_extensions, self.config.excluded_dirs + ) + if not ok: + return _err(err) + + if resolved not in files_read and len(files_read) >= self.config.max_files_read: + return _err(f"Read limit ({self.config.max_files_read}). Output your solution.") + + try: + with open(resolved, "r", encoding="utf-8", errors="replace") as f: + lines = f.readlines() + except Exception as e: + return _err(f"Cannot read: {e}") + + total = len(lines) + start = max(1, int(args.get("line_start") or 1)) - 1 + end = min(total, int(args.get("line_end") or total)) + content = "".join(lines[start:end]) + + if len(content) > self.config.max_file_chars: + half = self.config.max_file_chars // 2 + content = ( + content[:half] + + f"\n\n... 
({len(content) - self.config.max_file_chars} chars truncated) ...\n\n" + + content[-half:] + ) + + files_read.add(resolved) + rel = os.path.relpath(resolved, root) + numbered = [ + f"{i:4d} | {ln.rstrip(chr(10))}" + for i, ln in enumerate(content.splitlines(True), start=start + 1) + ] + return {"content": f"{rel} (lines {start + 1}-{end} of {total})\n" + "\n".join(numbered)} + + def _tool_search(self, args: Dict[str, Any]) -> Dict[str, Any]: + pattern = args.get("pattern", "") + glob_pat = args.get("file_glob", "*.py") + + if not pattern: + return _err("'pattern' is required.") + if len(pattern) > self.config.max_regex_length: + return _err(f"Pattern too long ({len(pattern)} > {self.config.max_regex_length}).") + + safety_err = _check_regex_safety(pattern) + if safety_err: + return _err(safety_err) + + try: + compiled = re.compile(pattern) + except re.error as e: + return _err(f"Invalid regex: {e}") + + root = self.config.codebase_root + if not root: + return _err("codebase_root not configured.") + excluded = set(self.config.excluded_dirs) + allowed = set(self.config.allowed_extensions) + matches: List[str] = [] + n_files = 0 + max_results = self.config.max_search_results + + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if not d.startswith(".") and d not in excluded] + for fname in filenames: + if not fnmatch.fnmatch(fname, glob_pat): + continue + if os.path.splitext(fname)[1].lower() not in allowed: + continue + fpath = os.path.join(dirpath, fname) + try: + if os.path.getsize(fpath) > self.config.max_file_chars: + continue + with open(fpath, "r", encoding="utf-8", errors="replace") as f: + text = f.read() + except Exception: + continue + + n_files += 1 + ok, hits, err = _safe_regex_search(compiled, text, self.config.regex_timeout) + if not ok: + return _err(err) + + rel = os.path.relpath(fpath, root) + for hit in hits: + matches.append(f"{rel}:{hit}") + if len(matches) >= max_results: + break + if len(matches) >= max_results: + break + if len(matches) >= max_results: + break + + if not matches: + return {"content": f"No matches for '{pattern}' in {n_files} files."} + + suffix = f"\n(capped at {max_results} results)" if len(matches) >= max_results else "" + return {"content": "\n".join(matches) + suffix} + + +# ------------------------------------------------------------------ +# Helpers +# ------------------------------------------------------------------ + + +def _err(msg: str) -> Dict[str, Any]: + return {"content": msg, "_error": True} + + +def _context_chars(system: str, conversation: List[Dict[str, Any]]) -> int: + n = len(system) + for msg in conversation: + n += len(msg.get("content", "")) + for tc in msg.get("tool_calls", []): + n += len(tc.get("function", {}).get("arguments", "")) + return n + + +_SENSITIVE_FILENAMES = frozenset( + { + ".env", + ".env.local", + ".env.production", + ".env.staging", + "secrets.json", + "secrets.yaml", + "secrets.yml", + "credentials.json", + "credentials.yaml", + "service-account.json", + "service_account.json", + ".netrc", + ".pgpass", + ".my.cnf", + } +) + + +def _validate_path( + requested: str, root: str, allowed_ext: tuple, excluded_dirs: tuple +) -> Tuple[bool, str, str]: + """Validate a file path. 
Returns (ok, resolved_path, error_message).""" + try: + resolved = os.path.realpath(requested) + except (OSError, ValueError) as e: + return False, "", f"Invalid path: {e}" + + root_abs = os.path.realpath(root) + if not resolved.startswith(root_abs + os.sep) and resolved != root_abs: + return False, "", "Path outside codebase root." + + try: + rel = os.path.relpath(resolved, root_abs) + for part in Path(rel).parts: + if part in excluded_dirs: + return False, "", f"Path in excluded directory '{part}'." + except ValueError: + pass + + basename = os.path.basename(resolved).lower() + if basename in _SENSITIVE_FILENAMES: + return False, "", f"Access denied: '{basename}' may contain secrets." + + if not os.path.isfile(resolved): + parent_dir = os.path.dirname(resolved) + if os.path.isdir(parent_dir): + try: + siblings = sorted(os.listdir(parent_dir))[:15] + rel_dir = os.path.relpath(parent_dir, root_abs) + return ( + False, + "", + f"Not found: '{os.path.basename(resolved)}'. '{rel_dir}/' contains: {siblings}", + ) + except OSError: + pass + return False, "", f"File not found: '{requested}'." + + ext = os.path.splitext(resolved)[1].lower() + if ext not in allowed_ext: + return False, "", f"Extension '{ext}' not allowed." + + return True, resolved, "" + + +_NESTED_QUANTIFIER_RE = re.compile(r"\([^)]*[+*][^)]*\)\s*[+*?]|\([^)]*[+*][^)]*\)\s*\{") + +_MAX_SEARCH_LINE_LEN = 2000 + + +def _check_regex_safety(pattern: str) -> Optional[str]: + """Reject patterns with nested quantifiers that cause catastrophic backtracking.""" + if _NESTED_QUANTIFIER_RE.search(pattern): + return "Nested quantifiers detected (e.g. '(a+)+'). Use a simpler pattern." + return None + + +_REGEX_EXECUTOR = concurrent.futures.ThreadPoolExecutor(max_workers=1, thread_name_prefix="regex") + + +def _safe_regex_search( + compiled: "re.Pattern", text: str, timeout: float = 2.0 +) -> Tuple[bool, List[str], str]: + """Regex search with thread-based timeout.""" + + def do_search(): + return [ + f"{i}: {line}" + for i, line in enumerate(text.splitlines(), 1) + if len(line) <= _MAX_SEARCH_LINE_LEN and compiled.search(line) + ] + + fut = _REGEX_EXECUTOR.submit(do_search) + try: + result = fut.result(timeout=timeout) + return True, result, "" + except concurrent.futures.TimeoutError: + return False, [], f"Regex timed out ({timeout}s). Simplify the pattern."
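

# ----------------------------------------------------------------------
# Illustrative smoke test (added for documentation; not part of the
# original module). A minimal sketch of how the path and regex guards
# above behave when exercised directly -- the sample file names and
# patterns below are hypothetical, chosen only to show the return shapes.
# ----------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    # Path validation: only existing files under the codebase root, with an
    # allowed extension and a non-sensitive basename, pass the check.
    with tempfile.TemporaryDirectory() as demo_root:
        sample = os.path.join(demo_root, "example.py")
        with open(sample, "w", encoding="utf-8") as f:
            f.write("print('hello')\n")

        ok, resolved, err = _validate_path(sample, demo_root, (".py",), ())
        print(ok, resolved, err)  # True, <resolved path>, ""

        ok, _, err = _validate_path("/etc/passwd", demo_root, (".py",), ())
        print(ok, err)  # False, "Path outside codebase root."

    # Regex safety: nested quantifiers are rejected before compilation, and
    # searches run in a worker thread bounded by a timeout.
    print(_check_regex_safety(r"(a+)+b"))  # warning about nested quantifiers
    print(_safe_regex_search(re.compile(r"def \w+"), "def foo():\n    pass\n"))
    # -> (True, ['1: def foo():'], '')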