diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..7b6ea64949b35dd2657ef827e1be14580557de3f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/logo_vector.png filter=lfs diff=lfs merge=lfs -text +assets/benchmarks.png filter=lfs diff=lfs merge=lfs -text +assets/scaling_comparison.png filter=lfs diff=lfs merge=lfs -text +assets/architecture.png filter=lfs diff=lfs merge=lfs -text +assets/comparison.png filter=lfs diff=lfs merge=lfs -text diff --git a/assets/architecture.png b/assets/architecture.png new file mode 100644 index 0000000000000000000000000000000000000000..dc1ec9ae9a5fcada5d7f78d9f8b1211292dd271e --- /dev/null +++ b/assets/architecture.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b10c6bfb1734211abab7fa2e53b36931428d842ade3c96cbef255543b3889d8 +size 278055 diff --git a/assets/benchmarks.png b/assets/benchmarks.png new file mode 100644 index 0000000000000000000000000000000000000000..0c31ca7f6b75ba642bf4fc0f9e997f2877cd3ed7 --- /dev/null +++ b/assets/benchmarks.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a69cb4c8119b79901ecfcdf93088e932643d6e0890d3c984dead40c407dc5b +size 758496 diff --git a/assets/comparison.png b/assets/comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..bf6d0e10a9f2d1623855cb2939a350d1a9d05a05 --- /dev/null +++ b/assets/comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d68074ff5106764b1328b23ef5e949332aab3541172f8d91e2580d6f168e184 +size 399206 diff --git a/assets/logo_vector.png b/assets/logo_vector.png new file mode 100644 index 0000000000000000000000000000000000000000..b71958f5bd2f67ac1ff69f9be6a0d7ad3e549390 --- /dev/null +++ b/assets/logo_vector.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d74ce6a1024e519a5afc85706133e31bafeb06b48b603a11284845b549cb586e +size 891106 diff --git a/assets/scaling_comparison.png b/assets/scaling_comparison.png new file mode 100644 index 0000000000000000000000000000000000000000..3c947e5218baf688e7ea5ad7b51db0b17b35e8c8 --- /dev/null +++ b/assets/scaling_comparison.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2aa00d9f59b5e14fc10d2569b872632fb992ab61fcfbba2ae946bef9deb22d8 +size 296874 diff --git a/benchmarks/ADRS/README.md b/benchmarks/ADRS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7b3f66aa7e1ba28c5c2b7e5303112087e2015469 --- /dev/null +++ b/benchmarks/ADRS/README.md @@ -0,0 +1,63 @@ +# ADRS: AI-Driven Research for Systems + +This directory contains the systems optimization benchmarks from the **AI-Driven Research for Systems (ADRS)** initiative at UC Berkeley. + +ADRS investigates how AI — large language models, evolutionary algorithms, and multi-agent architectures — can autonomously design, optimize, and evaluate computer systems. Instead of treating systems research as a purely manual process, ADRS frames it as a closed-loop optimization problem: propose candidate algorithms, evaluate them against system-level objectives, analyze failure modes, adapt the search strategy, and iterate. + +Each benchmark below defines a concrete systems task with a provided evaluator, initial program, and configuration. 
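+
+Conceptually, that closed loop reduces to a few lines of code. The sketch below is illustrative only: the `evaluate` and `propose` callables are assumptions standing in for the task-specific evaluator and the LLM-backed mutation step, not the actual SkyDiscover API.
+
+```python
+import random
+
+def evolve(seed_program, evaluate, propose, iterations=100):
+    """Illustrative evolutionary-search skeleton (NOT the real SkyDiscover API)."""
+    population = [(seed_program, evaluate(seed_program))]
+    for _ in range(iterations):
+        # Tournament selection: sample a few candidates, keep the best as parent.
+        parent, _ = max(random.sample(population, min(4, len(population))),
+                        key=lambda pair: pair[1])
+        child = propose(parent)              # e.g. an LLM-generated rewrite
+        population.append((child, evaluate(child)))
+    return max(population, key=lambda pair: pair[1])
+```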
Solutions are evolved using SkyDiscover's evolutionary search loop. + +## Benchmarks + +### Cloudcast — Multi-Cloud Data Transfer + +**Directory:** `cloudcast/` + +Given a network of cloud regions with heterogeneous egress pricing and bandwidth, broadcast a dataset from a source region to multiple destinations at minimum total cost. The evolved algorithm must construct routing topologies (e.g., relay trees, Steiner-like structures) that exploit shared intermediate hops across transfers. + +### Expert Parallelism Load Balancer (EPLB) + +**Directory:** `eplb/` + +In Mixture-of-Experts (MoE) model inference, a small subset of experts handles each token, leading to GPU load imbalance when certain experts become disproportionately popular. This task evolves an algorithm that decides how many replicas each expert should have and how to assign them across GPUs, optimizing both load-balance quality and rebalancing runtime. + +### Model Placement (Prism) + +**Directory:** `prism/` + +Assign multiple LLM models to a fixed GPU cluster (80 GB per GPU) such that the worst-case KV-cache pressure ratio across GPUs is minimized. Lower pressure means more memory headroom for serving, improving throughput and stability under varying request loads. + +### LLM-SQL — Column Reordering for Prefix Caching + +**Directory:** `llm_sql/` + +When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data. + +### Transaction Scheduling (TXN) + +**Directory:** `txn_scheduling/` + +Given a set of database transactions with read/write dependencies on shared keys, find an execution ordering that minimizes the total makespan. The evolved scheduler must respect conflict constraints (read-write and write-write on the same key) while compressing the overall completion time. + +### Telemetry Repair + +**Coming soon.** The Telemetry Repair benchmark is under active development and will be released in a future update. + +## Quick Start + +Each benchmark directory contains: +- `initial_program.py` — the seed solution for evolution +- `evaluator.py` — the scoring function +- `config.yaml` — run configuration + +Run any benchmark from the repo root: + +```bash +uv run skydiscover-run \ + benchmarks/ADRS/cloudcast/initial_program.py \ + benchmarks/ADRS/cloudcast/evaluator.py \ + -c benchmarks/ADRS/cloudcast/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +See the individual benchmark directories for task-specific setup instructions (e.g., dataset downloads, GPU dependencies). diff --git a/benchmarks/ADRS/eplb/config.yaml b/benchmarks/ADRS/eplb/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6bade257e9efee6c73478ab4da871d0a93776cd --- /dev/null +++ b/benchmarks/ADRS/eplb/config.yaml @@ -0,0 +1,37 @@ +# Expert Parallelism Load Balancer (EPLB) — MoE Expert Rearrangement +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +# NOTE: Requires expert-load.json — see README.md for download instructions. +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 5 +max_solution_length: 60000 + +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 + +prompt: + system_message: |- + You are an expert programmer specializing in optimization algorithms. 
Your task + is to improve the Mixture-of-Expert models Expert Parallelism Load Balancer + (MoE EPLB) expert rearrangement algorithm. + + This algorithm will take the load metrics recorded by the vLLM server, and + rearrange the experts to balance the load. It can make replicas of some experts + to achieve better load balancing. + + Your goal will be two-fold: + 1. Improve the algorithm to achieve better load balancing; while + 2. Improve the algorithm to be more efficient, i.e. reduce the execution time + of the algorithm itself, since perfect load balancing is NP-hard. + + The current algorithm is implemented in the `rebalance_experts` function. + +evaluator: + timeout: 360 + diff --git a/benchmarks/ADRS/llm_sql/README.md b/benchmarks/ADRS/llm_sql/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5e46187d5eba7e2bea23dc5dcfde36164937b5a --- /dev/null +++ b/benchmarks/ADRS/llm_sql/README.md @@ -0,0 +1,56 @@ +# LLM-SQL — Column Reordering for Prefix Caching + +When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data. + +## Setup + +1. **Download the datasets** (~69 MB total): + + ```bash + cd benchmarks/ADRS/llm_sql + bash download_dataset.sh + ``` + + This downloads 5 CSV datasets into `datasets/`: + - `movies.csv` — Rotten Tomatoes movie reviews (~9 MB) + - `beer.csv` — Beer review dataset (~2.5 MB) + - `BIRD.csv` — BIRD text-to-SQL dataset (~34 MB) + - `PDMX.csv` — PDMX metadata dataset (~7.4 MB) + - `products.csv` — Amazon product catalog (~16 MB) + +2. **Set your API key:** + + ```bash + export OPENAI_API_KEY=... 
+ ``` + +## Run + +From the repo root: + +```bash +uv run skydiscover-run \ + benchmarks/ADRS/llm_sql/initial_program.py \ + benchmarks/ADRS/llm_sql/evaluator.py \ + -c benchmarks/ADRS/llm_sql/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +## Scoring + +Combined score: `0.95 * average_hit_rate + 0.05 * (12 - min(12, avg_runtime)) / 12` + +- **Hit rate** (95% weight): prefix-cache hit count normalized across 5 datasets +- **Runtime** (5% weight): wall-clock seconds for the reordering algorithm + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Baseline `Evolved` class with `reorder()` method to evolve | +| `evaluator.py` | Scores programs on prefix hit rate and runtime across 5 datasets | +| `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) | +| `solver.py` | Base `Algorithm` class and greedy baseline | +| `utils.py` | Prefix hit count evaluation utilities | +| `download_dataset.sh` | Script to download required CSV datasets | diff --git a/benchmarks/ADRS/llm_sql/initial_program.py b/benchmarks/ADRS/llm_sql/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c42922f61b0704fd18da9bb0dd3cae498f93c036 --- /dev/null +++ b/benchmarks/ADRS/llm_sql/initial_program.py @@ -0,0 +1,365 @@ +# EVOLVE-BLOCK-START +import pandas as pd +from solver import Algorithm +from typing import Tuple, List, Dict +from concurrent.futures import ThreadPoolExecutor, as_completed +from functools import lru_cache +from collections import Counter +import networkx as nx + + +class Evolved(Algorithm): + """ + GGR algorithm + """ + + def __init__(self, df: pd.DataFrame = None): + self.df = df + + self.dep_graph = None # NOTE: not used, for one way dependency + + self.num_rows = 0 + self.num_cols = 0 + self.column_stats = None + self.val_len = None + self.row_stop = None + self.col_stop = None + self.base = 2000 + + def find_max_group_value(self, df: pd.DataFrame, value_counts: Dict, early_stop: int = 0) -> str: + # NOTE: recalculate value counts and length for each value + value_counts = Counter(df.stack()) + weighted_counts = {val: self.val_len[val] * (count - 1) for val, count in value_counts.items()} # if count > 1 + if not weighted_counts: + return None + max_group_val, max_weighted_count = max(weighted_counts.items(), key=lambda x: x[1]) + if max_weighted_count < early_stop: + return None + return max_group_val + + def reorder_columns_for_value(self, row, value, column_names, grouped_rows_len: int = 1): + # cols_with_value will now use attribute access instead of indexing with row[] + cols_with_value = [] + for idx, col in enumerate(column_names): + if hasattr(row, col) and getattr(row, col) == value: + cols_with_value.append(col) + elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) == value: + cols_with_value.append(col) + else: + attr_name = f"_{idx}" + if hasattr(row, attr_name) and getattr(row, attr_name) == value: + cols_with_value.append(attr_name) + + if self.dep_graph is not None and grouped_rows_len > 1: + # NOTE: experimental + reordered_cols = [] + for col in cols_with_value: + dependent_cols = self.get_dependent_columns(col) + + # check if dependent columns are in row, and if column exists in row attributes + valid_dependent_cols = [] + for idx, dep_col in enumerate(dependent_cols): + if hasattr(row, dep_col): + valid_dependent_cols.append(dep_col) + elif hasattr(row, dep_col.replace(" ", "_")): + valid_dependent_cols.append(dep_col) + else: + attr_name = f"_{idx}" + if 
hasattr(row, attr_name): + valid_dependent_cols.append(dep_col) + + reordered_cols.extend([col] + valid_dependent_cols) + cols_without_value = [col for col in column_names if col not in reordered_cols] + reordered_cols.extend(cols_without_value) + assert len(reordered_cols) == len( + column_names + ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}" + return [getattr(row, col) for col in reordered_cols], cols_with_value + else: + cols_without_value = [] + for idx, col in enumerate(column_names): + if hasattr(row, col) and getattr(row, col) != value: + cols_without_value.append(col) + elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) != value: + cols_without_value.append(col) + else: + # Handle some edge cases + attr_name = f"_{idx}" + if hasattr(row, attr_name) and getattr(row, attr_name) != value: + cols_without_value.append(attr_name) + + reordered_cols = cols_with_value + cols_without_value + assert len(reordered_cols) == len( + column_names + ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}" + return [getattr(row, col) for col in reordered_cols], cols_with_value + + def get_dependent_columns(self, col: str) -> List[str]: + if self.dep_graph is None or not self.dep_graph.has_node(col): + return [] + return list(nx.descendants(self.dep_graph, col)) + + @lru_cache(maxsize=None) + def get_cached_dependent_columns(self, col: str) -> List[str]: + return self.get_dependent_columns(col) + + def fixed_reorder(self, df: pd.DataFrame, row_sort: bool = True) -> Tuple[pd.DataFrame, List[List[str]]]: + num_rows, column_stats = self.calculate_col_stats(df, enable_index=True) + reordered_columns = [col for col, _, _, _ in column_stats] + reordered_df = df[reordered_columns] + + assert reordered_df.shape == df.shape + column_orderings = [reordered_columns] * num_rows + + if row_sort: + reordered_df = reordered_df.sort_values(by=reordered_columns, axis=0) + + return reordered_df, column_orderings + + def column_recursion(self, result_df, max_value, grouped_rows, row_stop, col_stop, early_stop): + cols_settled = [] + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(self.reorder_columns_for_value, row, max_value, grouped_rows.columns.tolist(), len(grouped_rows)) + for row in grouped_rows.itertuples(index=False) + ] + for i, future in enumerate(as_completed(futures)): + reordered_row, cols_settled = future.result() + result_df.loc[i] = reordered_row + + grouped_value_counts = Counter() + + if not result_df.empty: + # Group by the first column + grouped_result_df = result_df.groupby(result_df.columns[0]) + grouped_value_counts = Counter(grouped_rows.stack()) # this is still faster than updating from cached value counts + + for _, group in grouped_result_df: + if group[group.columns[0]].iloc[0] != max_value: + continue + + dependent_cols = self.get_cached_dependent_columns(group.columns[0]) + length_of_settle_cols = len(cols_settled) + + if dependent_cols: + assert length_of_settle_cols >= 1, f"Dependent columns should be no less than 1, but got {length_of_settle_cols}" + + # test the first length_of_settle_cols columns, each column has nunique == 1 + for col in group.columns[:length_of_settle_cols]: + assert group[col].nunique() == 1, f"Column {col} should have nunique == 1, but got {group[col].nunique()}" + + # drop all the settled columns and reorder the rest + group_remainder = group.iloc[:, length_of_settle_cols:] + else: + group_remainder = group.iloc[:, 1:] + + 
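+                # Recurse on the columns that are not yet settled for this group:
+                # recount values within the remainder, reorder it, and write the
+                # result back into the group below.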
+                grouped_remainder_value_counts = Counter(group_remainder.stack())
+
+                reordered_group_remainder, _ = self.recursive_reorder(
+                    group_remainder, grouped_remainder_value_counts, early_stop=early_stop, row_stop=row_stop, col_stop=col_stop + 1
+                )
+                # Update the group with the reordered columns
+                if dependent_cols:
+                    group.iloc[:, length_of_settle_cols:] = reordered_group_remainder.values
+                else:
+                    group.iloc[:, 1:] = reordered_group_remainder.values
+
+                result_df.update(group)
+                break
+
+        return result_df, grouped_value_counts
+
+    def recursive_reorder(
+        self,
+        df: pd.DataFrame,
+        value_counts: Dict,
+        early_stop: int = 0,
+        original_columns: List[str] = None,
+        row_stop: int = 0,
+        col_stop: int = 0,
+    ) -> Tuple[pd.DataFrame, List[List[str]]]:
+        if df.empty or len(df.columns) == 0 or len(df) == 0:
+            return df, []
+
+        if self.row_stop is not None and row_stop >= self.row_stop:
+            return self.fixed_reorder(df)
+
+        if self.col_stop is not None and col_stop >= self.col_stop:
+            return self.fixed_reorder(df)
+
+        if original_columns is None:
+            original_columns = df.columns.tolist()
+
+        # Find the max group value using updated counts
+        max_value = self.find_max_group_value(df, value_counts, early_stop=early_stop)
+        if max_value is None:
+            # If there is no max value, fall back to the fixed reorder
+            return self.fixed_reorder(df)
+
+        grouped_rows = df[df.isin([max_value]).any(axis=1)]
+        remaining_rows = df[~df.isin([max_value]).any(axis=1)]
+
+        # If there are no grouped rows, return the original DataFrame
+        if grouped_rows.empty:
+            return self.fixed_reorder(df)
+
+        result_df = pd.DataFrame(columns=df.columns)
+
+        reordered_remaining_rows = pd.DataFrame(columns=df.columns)  # Initialize empty dataframe first
+
+        # Column Recursion
+        result_df, grouped_value_counts = self.column_recursion(result_df, max_value, grouped_rows, row_stop, col_stop, early_stop)
+
+        remaining_value_counts = value_counts - grouped_value_counts  # Approach 1 - update remaining value counts with subtraction
+
+        # Row Recursion
+        reordered_remaining_rows, _ = self.recursive_reorder(
+            remaining_rows, remaining_value_counts, early_stop=early_stop, row_stop=row_stop + 1, col_stop=col_stop
+        )
+        old_column_names = result_df.columns.tolist()
+        result_cols_reset = result_df.reset_index(drop=True)
+        result_rows_reset = reordered_remaining_rows.reset_index(drop=True)
+        final_result_df = pd.DataFrame(result_cols_reset.values.tolist() + result_rows_reset.values.tolist())
+
+        if row_stop == 0 and col_stop == 0:
+            final_result_df.columns = old_column_names
+            final_result_df.columns = final_result_df.columns.tolist()[:-1] + ["original_index"]
+
+        return final_result_df, []
+
+    def recursive_split_and_reorder(self, df: pd.DataFrame, original_columns: List[str] = None, early_stop: int = 0):
+        """
+        Recursively split the DataFrame into halves until the size is at most
+        ``self.base`` (2000 by default), then apply the recursive reorder function.
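+        Halves are reordered in parallel (one thread per half) and concatenated
+        back in their original top/bottom order.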
+ """ + if len(df) <= self.base: + initial_value_counts = Counter(df.stack()) + return self.recursive_reorder(df, initial_value_counts, early_stop, original_columns, row_stop=0, col_stop=0)[0] + + mid_index = len(df) // 2 + df_top_half = df.iloc[:mid_index] + df_bottom_half = df.iloc[mid_index:] + + with ThreadPoolExecutor() as executor: + future_top = executor.submit(self.recursive_split_and_reorder, df_top_half, original_columns, early_stop) + future_bottom = executor.submit(self.recursive_split_and_reorder, df_bottom_half, original_columns, early_stop) + + reordered_top_half = future_top.result() + reordered_bottom_half = future_bottom.result() + + assert reordered_bottom_half.shape == df_bottom_half.shape + reordered_df = pd.concat([reordered_top_half, reordered_bottom_half], axis=0, ignore_index=True) + + assert reordered_df.shape == df.shape + + return reordered_df + + @lru_cache(maxsize=None) + def calculate_length(self, value): + if isinstance(value, bool): + return 4**2 + if isinstance(value, (int, float)): + return len(str(value)) ** 2 + if isinstance(value, str): + return len(value) ** 2 + return 0 + + def reorder( + self, + df: pd.DataFrame, + early_stop: int = 0, + row_stop: int = None, + col_stop: int = None, + col_merge: List[List[str]] = [], + one_way_dep: List[Tuple[str, str]] = [], + distinct_value_threshold: float = 0.8, + parallel: bool = True, + ) -> Tuple[pd.DataFrame, List[List[str]]]: + # Prepare + initial_df = df.copy() + if col_merge: + self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True) + reordered_columns = [col for col, _, _, _ in self.column_stats] + for col_to_merge in col_merge: + final_col_order = [col for col in reordered_columns if col in col_to_merge] + df = self.merging_columns(df, final_col_order, prepended=False) + self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True) + self.column_stats = {col: (num_groups, avg_len, score) for col, num_groups, avg_len, score in self.column_stats} + + # One way dependency statistics [not used] + if one_way_dep is not None and len(one_way_dep) > 0: + self.dep_graph = nx.DiGraph() + for dep in one_way_dep: + col1 = [col for col in df.columns if dep[0] in col] + col2 = [col for col in df.columns if dep[1] in col] + assert len(col1) == 1, f"Expected one column to match {dep[0]}, but got {len(col1)}" + assert len(col2) == 1, f"Expected one column to match {dep[1]}, but got {len(col2)}" + col1 = col1[0] + col2 = col2[0] + self.dep_graph.add_edge(col1, col2) + + # Discard too distinct columns by threshold [optional] + nunique_threshold = len(df) * distinct_value_threshold + columns_to_discard = [col for col in df.columns if df[col].nunique() > nunique_threshold] + columns_to_discard = sorted(columns_to_discard, key=lambda x: self.column_stats[x][2], reverse=True) + columns_to_recurse = [col for col in df.columns if col not in columns_to_discard] + df["original_index"] = range(len(df)) + discarded_columns_df = df[columns_to_discard + ["original_index"]] + df_to_recurse = df[columns_to_recurse + ["original_index"]] + recurse_df = df_to_recurse + + self.column_stats = {col: stats for col, stats in self.column_stats.items() if col not in columns_to_discard} + initial_value_counts = Counter(recurse_df.stack()) + self.val_len = {val: self.calculate_length(val) for val in initial_value_counts.keys()} + + self.row_stop = row_stop if row_stop else len(recurse_df) + self.col_stop = col_stop if col_stop else len(recurse_df.columns.tolist()) + print("*" * 80) + print(f"DF 
+        print(f"DF columns = {df.columns}")
+        # print(f"Early stop = {early_stop}")
+        # print(f"Row recursion stop depth = {self.row_stop}, Column recursion stop depth = {self.col_stop}")
+        print("*" * 80)
+
+        # Early stop and fall back
+        recurse_df, _ = self.fixed_reorder(recurse_df)
+
+        # Recursive reordering
+        self.num_cols = len(recurse_df.columns)
+        if parallel:
+            reordered_df = self.recursive_split_and_reorder(recurse_df, original_columns=columns_to_recurse, early_stop=early_stop)
+        else:
+            reordered_df, _ = self.recursive_reorder(
+                recurse_df,
+                initial_value_counts,
+                early_stop=early_stop,
+            )
+
+        assert (
+            reordered_df.shape == recurse_df.shape
+        ), f"Reordered DataFrame shape {reordered_df.shape} does not match original DataFrame shape {recurse_df.shape}"
+        assert recurse_df["original_index"].is_unique, "Passed in recurse index contains duplicates!"
+        assert reordered_df["original_index"].is_unique, "Reordered index contains duplicates!"
+
+        if len(columns_to_discard) > 0:
+            final_df = pd.merge(reordered_df, discarded_columns_df, on="original_index", how="left")
+        else:
+            final_df = reordered_df
+
+        final_df = final_df.drop(columns=["original_index"])
+
+        if not col_merge:
+            assert (
+                final_df.shape == initial_df.shape
+            ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
+        else:
+            assert (
+                final_df.shape[0] == initial_df.shape[0]
+            ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
+            assert (
+                final_df.shape[1] == recurse_df.shape[1] + len(columns_to_discard) - 1
+            ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {recurse_df.shape}"
+
+        # Sort lexicographically by all columns (leading column first) to get the final order
+        final_df = final_df.sort_values(by=final_df.columns.to_list(), axis=0)
+        return final_df, []
+
+# EVOLVE-BLOCK-END
\ No newline at end of file
diff --git a/benchmarks/ADRS/prism/config.yaml b/benchmarks/ADRS/prism/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8606e18c28146db6bbc073acb15863388f17601a
--- /dev/null
+++ b/benchmarks/ADRS/prism/config.yaml
@@ -0,0 +1,24 @@
+# Prism (GPU Model Placement) — KV Cache Pressure (KVPR) Minimization
+# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s
+language: python
+diff_based_generation: true
+max_iterations: 100
+checkpoint_interval: 5
+max_solution_length: 60000
+
+llm:
+  api_base: https://api.openai.com/v1
+  models:
+    - name: "gpt-5"
+      weight: 1.0
+  max_tokens: 32000
+  timeout: 600
+
+prompt:
+  system_message: |-
+    You are an expert in model placement on GPUs. Your task is to improve a model
+    placement algorithm by improving the function named compute_model_placement in
+    the initial program, which places models onto the available GPUs.
+    The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring
+    models can fit into the GPUs' memory. Note that KVPR is the KV cache pressure
+    of a GPU: it indicates how crowded the GPU is. For a specific GPU, its KVPR is
+    computed as sum(model.req_rate/model.slo for model in models) /
+    (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are
+    the models on this GPU. The generated program should be as simple as possible,
+    and the code should execute correctly without errors.
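+
+# Worked example of the KVPR formula above (illustrative numbers): a GPU holding
+# two models of sizes 20 GB and 30 GB, with req_rate/slo of 0.5 and 1.0, has
+# KVPR = (0.5 + 1.0) / (80 - (20 + 30)) = 1.5 / 30 = 0.05.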
+
+evaluator:
+  timeout: 360
+
diff --git a/benchmarks/ADRS/prism/evaluator/evaluate.sh b/benchmarks/ADRS/prism/evaluator/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634
--- /dev/null
+++ b/benchmarks/ADRS/prism/evaluator/evaluate.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROGRAM="$1"
+# MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+python /benchmark/evaluator.py "$PROGRAM"
diff --git a/benchmarks/ADRS/prism/evaluator/evaluator.py b/benchmarks/ADRS/prism/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..8689033cbbcb0bb74c1dcb50b73604fa59c287ba
--- /dev/null
+++ b/benchmarks/ADRS/prism/evaluator/evaluator.py
@@ -0,0 +1,259 @@
+import importlib.util
+import numpy as np
+import time
+import concurrent.futures
+import traceback
+from dataclasses import dataclass
+
+GPU_MEM_SIZE = 80  # GB
+MIN_INT = float('-inf')  # Define MIN_INT as negative infinity
+
+@dataclass
+class Model:
+    model_name: str
+    model_size: int
+    req_rate: int
+    slo: int
+    cur_gpu_id: int
+
+
+def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30):
+    """
+    Run a function with a timeout using concurrent.futures
+    """
+    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+        future = executor.submit(func, *args, **kwargs)
+        try:
+            result = future.result(timeout=timeout_seconds)
+            return result
+        except concurrent.futures.TimeoutError:
+            raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
+
+
+def safe_float(value):
+    """Convert a value to float safely"""
+    try:
+        if np.isnan(value) or np.isinf(value):
+            return 0.0
+        return float(value)
+    except (TypeError, ValueError):
+        return 0.0
+
+def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool:
+    """
+    Verify whether the models can fit into GPU memory
+    """
+    # Check if the placement data is valid
+    if placement_data is None:
+        return False
+
+    # Check the per-GPU memory constraint
+    for gpu_id, models in placement_data.items():
+        if sum(model.model_size for model in models) > GPU_MEM_SIZE:
+            return False
+
+    return True
+
+
+def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float:
+    """
+    Calculate the maximum KV cache pressure (KVPR) across all GPUs
+    """
+    max_kvpr = MIN_INT
+    for gpu_id, models in placement_data.items():
+        total_model_size = sum(model.model_size for model in models)
+        total_weighted_req_rate = sum(model.req_rate / model.slo for model in models)
+        if GPU_MEM_SIZE - total_model_size > 0:
+            kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size)
+        else:
+            kvpr = 1000000
+        max_kvpr = max(max_kvpr, kvpr)
+
+    return max_kvpr
+
+
+def generate_test_gpu_models(num_tests=50):
+    """
+    Generate multiple test cases (GPU counts and model lists) with different characteristics
+    """
+    test_cases = []
+    np.random.seed(42)
+
+    for i in range(num_tests):
+        gpu_num = np.random.randint(5, 10)
+        gpu_models = []
+        for j in range(gpu_num*2):
+            model_size = np.random.randint(10, 30)
+            req_rate = np.random.randint(1, 10)
+            slo = np.random.randint(5, 10)
+            gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j))
+
+        test_cases.append((gpu_num, gpu_models))
+
+    return test_cases
+
+def evaluate(program_path):
+    """
+    Main evaluation function that tests the model placement algorithm
+    on multiple generated test cases and calculates the composite performance metric.
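+
+    Returns a dict with "max_kvpr" (reported as the inverse of the mean max-KVPR,
+    so higher is better), "execution_time", "success_rate", and "combined_score".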
+ """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "compute_model_placement"): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "Missing compute_model_placement function", + } + + # Generate test gpu and models + test_gpu_models = generate_test_gpu_models() + + # Collect metrics across all tests + all_kvpr = [] + all_metrics = [] + successful_runs = 0 + + for i, (gpu_num, gpu_models) in enumerate(test_gpu_models): + try: + # Run the algorithm with timeout + start_time = time.time() + + # Call the program's main function + result = run_with_timeout( + program.compute_model_placement, + kwargs={ + 'gpu_num': gpu_num, + 'models': gpu_models + }, + timeout_seconds=10 + ) + + execution_time = time.time() - start_time + + # Validate result format + if not isinstance(result, dict): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Placement {i}: Expected dict, got {type(result).__name__}", + } + + # Validate all models are placed + placed_models = [] + for gpu_id, assigned_models in result.items(): + if not isinstance(assigned_models, list): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}", + } + placed_models.extend(assigned_models) + + if len(placed_models) != len(gpu_models): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}", + } + + # Check for duplicate placements (by object identity) + placed_ids = [id(m) for m in placed_models] + if len(set(placed_ids)) != len(placed_ids): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"Duplicate models detected", + } + + # Check placed models are the exact input objects + original_ids = {id(m) for m in gpu_models} + if set(placed_ids) != original_ids: + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "Placed models don't match input models (missing or foreign models)", + } + + # Verify GPU memory constraints + if not verify_gpu_mem_constraint(result): + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": f"GPU memory constraint violated", + } + + # Calculate metrics using the generated test signal + max_kvpr = calculate_kvcache_pressure(result) + + # Store metrics + metrics = { + 'max_kvpr': safe_float(max_kvpr), + 'execution_time': safe_float(execution_time), + } + + all_kvpr.append(safe_float(max_kvpr)) + all_metrics.append(metrics) + successful_runs += 1 + + except TimeoutError: + print(f"Placement {i}: Timeout") + continue + except Exception as e: + print(f"Placement {i}: Error - {str(e)}") + continue + + # If no successful runs, return minimal scores + if successful_runs == 0: + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": "All test signals failed" + } + + print(all_metrics) + # Calculate aggregate metrics + avg_kvpr = np.mean(all_kvpr) + if avg_kvpr != 0: + avg_kvpr = 1.0 / avg_kvpr + avg_execution_time = np.mean([m['execution_time'] for m in all_metrics]) + success_rate = successful_runs / len(test_gpu_models) + + return { + "max_kvpr": safe_float(avg_kvpr), + "execution_time": 
safe_float(avg_execution_time), + "success_rate": safe_float(success_rate), + "combined_score": safe_float(avg_kvpr) + safe_float(success_rate), + } + + except Exception as e: + print(f"Evaluation failed: {str(e)}") + print(traceback.format_exc()) + return { + "max_kvpr": 0.0, + "success_rate": 0.0, + "combined_score": 0.0, + "error": str(e) + } + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is auto-injected at build time from + # skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/ADRS/prism/initial_program.py b/benchmarks/ADRS/prism/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c3f372884d04e875ddb136a2127be586c704cbfe --- /dev/null +++ b/benchmarks/ADRS/prism/initial_program.py @@ -0,0 +1,75 @@ +GPU_MEM_SIZE = 80 # GB + +# EVOLVE-BLOCK-START + +def compute_model_placement(gpu_num, models): + """ + Compute a model placement that minimizes the maximum KVPR across all GPUs. + + Args: + gpu_num: Number of GPUs + models: List of models to place + + Returns: + A placement of models to GPUs + """ + + # Greedy KVPR-minimizing placement based on Algorithm 1 (without τ check) + # 1) Sort models by r_j / s_j in descending order + sorted_models = sorted(models, key=lambda m: (m.req_rate / m.slo), reverse=True) + + # 2) Initialize per-GPU states + placement = {gpu_id: [] for gpu_id in range(gpu_num)} + shared_kv = [GPU_MEM_SIZE for _ in range(gpu_num)] # remaining memory per GPU + weighted_req_rate = [0.0 for _ in range(gpu_num)] # sum of r_j / s_j per GPU + + # 3) Assign each model to the GPU that minimizes current KVPR while fitting in memory + for model in sorted_models: + best_idx = None + best_ratio = float('inf') + + for gpu_id in range(gpu_num): + if model.model_size <= shared_kv[gpu_id] and shared_kv[gpu_id] > 0: + current_ratio = weighted_req_rate[gpu_id] / shared_kv[gpu_id] + if current_ratio < best_ratio: + best_ratio = current_ratio + best_idx = gpu_id + + # Failure: if no GPU can fit, raise an error instead of overcommitting + if best_idx is None: + raise ValueError( + f"Unable to place model of size {model.model_size} GB on any GPU. 
" + f"Remaining per-GPU memory: {shared_kv}" + ) + + placement[best_idx].append(model) + weighted_req_rate[best_idx] += model.req_rate / model.slo + shared_kv[best_idx] -= model.model_size + + return placement + +# EVOLVE-BLOCK-END + + +if __name__ == "__main__": + # Test the algorithm + + from evaluator import generate_test_gpu_models + from evaluator import calculate_kvcache_pressure + from evaluator import safe_float + import numpy as np + + test_cases = generate_test_gpu_models() + all_kvpr = [] + for i, (gpu_num, gpu_models) in enumerate(test_cases): + + results = compute_model_placement(gpu_num, gpu_models) + max_kvpr = calculate_kvcache_pressure(results) + all_kvpr.append(safe_float(max_kvpr)) + + avg_kvpr = np.mean(all_kvpr) + if avg_kvpr != 0: + avg_kvpr = 1.0 / avg_kvpr + + + print(f"Max KVPR: {avg_kvpr:.3f}") diff --git a/benchmarks/ADRS/prism/initial_program_naive.py b/benchmarks/ADRS/prism/initial_program_naive.py new file mode 100644 index 0000000000000000000000000000000000000000..01013917e811741d8041fbdce5569d36e7dc6a1c --- /dev/null +++ b/benchmarks/ADRS/prism/initial_program_naive.py @@ -0,0 +1,30 @@ +# EVOLVE-BLOCK-START + +GPU_MEM_SIZE = 80 # GB + +def compute_model_placement(gpu_num, models): + """ + Compute a model placement that minimizes the maximum KVPR across all GPUs. + + Args: + gpu_num: Number of GPUs + models: List of models to place + + Returns: + A placement of models to GPUs + """ + + # gready algorithm to place models to the GPUs with smallest gpu_id first + + placement = dict() + for gpu_id in range(gpu_num): + placement[gpu_id] = [] + + for model in models: + for gpu_id in range(gpu_num): + if model.model_size <= GPU_MEM_SIZE - sum(model.model_size for model in placement[gpu_id]): + placement[gpu_id].append(model) + break + return placement + +# EVOLVE-BLOCK-END diff --git a/benchmarks/arc_benchmark/README.md b/benchmarks/arc_benchmark/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce915d186b4c6626b41675fac9805b8d194896fb --- /dev/null +++ b/benchmarks/arc_benchmark/README.md @@ -0,0 +1,108 @@ +# ARC Benchmark + +Evolves ARC-AGI visual reasoning task solutions using SkyDiscover. + +## Setup + +### 1. Download ARC data + +Clone the ARC-AGI-2 repo and convert the data: + +```bash +cd benchmarks/arc_benchmark +git clone https://github.com/arcprize/ARC-AGI-2.git /tmp/ARC-AGI-2 +OUT_DIR=./data uv run python convert_arc_agi2_data.py /tmp/ARC-AGI-2 +rm -rf /tmp/ARC-AGI-2 +``` + +This creates 4 files in `data/`: +- `arc-agi_training_challenges.json` (1000 tasks) +- `arc-agi_training_solutions.json` +- `arc-agi_evaluation_challenges.json` (120 tasks) +- `arc-agi_evaluation_solutions.json` + +### 2. Set your API key + +```bash +export OPENAI_API_KEY=... +``` + +## Run a single task + +ARC requires a per-task config (each task has unique training examples as the prompt). 
+Use `generate_config.py` to create the per-task config, then run with any search backend:
+
+```bash
+cd benchmarks/arc_benchmark
+
+# Generate task-specific config
+TASK_NUM=0 ARC_TASK_FILE=training CONFIG_OUT=./config_task_0.yaml \
+  uv run python generate_config.py
+
+# Run with any search backend (select it via -s, e.g. evox, openevolve, gepa)
+uv run skydiscover-run initial_program.py evaluator.py \
+  -c config_task_0.yaml -s [your_algorithm] -i 30
+```
+
+## Run all evaluation tasks
+
+```bash
+cd benchmarks/arc_benchmark
+export ARC_TASK_FILE=evaluation
+
+NUM_TASKS=$(uv run python -c "import json; print(len(json.load(open('data/arc-agi_evaluation_challenges.json'))))")
+
+for i in $(seq 0 $((NUM_TASKS - 1))); do
+  TASK_NUM=$i CONFIG_OUT=./config_task_${i}.yaml uv run python generate_config.py
+  TASK_NUM=$i uv run skydiscover-run initial_program.py evaluator.py \
+    -c config_task_${i}.yaml -s [your_algorithm] -i 30 \
+    -o outputs/eval_task_${i}
+done
+```
+
+## Post-discovery test evaluation
+
+After the discovery process, evaluate the best program on held-out test inputs:
+
+```bash
+TASK_NUM=0 ARC_TASK_FILE=evaluation \
+  OUTS_DIR=./outputs/eval_task_0/adaevolve \
+  uv run python post_discovery_eval.py
+```
+
+## Config: GPT vs Gemini
+
+Edit `config.yaml` — comment the GPT block and uncomment the Gemini block, or override with the `-m`/`--model` flag:
+
+```bash
+uv run skydiscover-run ... -m gemini/gemini-3-pro-preview
+```
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `initial_program.py` | Seed program with two transform functions to evolve |
+| `evaluator.py` | Scores programs on pass@2 + cell accuracy |
+| `config.yaml` | Base config template (prompt injected by generate_config.py) |
+| `generate_config.py` | Injects task-specific training examples into config as system prompt |
+| `post_discovery_eval.py` | Evaluates best program on held-out test inputs |
+| `convert_arc_agi2_data.py` | Converts raw ARC-AGI-2 data to benchmark format |
+| `requirements.txt` | Dependencies (numpy) |
+
+## Environment variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `OPENAI_API_KEY` | (required) | API key |
+| `ARC_TASK_FILE` | `training` | `training` or `evaluation` |
+| `TASK_NUM` | `0` | Task index within the dataset |
+| `BASE_CONFIG` | `./config.yaml` | Base config template path |
+| `CONFIG_OUT` | `./config_task_{N}.yaml` | Output path for generated config |
+| `DATA_ROOT` | `./data` | Path to ARC data directory |
+| `MAX_ITERATIONS` | (from config) | Override `max_iterations` at runtime |
+| `ARC_EVAL_INCLUDE_TEST` | `0` | Set to `1` to also run the held-out test inputs during evolution |
+| `ARC_EVAL_USE_TEST_FOR_SCORE` | `0` | Set to `1` to average train and test scores into `combined_score` (only used when `ARC_EVAL_INCLUDE_TEST=1`) |
diff --git a/benchmarks/arc_benchmark/config.yaml b/benchmarks/arc_benchmark/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6b11cef587896cdb13bc50a172b0470fd29675bc
--- /dev/null
+++ b/benchmarks/arc_benchmark/config.yaml
@@ -0,0 +1,51 @@
+# ARC Benchmark base config
+# This file is used by generate_config.py to inject a task-specific prompt.
+# Switch models by editing the 'llm' section below.
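+# The MAX_ITERATIONS environment variable overrides max_iterations at runtime
+# (see the environment variable table in the README).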
+
+# General settings
+max_iterations: 30
+checkpoint_interval: 10
+log_level: "INFO"
+random_seed: 42
+diff_based_generation: true
+max_solution_length: 50000
+
+# LLM configuration (Option A: GPT-5, default)
+llm:
+  models:
+    - name: "gpt-5"
+      weight: 1.0
+      api_base: "https://api.openai.com/v1"
+      temperature: 0.7
+      # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject requests that set both temperature and top_p
+      max_tokens: 32768
+      timeout: 3000
+
+# Option B: Gemini 3 Pro (comment Option A above and uncomment below)
+# llm:
+#   models:
+#     - name: "gemini-3-pro-preview"
+#       weight: 1.0
+#       api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+#       temperature: 0.7
+#       top_p: 0.95
+#       max_tokens: 32768
+#       timeout: 3000
+
+# Search configuration (default: top-k)
+search:
+  type: "topk"
+  database:
+    random_seed: 42
+    num_context_programs: 4
+
+# Prompt configuration
+# NOTE: generate_config.py overwrites prompt.system_message per task.
+prompt:
+  system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG"
+
+# Evaluator configuration
+evaluator:
+  timeout: 360
+  max_retries: 3
+  cascade_evaluation: false
diff --git a/benchmarks/arc_benchmark/convert_arc_agi2_data.py b/benchmarks/arc_benchmark/convert_arc_agi2_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..78d588d4fa3c7f2260d553d571dad5d3b8bbc0fa
--- /dev/null
+++ b/benchmarks/arc_benchmark/convert_arc_agi2_data.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""
+Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json)
+into the format expected by this benchmark:
+  - arc-agi_{split}_challenges.json  (task_id -> { train, test with inputs only })
+  - arc-agi_{split}_solutions.json   (task_id -> list of test output grids)
+
+Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation):
+  OUT_DIR=./data python3 convert_arc_agi2_data.py .
+
+Or with an external ARC-AGI-2 clone:
+  python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2
+  # Writes into that path by default; set OUT_DIR to write elsewhere.
+""" +import json +import os +import sys + + +def convert_split(repo_root: str, split: str, out_dir: str) -> None: + """Convert data/{split}/*.json into challenges + solutions JSON.""" + split_dir = os.path.join(repo_root, "data", split) + if not os.path.isdir(split_dir): + print(f"Skip {split}: no directory {split_dir}") + return + + challenges = {} + solutions = {} + + for name in sorted(os.listdir(split_dir)): + if not name.endswith(".json"): + continue + task_id = name[:-5] # strip .json + path = os.path.join(split_dir, name) + with open(path, "r") as f: + task = json.load(f) + # Challenge: train as-is; test with only "input" (no output) + challenges[task_id] = { + "train": task["train"], + "test": [{"input": p["input"]} for p in task["test"]], + } + # Solutions: list of test output grids + solutions[task_id] = [p["output"] for p in task["test"]] + + challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json") + solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json") + with open(challenges_path, "w") as f: + json.dump(challenges, f) + with open(solutions_path, "w") as f: + json.dump(solutions, f) + print(f"Wrote {challenges_path} ({len(challenges)} tasks)") + print(f"Wrote {solutions_path} ({len(solutions)} tasks)") + + +def main(): + repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".") + out_dir = os.getenv("OUT_DIR", repo_root) + for split in ("training", "evaluation"): + convert_split(repo_root, split, out_dir) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/arc_benchmark/evaluator/Dockerfile b/benchmarks/arc_benchmark/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/arc_benchmark/evaluator/evaluate.sh b/benchmarks/arc_benchmark/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. 
+
+python /benchmark/evaluator.py "$PROGRAM"
diff --git a/benchmarks/arc_benchmark/evaluator/evaluator.py b/benchmarks/arc_benchmark/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..de18fd5bcb88cca5a5a01da83863541956ebee8d
--- /dev/null
+++ b/benchmarks/arc_benchmark/evaluator/evaluator.py
@@ -0,0 +1,407 @@
+import numpy as np
+from typing import List, Tuple, Dict, Any
+import json
+import os
+
+try:
+    from skydiscover.evaluation.evaluation_result import EvaluationResult
+except ImportError:
+    from dataclasses import dataclass, field
+    from typing import Union
+
+    @dataclass
+    class EvaluationResult:
+        metrics: Dict[str, float]
+        artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)
+import importlib.util
+
+TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+TASK_NUM = os.getenv("TASK_NUM", 0)  # str when set via the environment; int() is applied at use sites
+DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
+INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes")
+USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes")
+
+
+def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float:
+    """
+    Compute continuous cell-level accuracy between prediction and ground truth.
+    Returns a float in [0, 1]. Handles shape mismatches gracefully.
+    """
+    if pred.shape != gt.shape:
+        # Partial credit for getting the shape partially right
+        shape_score = 0.0
+        if len(pred.shape) == len(gt.shape) == 2:
+            row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0
+            col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0
+            shape_score = (row_match + col_match) * 0.1  # 0.1 per matching dimension
+        return shape_score
+    # Cell-level accuracy
+    total_cells = gt.size
+    if total_cells == 0:
+        return 1.0
+    correct_cells = int(np.sum(pred == gt))
+    return correct_cells / total_cells
+
+
+def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float:
+    """Return the best cell accuracy across all attempts for one example."""
+    return max(cell_accuracy_single(a, gt) for a in attempts)
+
+
+def pass_at_2_accuracy_single(
+    attempts: List[np.ndarray],
+    gt: np.ndarray
+) -> Tuple[int, Dict[int, Any]]:
+    """
+    Compute pass@2 accuracy for a single ARC test case.
+
+    Args:
+        attempts: List of 2 numpy arrays representing model attempts.
+        gt: Ground-truth output as a 2D numpy array.
+
+    Returns:
+        pass_at_2: int (1 if any attempt is perfectly correct, else 0)
+        diagnostics: dict mapping attempt index -> diagnostic info.
+            If sizes match, includes indices of incorrect cells.
+    """
+    assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation."
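+
+    # pass@2 semantics: the example counts as solved if either of the two
+    # attempts reproduces the ground-truth grid exactly.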
+ + diagnostics = {} + passed = False + + for i, pred in enumerate(attempts): + attempt_info = {} + + # Size check + if pred.shape != gt.shape: + attempt_info["size_match"] = False + attempt_info["pred_shape"] = list(pred.shape) + attempt_info["gt_shape"] = list(gt.shape) + attempt_info["incorrect_indices"] = None + attempt_info["cell_accuracy"] = 0.0 + attempt_passed = False + else: + attempt_info["size_match"] = True + + # Find incorrect cells + incorrect_mask = pred != gt + incorrect_indices = np.argwhere(incorrect_mask) + + attempt_info["incorrect_indices"] = incorrect_indices.tolist() + attempt_info["num_incorrect"] = int(incorrect_mask.sum()) + attempt_info["num_total"] = int(gt.size) + attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size + + # Perfect match + if incorrect_mask.sum() == 0: + attempt_passed = True + else: + attempt_passed = False + + attempt_info["perfect_match"] = attempt_passed + passed = attempt_passed or passed + + diagnostics[i] = attempt_info + + pass_at_2 = 1 if passed else 0 + + return pass_at_2, diagnostics + +def pass_at_2_accuracy_multi_test( + all_attempts: List[List[np.ndarray]], + all_gt: List[np.ndarray] +) -> Tuple[List[int], List[Dict[int, Any]]]: + """ + Compute pass@2 accuracy across multiple ARC test cases. + + Args: + all_attempts: List of lists of 2 numpy arrays for each test case. + all_gt: List of ground-truth outputs as 2D numpy arrays. + """ + assert len(all_attempts) == len(all_gt), "Mismatched number of test cases." + + all_diagnostics = [] + all_pass = [] + + for attempts, gt in zip(all_attempts, all_gt): + pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt) + all_pass.append(pass_at_2) + all_diagnostics.append(diagnostics) + + return all_pass, all_diagnostics + +def extract_failure_artifacts(diagnostics, pred=None, gt=None): + """ + Extract failure artifacts from diagnostics for a given example. + Includes actual vs expected output snippets for better LLM feedback. + """ + artifacts = {} + if not diagnostics["size_match"]: + artifacts["error_type"] = "SizeMismatch" + artifacts["error_message"] = ( + f"Output shape {diagnostics['pred_shape']} does not match " + f"expected shape {diagnostics['gt_shape']}." + ) + artifacts["suggestion"] = ( + f"Your output has shape {diagnostics['pred_shape']} but the correct output " + f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions." + ) + else: + num_incorrect = diagnostics['num_incorrect'] + num_total = diagnostics['num_total'] + accuracy = diagnostics['cell_accuracy'] + artifacts["error_type"] = "IncorrectCells" + artifacts["error_message"] = ( + f"{num_incorrect}/{num_total} cells incorrect " + f"(cell accuracy: {accuracy:.1%})." + ) + # Show a compact diff of expected vs actual for first few wrong cells + if diagnostics['incorrect_indices'] and pred is not None and gt is not None: + wrong = diagnostics['incorrect_indices'][:8] # first 8 wrong cells + diff_lines = [] + for r, c in wrong: + diff_lines.append(f" [{r},{c}]: got {int(pred[r,c])}, expected {int(gt[r,c])}") + artifacts["cell_diffs"] = "\n".join(diff_lines) + if len(diagnostics['incorrect_indices']) > 8: + artifacts["cell_diffs"] += f"\n ... and {len(diagnostics['incorrect_indices'])-8} more" + artifacts["suggestion"] = ( + f"Your solution gets {accuracy:.1%} of cells correct. " + f"Review the transformation logic for the failing cells." + ) + + return artifacts + +def evaluate(program_path): + """ + Evaluate the program on ARC task training (and optionally test) examples. 
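+
+    Whether held-out test inputs also contribute is controlled by the
+    ARC_EVAL_INCLUDE_TEST and ARC_EVAL_USE_TEST_FOR_SCORE flags read at module
+    level.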
+
+    Returns a combined_score that blends:
+    - pass@2 (binary perfect-match, weighted 0.6)
+    - cell accuracy (continuous partial credit, weighted 0.4)
+    This gives evolution gradient signal even when no example is solved perfectly.
+    """
+    spec = importlib.util.spec_from_file_location("program_module", program_path)
+    program_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(program_module)
+
+    if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+        print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+        error_artifacts = {
+            "error_type": "MissingFunction",
+            "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+            "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that each take a 2D numpy array as an argument and return a 2D numpy array."
+        }
+
+        return EvaluationResult(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+            },
+            artifacts=error_artifacts
+        )
+
+    # Load ARC tasks
+    challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json")
+
+    with open(challenge_path, 'r') as f:
+        tasks = json.load(f)
+
+    task_id = list(tasks.keys())[int(TASK_NUM)]
+    task = tasks[task_id]
+
+    train_inputs = [np.array(inp["input"]) for inp in task['train']]
+    train_gts = [np.array(gt["output"]) for gt in task['train']]
+
+    train_attempts = []
+
+    # Generate attempts for training data
+    for inp in train_inputs:
+        attempt_1 = program_module.transform_grid_attempt_1(inp)
+        if not isinstance(attempt_1, np.ndarray):
+            print("transform_grid_attempt_1 did not return a numpy array")
+
+            error_artifacts = {
+                "error_type": "InvalidReturnType",
+                "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
+                "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
+            }
+
+            return EvaluationResult(
+                metrics={
+                    "runs_successfully": 0.0,
+                    "combined_score": 0.0,
+                    "error": "transform_grid_attempt_1 did not return a numpy array"
+                },
+                artifacts=error_artifacts
+            )
+
+        attempt_2 = program_module.transform_grid_attempt_2(inp)
+        if not isinstance(attempt_2, np.ndarray):
+            print("transform_grid_attempt_2 did not return a numpy array")
+
+            error_artifacts = {
+                "error_type": "InvalidReturnType",
+                "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
+                "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
+ } + + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array" + }, + artifacts=error_artifacts + ) + train_attempts.append([attempt_1, attempt_2]) + + pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts) + + # Compute both binary pass@2 and continuous cell accuracy + train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train) + train_cell_acc = sum( + best_attempt_cell_accuracy(attempts, gt) + for attempts, gt in zip(train_attempts, train_gts) + ) / len(train_gts) + + # Blended score: pass@2 (60%) + cell accuracy (40%) gives gradient signal + train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc + + metrics = { + "runs_successfully": 1.0, + "combined_score": train_score, + "train_combined_score": train_score, + "train_pass_at_2_score": train_pass_score, + "train_cell_accuracy": round(train_cell_acc, 4), + } + error_artifacts = {} + for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)): + example_name = f"train_example_{i}" + metrics[f"{example_name}_pass_at_2"] = train_pass + best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i]) + metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4) + for attempt in train_diagnostics: + attempt_pass = train_diagnostics[attempt]["perfect_match"] + metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass + if not attempt_pass: + pred = train_attempts[i][attempt] + gt = train_gts[i] + error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts( + train_diagnostics[attempt], pred=pred, gt=gt + ) + + # Optional: include test feedback (uses solutions if available) + if INCLUDE_TEST: + solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json") + if os.path.isfile(solution_path): + with open(solution_path, 'r') as f: + solutions = json.load(f) + task_id = list(tasks.keys())[int(TASK_NUM)] + solution = solutions.get(task_id) + if solution is not None and "test" in task: + if len(task["test"]) != len(solution): + raise ValueError( + f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs " + f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json " + f"and arc-agi_{TASK_FILE}_solutions.json were generated together." + ) + test_inputs = [np.array(inp["input"]) for inp in task['test']] + test_gts = [np.array(gt) for gt in solution] + + test_attempts = [] + for inp in test_inputs: + attempt_1 = program_module.transform_grid_attempt_1(inp) + if not isinstance(attempt_1, np.ndarray): + print(f"transform_grid_attempt_1 did not return a numpy array (test)") + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_1 did not return a numpy array (test)" + }, + artifacts={ + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).", + "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array." 
+ } + ) + + attempt_2 = program_module.transform_grid_attempt_2(inp) + if not isinstance(attempt_2, np.ndarray): + print(f"transform_grid_attempt_2 did not return a numpy array (test)") + return EvaluationResult( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array (test)" + }, + artifacts={ + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).", + "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array." + } + ) + test_attempts.append([attempt_1, attempt_2]) + + pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts) + test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test) + test_cell_acc = sum( + best_attempt_cell_accuracy(attempts, gt) + for attempts, gt in zip(test_attempts, test_gts) + ) / len(test_gts) + test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc + + metrics["test_combined_score"] = test_score + metrics["test_pass_at_2_score"] = test_pass_score + metrics["test_cell_accuracy"] = round(test_cell_acc, 4) + metrics["test_included"] = 1 + + for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)): + example_name = f"test_example_{i}" + metrics[f"{example_name}_pass_at_2"] = test_pass + best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i]) + metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4) + for attempt in test_diagnostics: + metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"] + if test_pass == 0: + first_failing_idx = next( + (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]), + 0, + ) + pred = test_attempts[i][first_failing_idx] + gt = test_gts[i] + error_artifacts[f"{example_name}"] = extract_failure_artifacts( + test_diagnostics[first_failing_idx], pred=pred, gt=gt + ) + + if USE_TEST_IN_SCORE: + metrics["combined_score"] = (train_score + test_score) / 2.0 + else: + metrics["test_included"] = 0 + else: + metrics["test_included"] = 0 + + return EvaluationResult( + metrics=metrics, + artifacts=error_artifacts + ) + + +def _evaluate_as_dict(program_path): + """Adapter: calls evaluate() and converts EvaluationResult to a plain dict.""" + result = evaluate(program_path) + d = dict(result.metrics) + for k, v in result.artifacts.items(): + d[k] = v + return d + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> EvaluationResult to the + # container JSON protocol. wrapper.py is copied from + # skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(_evaluate_as_dict) diff --git a/benchmarks/arc_benchmark/evaluator/requirements.txt b/benchmarks/arc_benchmark/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..296d654528b719e554528b956c4bf5a1516e812c --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/requirements.txt @@ -0,0 +1 @@ +numpy \ No newline at end of file diff --git a/benchmarks/arc_benchmark/evaluator/wrapper.py b/benchmarks/arc_benchmark/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/arc_benchmark/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. 
This module
+bridges that interface to the container JSON protocol expected by
+ContainerizedEvaluator.
+
+Usage — add this to the bottom of your evaluator.py::
+
+    if __name__ == "__main__":
+        from wrapper import run
+        run(evaluate)
+"""
+
+import json
+import sys
+import traceback
+
+
+def run(evaluate_fn):
+    """Call *evaluate_fn* and emit the result as container-protocol JSON on stdout.
+
+    * Reads ``sys.argv[1]`` as the program path.
+    * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+      don't contaminate the JSON output.
+    * Separates numeric metrics from non-numeric artifacts.
+    * Guarantees ``combined_score`` is always present in metrics.
+    """
+    if len(sys.argv) < 2:
+        print("Usage: evaluator.py <program_path>", file=sys.stderr)
+        sys.exit(1)
+
+    program_path = sys.argv[1]
+
+    # Redirect stdout → stderr during evaluation so debug prints from
+    # the evaluator don't contaminate the JSON output on stdout.
+    real_stdout = sys.stdout
+    sys.stdout = sys.stderr
+    try:
+        result = evaluate_fn(program_path)
+    except Exception as e:
+        sys.stdout = real_stdout
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                }
+            )
+        )
+        return
+    sys.stdout = real_stdout
+
+    if not isinstance(result, dict):
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                    },
+                }
+            )
+        )
+        return
+
+    # Separate numeric metrics from non-numeric artifacts.
+    metrics = {}
+    artifacts = {}
+    for k, v in result.items():
+        if isinstance(v, bool):
+            metrics[k] = float(v)
+        elif isinstance(v, (int, float)):
+            metrics[k] = float(v)
+        elif isinstance(v, str):
+            artifacts[k] = v
+        elif isinstance(v, (list, dict)):
+            artifacts[k] = json.dumps(v)
+
+    if "combined_score" not in metrics:
+        metrics["combined_score"] = 0.0
+
+    status = "error" if "error" in artifacts else "success"
+    output = {
+        "status": status,
+        "combined_score": metrics["combined_score"],
+        "metrics": metrics,
+    }
+    if artifacts:
+        output["artifacts"] = artifacts
+
+    print(json.dumps(output))
diff --git a/benchmarks/arc_benchmark/generate_config.py b/benchmarks/arc_benchmark/generate_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba3da073fd23f9076ae081c193331b14def47d80
--- /dev/null
+++ b/benchmarks/arc_benchmark/generate_config.py
@@ -0,0 +1,101 @@
+import os
+import yaml
+import json
+
+
+def load_task_as_prompt(task_json, task_num):
+    with open(task_json, 'r') as f:
+        tasks = json.load(f)
+
+    task_id = list(tasks.keys())[int(task_num)]
+    task = tasks[task_id]
+    train_inputs = [inp["input"] for inp in task['train']]
+    train_outputs = [gt["output"] for gt in task['train']]
+
+    train_pairs = ""
+    for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)):
+        train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n"
+
+    prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles.
+Find the common pattern that transforms each input grid into its corresponding output grid.
+
+Your task is to write Python functions that implement the MOST GENERAL transformation rule.
The rule must: +- Apply consistently to ALL training examples +- Generalize to unseen inputs (critical for success) +- Be based on structural patterns, not memorized examples +- Use relative/spatial rules rather than absolute coordinates + +Generalization rules (THIS IS CRITICAL): +- Infer the transformation ONLY from the training input-output pairs +- If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one +- Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules +- Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples +- Think: "What is the underlying principle?" not "What fits these specific examples?" +- Use numpy only (no external libraries) + +Common failure modes to avoid: +- Overfitting to specific grid sizes or positions in training examples +- Hardcoding colors, coordinates, or counts from training data +- Assuming global properties (like separator colors) without verifying across ALL examples +- Using absolute positions when relative/structural rules would generalize better + +Solution approach: +- Analyze the training examples to identify the CORE transformation principle +- Prefer block-wise, object-wise, or pattern-based rules that work locally +- If the grid has distinct regions, solve each region independently +- Build flexible rules that adapt to different input sizes and structures + +Training examples: +{train_pairs} + +Your task: Write 2 different Python functions that implement the general transformation rule. +- Each function takes a 2D numpy array as input and returns the transformed 2D numpy array +- The two attempts should use genuinely different strategies (e.g., different algorithmic approaches) +- Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases + +CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples. + +Remember to only output the modified python functions as your solution.""" + + return prompt + +def generate_config(task_num, task_file, dataset_root=None, base_config=None): + if dataset_root is None: + dataset_root = os.getenv("DATA_ROOT") + if not dataset_root: + dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") + task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json") + prompt = load_task_as_prompt(task_json, task_num) + + if base_config is None: + default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml") + base_config = os.getenv("BASE_CONFIG", default_base) + with open(base_config, 'r') as file: + config = yaml.safe_load(file) + + config['prompt']['system_message'] = prompt + # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config) + api_key_env = os.getenv("OPENAI_API_KEY") + if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key": + config["llm"]["api_key"] = api_key_env.strip() + # Override max_iterations from env if set (e.g. 
by run_discovery.sh) + max_iter_env = os.getenv("MAX_ITERATIONS") + if max_iter_env is not None and str(max_iter_env).strip() != "": + try: + config["max_iterations"] = int(max_iter_env) + except ValueError: + pass + + # Write to a per-task config file so parallel runs don't conflict + out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml") + with open(out_path, 'w') as file: + yaml.dump(config, file) + return out_path + +if __name__ == "__main__": + TASK_FILE = os.getenv("ARC_TASK_FILE", "training") + TASK_NUM = os.getenv("TASK_NUM", 0) + + path = generate_config(TASK_NUM, TASK_FILE) + print(path) + diff --git a/benchmarks/arc_benchmark/initial_program.py b/benchmarks/arc_benchmark/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..39f75f7c28a5a17a351ca335cd7ad5cf61c4136d --- /dev/null +++ b/benchmarks/arc_benchmark/initial_program.py @@ -0,0 +1,42 @@ +# EVOLVE-BLOCK-START + +import numpy as np + +def transform_grid_attempt_1(grid): + """ + Example transformation: + - Validate input (2D, integer values 0-9). + - Rotate the grid 90 degrees clockwise. + - Increment every cell by 1 modulo 10 (keeps values 0-9). + Returns a new numpy int array. + """ + arr = _validate_grid(grid) + out = np.rot90(arr, k=-1) # 90 degrees clockwise + out = (out + 1) % 10 + return out.astype(np.int32) + +def transform_grid_attempt_2(grid): + """ + Example transformation: + - Validate input (2D, integer values 0-9). + - Upsample each cell to a 2x2 block (doubling both dimensions). + - Invert colors by mapping v -> 9 - v (keeps values 0-9). + Returns a new numpy int array. + """ + arr = _validate_grid(grid) + out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1) + out = 9 - out + return out.astype(np.int32) + +# EVOLVE-BLOCK-END + +def _validate_grid(grid): + arr = np.asarray(grid) + if arr.ndim != 2: + raise ValueError("Input must be a 2D array.") + # cast to integer type for value checks + if not np.issubdtype(arr.dtype, np.integer): + arr = arr.astype(int) + if arr.size and (arr.min() < 0 or arr.max() > 9): + raise ValueError("Array values must be integers in the range 0-9.") + return arr \ No newline at end of file diff --git a/benchmarks/arc_benchmark/post_discovery_eval.py b/benchmarks/arc_benchmark/post_discovery_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..58e707580ead95624cf6321d481383fe9e8e9c83 --- /dev/null +++ b/benchmarks/arc_benchmark/post_discovery_eval.py @@ -0,0 +1,157 @@ +import importlib.util +import os +import json +import numpy as np +from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts + +TASK_FILE = os.getenv("ARC_TASK_FILE", "training") +TASK_NUM = os.getenv("TASK_NUM", 0) +OUTS_DIR = os.getenv("OUTS_DIR", "") +# Optional: path to a checkpoint dir (e.g. 
outputs/evaluation_task_0/checkpoints/checkpoint_10) to evaluate that best_program.py on the test set
+PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")
+
+
+def _program_path():
+    """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
+    if PROGRAM_DIR:
+        return os.path.join(PROGRAM_DIR, "best_program.py")
+    return os.path.join(OUTS_DIR, "best", "best_program.py")
+
+
+def _result_path():
+    """Where to write post_evolution_evaluation_result.json."""
+    if PROGRAM_DIR:
+        return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
+    return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")
+
+
+def load_program_module():
+    """Dynamically load the best_program.py module from the specified directory."""
+    path = _program_path()
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
+    spec = importlib.util.spec_from_file_location("program_module", path)
+    program_module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(program_module)
+
+    return program_module
+
+def evaluate():
+    """Evaluate the program module located in the specified directory."""
+    program_module = load_program_module()
+    if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+        print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+        error_artifacts = {
+            "error_type": "MissingFunction",
+            "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+            "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take a 2D numpy array as an argument and return a 2D numpy array."
+        }
+
+        return dict(
+            metrics={
+                "runs_successfully": 0.0,
+                "combined_score": 0.0,
+                "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+            },
+            artifacts=error_artifacts
+        )
+    # Load ARC tasks
+    data_root = os.getenv("DATA_ROOT")
+    if not data_root:
+        data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+    challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
+    solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")
+
+    with open(challenge_path, 'r') as f:
+        tasks = json.load(f)
+    with open(solution_path, 'r') as f:
+        solutions = json.load(f)
+
+    task_id = list(tasks.keys())[int(TASK_NUM)]
+    solution = solutions[task_id]
+    task = tasks[task_id]
+
+    # Sanity check: test inputs and solutions must align (same task, same order)
+    if len(task["test"]) != len(solution):
+        raise ValueError(
+            f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
+            f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
+            f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
+ ) + + test_inputs = [np.array(inp["input"]) for inp in task['test']] + test_gts = [np.array(gt) for gt in solution] + + test_attempts = [] + for inp in test_inputs: + attempt_1 = program_module.transform_grid_attempt_1(inp) + if not isinstance(attempt_1, np.ndarray): + print(f"transform_grid_attempt_1 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array." + } + + return dict( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_1 did not return a numpy array" + }, + artifacts=error_artifacts + ) + + attempt_2 = program_module.transform_grid_attempt_2(inp) + if not isinstance(attempt_2, np.ndarray): + print(f"transform_grid_attempt_2 did not return a numpy array") + + error_artifacts = { + "error_type": "InvalidReturnType", + "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.", + "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array." + } + + return dict( + metrics={ + "runs_successfully": 0.0, + "combined_score": 0.0, + "error": "transform_grid_attempt_2 did not return a numpy array" + }, + artifacts=error_artifacts + ) + test_attempts.append([attempt_1, attempt_2]) + + pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts) + metrics = { + "runs_successfully": 1.0, + "combined_score": sum(pass_at_2_test) / len(pass_at_2_test), + } + error_artifacts = {} + for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)): + example_name = f"test_example_{i}" + metrics[f"{example_name}_pass_at_2"] = test_pass + for attempt in test_diagnostics: + metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"] + if test_pass == 0: + # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict + first_failing = next( + (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]), + test_diagnostics[0], + ) + error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing) + + return dict( + metrics=metrics, + artifacts=error_artifacts + ) + +if __name__ == "__main__": + evaluation_result = evaluate() + result_path = _result_path() + os.makedirs(os.path.dirname(result_path), exist_ok=True) + with open(result_path, 'w') as f: + json.dump(evaluation_result, f, indent=4) + print(f"Test-set evaluation written to {result_path}") \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/README.md b/benchmarks/frontier-cs-eval/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d7c97f8213ef9207738dd073295e909e02e0308e --- /dev/null +++ b/benchmarks/frontier-cs-eval/README.md @@ -0,0 +1,72 @@ +# Frontier-CS Benchmark + +Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover. + +## Setup + +```bash +# 1. Clone Frontier-CS +cd benchmarks/frontier-cs-eval +git clone https://github.com/FrontierCS/Frontier-CS.git + +# 2. Start the judge server (requires Docker) +cd Frontier-CS/algorithmic +docker compose up -d + +# 3. Install dependencies (from project root) +cd ../../.. +uv sync --extra frontier-cs + +# 4. Set your API key +export OPENAI_API_KEY=... 
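+
+# Optional: point the evaluator at one or more judge servers, comma-separated.
+# Defaults to http://localhost:8081 (see the environment variables table below).
+export JUDGE_URLS=http://localhost:8081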
+``` + +## Run + +Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve` + + +Single problem: +```bash +cd benchmarks/frontier-cs-eval +FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \ + -c config.yaml -s [search_algorithm] -i 50 +``` + +All problems in parallel: +```bash +uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6 +``` + +## Evaluate best programs (post-discovery) + +```bash +uv run python run_best_programs_frontiercs.py +``` + +## Analyze results + +```bash +uv run python combine_results.py # merge training/testing scores into CSV +uv run python analyze_results.py # generate plots and statistics +``` + +## Files + +| File | Description | +|------|-------------| +| `initial_program.cpp` | Seed C++ program | +| `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge | +| `config.yaml` | Config with system prompt template | +| `run_all_frontiercs.py` | Parallelizes evolution across all problems | +| `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution | +| `combine_results.py` | Combines training/testing scores into CSV | +| `analyze_results.py` | Generates score analysis plots and statistics | + +## Environment variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENAI_API_KEY` | (required) | API key | +| `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve | +| `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs | diff --git a/benchmarks/frontier-cs-eval/analyze_results.py b/benchmarks/frontier-cs-eval/analyze_results.py new file mode 100644 index 0000000000000000000000000000000000000000..c8359c99df099bf201b8deaf8d05c8e028e14901 --- /dev/null +++ b/benchmarks/frontier-cs-eval/analyze_results.py @@ -0,0 +1,105 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from pathlib import Path + +# Define paths +_script_dir = str(Path(__file__).resolve().parent) +input_csv = str(Path(_script_dir) / "combined_results.csv") +output_dir = _script_dir + +# Read the CSV file +df = pd.read_csv(input_csv) + +# Calculate average of training and testing scores +df['average_score'] = (df['training_score'] + df['testing_score']) / 2 + +# Remove rows where either score is None (NaN) +df_complete = df.dropna(subset=['training_score', 'testing_score']) + +print(f"\n=== Analysis Results ===") +print(f"Total problems: {len(df)}") +print(f"Problems with complete data: {len(df_complete)}") +print(f"\nTraining Scores:") +print(f" Mean: {df_complete['training_score'].mean():.4f}") +print(f" Median: {df_complete['training_score'].median():.4f}") +print(f" Std Dev: {df_complete['training_score'].std():.4f}") +print(f" Min: {df_complete['training_score'].min():.4f}") +print(f" Max: {df_complete['training_score'].max():.4f}") + +print(f"\nTesting Scores:") +print(f" Mean: {df_complete['testing_score'].mean():.4f}") +print(f" Median: {df_complete['testing_score'].median():.4f}") +print(f" Std Dev: {df_complete['testing_score'].std():.4f}") +print(f" Min: {df_complete['testing_score'].min():.4f}") +print(f" Max: {df_complete['testing_score'].max():.4f}") + +print(f"\nAverage Scores:") +print(f" Mean: {df_complete['average_score'].mean():.4f}") +print(f" Median: {df_complete['average_score'].median():.4f}") +print(f" Std Dev: {df_complete['average_score'].std():.4f}") + +# Save the updated CSV with averages +output_csv = Path(output_dir) / "combined_results_with_averages.csv" 
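+# Note: the full df (including rows with a missing score) is written out here;
+# the statistics and plots are computed on df_complete, which drops those rows.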
+df.to_csv(output_csv, index=False) +print(f"\nUpdated CSV with averages saved to {output_csv}") + +# Create visualizations +fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + +# 1. Scatter plot: Training vs Testing scores +ax = axes[0, 0] +ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50) +# Add diagonal line for reference (where training == testing) +lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()), + max(df_complete['training_score'].max(), df_complete['testing_score'].max())] +ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing') +ax.set_xlabel('Training Score') +ax.set_ylabel('Testing Score') +ax.set_title('Training vs Testing Scores') +ax.legend() +ax.grid(True, alpha=0.3) + +# 2. Distribution comparison - histograms +ax = axes[0, 1] +ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black') +ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black') +ax.set_xlabel('Score') +ax.set_ylabel('Frequency') +ax.set_title('Distribution of Training vs Testing Scores') +ax.legend() +ax.grid(True, alpha=0.3, axis='y') + +# 3. Box plot comparison +ax = axes[1, 0] +box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']] +bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average']) +ax.set_ylabel('Score') +ax.set_title('Score Comparison (Box Plot)') +ax.grid(True, alpha=0.3, axis='y') + +# 4. Difference plot: Training - Testing +ax = axes[1, 1] +difference = df_complete['training_score'] - df_complete['testing_score'] +ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50) +ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference') +ax.set_xlabel('Problem ID') +ax.set_ylabel('Training Score - Testing Score') +ax.set_title('Score Difference (Training - Testing)') +ax.legend() +ax.grid(True, alpha=0.3) + +plt.tight_layout() +plot_path = Path(output_dir) / "results_analysis.png" +plt.savefig(plot_path, dpi=300, bbox_inches='tight') +print(f"Plot saved to {plot_path}") + +# Additional statistics about differences +print(f"\nScore Differences (Training - Testing):") +print(f" Mean Difference: {difference.mean():.4f}") +print(f" Median Difference: {difference.median():.4f}") +print(f" Std Dev: {difference.std():.4f}") +print(f" Problems where training > testing: {(difference > 0).sum()}") +print(f" Problems where testing > training: {(difference < 0).sum()}") + +plt.show() diff --git a/benchmarks/frontier-cs-eval/combine_results.py b/benchmarks/frontier-cs-eval/combine_results.py new file mode 100644 index 0000000000000000000000000000000000000000..8df35c7131cac075c4f9a29e45fc1c4440511d03 --- /dev/null +++ b/benchmarks/frontier-cs-eval/combine_results.py @@ -0,0 +1,66 @@ +import json +import csv +import os +from pathlib import Path + +# Define paths +_script_dir = Path(__file__).resolve().parent +_repo_root = _script_dir.parent.parent +training_dir = str(_repo_root / "outputs" / "frontier_cs") +testing_dir = str(_script_dir / "evaluation_results") +output_csv = str(_script_dir / "combined_results.csv") + +# Collect all problems +results = [] + +# Get all problem directories from training data +training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")]) + +print(f"Found {len(training_problems)} training problems") + +for problem_dir in training_problems: + problem_id = problem_dir.replace("problem_", "") + + # Get 
training score from best_program_info.json + training_score = None + training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json") + + if os.path.exists(training_info_path): + try: + with open(training_info_path, 'r') as f: + training_data = json.load(f) + training_score = training_data.get("metrics", {}).get("combined_score") + except Exception as e: + print(f"Error reading training data for problem {problem_id}: {e}") + + # Get testing score from evaluation_results json + testing_score = None + testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json") + + if os.path.exists(testing_json_path): + try: + with open(testing_json_path, 'r') as f: + testing_data = json.load(f) + testing_score = testing_data.get("combined_score") + except Exception as e: + print(f"Error reading testing data for problem {problem_id}: {e}") + + results.append({ + "problem_id": problem_id, + "training_score": training_score, + "testing_score": testing_score + }) + +# Write to CSV +with open(output_csv, 'w', newline='') as csvfile: + fieldnames = ["problem_id", "training_score", "testing_score"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + + writer.writeheader() + writer.writerows(results) + +print(f"\nResults written to {output_csv}") +print(f"Total problems: {len(results)}") +print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}") +print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}") +print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}") diff --git a/benchmarks/frontier-cs-eval/config.yaml b/benchmarks/frontier-cs-eval/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46159f8107e7acd6833300c1150dc7e90f122625 --- /dev/null +++ b/benchmarks/frontier-cs-eval/config.yaml @@ -0,0 +1,57 @@ +# Frontier-CS Benchmark +# Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s -i 50 + +max_iterations: 100 +checkpoint_interval: 10 +log_level: INFO + +llm: + models: + - name: "gpt-5" + weight: 1.0 + api_base: https://api.openai.com/v1 + temperature: 0.7 + # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p + max_tokens: 32000 + timeout: 600 + # To use Gemini: override with --model gemini-3-pro-preview + +prompt: + system_message: | + You are an expert competitive programmer specializing in algorithmic optimization. + + PROBLEM STATEMENT: + {problem_statement} + + CONSTRAINTS: + {problem_constraints} + + OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better). + Your solution must be valid C++ code that compiles and runs correctly. + + KEY STRATEGIES: + - Analyze the problem structure carefully before coding + - Consider time and space complexity constraints + - Use efficient data structures (vectors, maps, sets, priority queues) + - Implement clean, well-structured code + - Handle edge cases properly + - Optimize hot loops and critical sections + + COMMON TECHNIQUES: + - Dynamic programming for optimization problems + - Greedy algorithms with proper ordering + - Graph algorithms (BFS, DFS, shortest paths) + - Binary search for monotonic functions + - Divide and conquer approaches + - Heuristic search (simulated annealing, genetic algorithms, local search) + + OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout. 
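+# NOTE: {problem_statement} and {problem_constraints} in the system message above
+# are placeholders in the prompt template (per the README); presumably they are
+# filled in per problem at run time.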
+ +evaluator: + timeout: 300 + max_retries: 3 + cascade_evaluation: false + +diff_based_generation: true +max_solution_length: 50000 +random_seed: 42 diff --git a/benchmarks/frontier-cs-eval/evaluator.py b/benchmarks/frontier-cs-eval/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..af6eea0f0f4fda9b4d6ff63a4fa496c998bff3a9 --- /dev/null +++ b/benchmarks/frontier-cs-eval/evaluator.py @@ -0,0 +1,174 @@ +""" +Evaluator for Frontier-CS algorithmic problems. + +This evaluator integrates with SkyDiscover to evaluate generated C++ solutions +against Frontier-CS benchmark problems using the local judge server. +""" + +import traceback +from pathlib import Path +import logging +import sys +import os +import random + +logger = logging.getLogger(__name__) + +# Support multiple judge servers for load balancing +DEFAULT_JUDGE_URL = "http://localhost:8081" +JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",") +JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()] + +def get_judge_url() -> str: + """Get a judge URL using random selection for load balancing.""" + return random.choice(JUDGE_URLS) + +# Add Frontier-CS to path +frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +try: + from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator + from frontier_cs.runner.base import EvaluationStatus +except ImportError as e: + logger.error(f"Failed to import Frontier-CS: {e}") + logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS") + raise + +def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict: + """ + Evaluate a C++ solution for a Frontier-CS algorithmic problem. + + Args: + program_path: Path to the C++ solution file + problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.) 
+ If None, will be read from FRONTIER_CS_PROBLEM env var or config + + Returns: + dict with evaluation results: + - combined_score: The score from the judge (higher is better) + - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise + - status: Evaluation status string + - message: Any error or status messages + - problem_id: The problem ID + - program_path: Path to the evaluated program + - score_unbounded: Unbounded score if available + - metadata: Additional evaluation metadata + """ + # Get problem_id from parameter, environment, or kwargs + if problem_id is None: + import os + problem_id = os.environ.get('FRONTIER_CS_PROBLEM') + if problem_id is None: + problem_id = kwargs.get('frontier_cs_problem', '0') + + logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}") + + try: + # Initialize evaluator with judge server (load balanced if multiple configured) + judge_url = get_judge_url() + logger.info(f"Using judge server: {judge_url}") + evaluator = FrontierCSEvaluator( + backend="docker", + judge_url=judge_url, + register_cleanup=False, + ) + + # Read the solution code + solution_path = Path(program_path) + if not solution_path.exists(): + error_msg = f"Solution file not found: {program_path}" + logger.error(error_msg) + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": error_msg, + "problem_id": problem_id, + "program_path": program_path, + } + + # Extract code and remove any EVOLVE-BLOCK markers + code = solution_path.read_text().replace( + "// EVOLVE-BLOCK-START", "" + ).replace( + "// EVOLVE-BLOCK-END", "" + ).strip() + + logger.info(f"Code extracted from {program_path}") + + # Evaluate the solution + result = evaluator.evaluate( + track="algorithmic", + problem_id=problem_id, + code=code, + backend="docker", + ) + + logger.info(f"Evaluation completed with status: {result.status}") + + # Process result + if result.status == EvaluationStatus.SUCCESS: + print(result) + score = result.score + # Use unbounded score for optimization (allows >100 if beating reference) + score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score + print(f"score={score}, score_unbounded={score_unbounded}") + + # Extract only essential metadata (exclude large test case outputs) + essential_metadata = {} + if result.metadata: + essential_metadata = { + "status": result.metadata.get("status"), + "passed": result.metadata.get("passed"), + "result": result.metadata.get("result"), + "score": result.metadata.get("score"), + "scoreUnbounded": result.metadata.get("scoreUnbounded"), + } + + return { + "combined_score": float(score), # Ensure it's a float + "score_unbounded": score_unbounded, + "runs_successfully": 1.0, + "status": "success", + "message": result.message or "Evaluation successful", + "problem_id": problem_id, + "program_path": program_path, + "duration_seconds": result.duration_seconds, + "metadata": essential_metadata, + } + elif result.status == EvaluationStatus.TIMEOUT: + logger.warning(f"Evaluation timed out: {result.message}") + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "timeout", + "message": result.message or "Evaluation timed out", + "problem_id": problem_id, + "program_path": program_path, + } + else: # ERROR status + logger.error(f"Evaluation error: {result.message}") + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": result.message or "Evaluation failed", + "problem_id": problem_id, + 
"program_path": program_path, + "logs": result.logs, + } + + except Exception as e: + logger.error(f"Evaluation failed completely: {str(e)}") + logger.error(traceback.format_exc()) + return { + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": str(e), + "problem_id": problem_id, + "program_path": program_path, + "error": str(e), + } diff --git a/benchmarks/frontier-cs-eval/initial_program.cpp b/benchmarks/frontier-cs-eval/initial_program.cpp new file mode 100644 index 0000000000000000000000000000000000000000..20e5839ba653567805fc89560d23bf3ddc243d5c --- /dev/null +++ b/benchmarks/frontier-cs-eval/initial_program.cpp @@ -0,0 +1,6 @@ +#include +using namespace std; +int main(){ + std::cout << "Hello, World!" << std::endl; + return 0; +} \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/run_all_frontiercs.py b/benchmarks/frontier-cs-eval/run_all_frontiercs.py new file mode 100644 index 0000000000000000000000000000000000000000..5b805fd8e21b6d63238688cf5e5089342cba1a4c --- /dev/null +++ b/benchmarks/frontier-cs-eval/run_all_frontiercs.py @@ -0,0 +1,70 @@ +import argparse +import os +import sys +import subprocess +from pathlib import Path +from concurrent.futures import ProcessPoolExecutor + +from dotenv import load_dotenv +load_dotenv() + +SCRIPT_DIR = Path(__file__).resolve().parent + +frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner + + +def run_single_problem(args): + p_id, search, iterations, env = args + print(f"\n[START] Problem ID: {p_id}") + command = [ + "uv", "run", "skydiscover-run", + "initial_program.cpp", "evaluator.py", + "-c", "config.yaml", + "-s", search, + "-i", str(iterations), + "-o", f"outputs/frontier_cs/problem_{p_id}", + ] + env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)} + try: + subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR)) + return f"✅ Problem {p_id} completed." 
+ except subprocess.CalledProcessError as e: + return f"❌ Problem {p_id} failed: {e}" + + +def main(): + parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems") + parser.add_argument("--search", "-s", default="adaevolve", + help="Search algorithm (default: adaevolve)") + parser.add_argument("--iterations", "-i", type=int, default=50, + help="Iterations per problem (default: 50)") + parser.add_argument("--workers", "-w", type=int, default=6, + help="Parallel workers (default: 6)") + args = parser.parse_args() + + runner = AlgorithmicLocalRunner() + problems_data = runner.list_problems() + problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int) + + print(f"Running {len(problem_ids)} problems with {args.workers} workers " + f"(search={args.search}, iterations={args.iterations})...") + + env = os.environ.copy() + task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids] + + with ProcessPoolExecutor(max_workers=args.workers) as executor: + results = list(executor.map(run_single_problem, task_args)) + + print("\n" + "=" * 30) + print("ALL RUNS COMPLETE") + print("=" * 30) + for result in results: + print(result) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py b/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py new file mode 100644 index 0000000000000000000000000000000000000000..9be033548a0aebf71f080d882014bfe76d5e68ef --- /dev/null +++ b/benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py @@ -0,0 +1,404 @@ +import os +import sys +import json +import logging +import threading +from pathlib import Path +from typing import Dict, List, Tuple +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Add Frontier-CS to path +frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src" +if str(frontier_cs_path) not in sys.path: + sys.path.insert(0, str(frontier_cs_path)) + +try: + from frontier_cs.evaluator import FrontierCSEvaluator + from frontier_cs.runner.base import EvaluationStatus +except ImportError as e: + logger.error(f"Failed to import Frontier-CS: {e}") + logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS") + sys.exit(1) + + +class BestProgramEvaluator: + """Evaluates all best_program.cpp files in the outputs directory.""" + + def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8): + """ + Initialize the evaluator. 
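+
+        Evaluator instances are created lazily, one per thread, via
+        _get_evaluator(), so a single instance is never shared across threads.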
+ + Args: + outputs_dir: Path to the outputs directory containing problem folders + judge_url: URL of the judge server + num_workers: Number of parallel workers for evaluation + """ + self.outputs_dir = Path(outputs_dir) + self.judge_url = judge_url + self.num_workers = num_workers + + # Use thread-local storage for evaluator instances (avoid race condition) + self._evaluator_local = threading.local() + + self.results = [] + + # Create results directory in the script's directory + self.results_dir = Path(__file__).resolve().parent / "evaluation_results" + self.results_dir.mkdir(exist_ok=True) + logger.info(f"Results will be saved to {self.results_dir}") + logger.info(f"Using {self.num_workers} parallel workers with thread-local evaluators") + + def _get_evaluator(self) -> 'FrontierCSEvaluator': + """ + Get the evaluator for the current thread. + Creates a new instance if this thread hasn't created one yet. + This avoids race conditions from sharing a single evaluator across threads. + """ + if not hasattr(self._evaluator_local, 'evaluator'): + self._evaluator_local.evaluator = FrontierCSEvaluator( + backend="docker", + judge_url=self.judge_url, + ) + logger.debug(f"Created new evaluator for thread {threading.current_thread().name}") + return self._evaluator_local.evaluator + + def find_best_programs(self) -> Dict[str, Path]: + """ + Find all best_program.cpp files in the outputs directory. + + Returns: + Dict mapping problem_id to best_program.cpp path + """ + best_programs = {} + + # Look for frontier_cs subdirectory + frontier_cs_dir = self.outputs_dir / "frontier_cs" + if not frontier_cs_dir.exists(): + logger.error(f"frontier_cs directory not found at {frontier_cs_dir}") + return best_programs + + # Iterate through problem directories + for problem_dir in sorted(frontier_cs_dir.iterdir()): + if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"): + continue + + # Extract problem ID + problem_id = problem_dir.name.replace("problem_", "") + + # Look for best_program.cpp + best_program_path = problem_dir / "best" / "best_program.cpp" + if best_program_path.exists(): + best_programs[problem_id] = best_program_path + logger.info(f"Found best_program.cpp for problem {problem_id}") + else: + logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}") + + return best_programs + + def evaluate_program(self, problem_id: str, program_path: Path) -> Dict: + """ + Evaluate a single best_program.cpp file. 
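+
+        EVOLVE-BLOCK marker comments are stripped from the source before it
+        is submitted to the judge.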
+ + Args: + problem_id: The Frontier-CS problem ID + program_path: Path to the best_program.cpp file + + Returns: + Dictionary with evaluation results + """ + logger.info(f"Evaluating problem {problem_id}: {program_path}") + + try: + # Read the solution code + if not program_path.exists(): + error_msg = f"Solution file not found: {program_path}" + logger.error(error_msg) + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "error", + "message": error_msg, + } + + # Read the code + code = program_path.read_text().replace( + "// EVOLVE-BLOCK-START", "" + ).replace( + "// EVOLVE-BLOCK-END", "" + ).strip() + + logger.info(f"Code extracted from {program_path}, length: {len(code)} characters") + + # Evaluate the solution (use thread-local evaluator) + evaluator = self._get_evaluator() + result = evaluator.evaluate( + track="algorithmic", + problem_id=problem_id, + code=code, + backend="docker", + ) + + logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}") + + # Log the result object and its properties + logger.info(f"Judger output for problem {problem_id}:") + logger.info(f" Status: {result.status}") + logger.info(f" Message: {result.message}") + if hasattr(result, 'score'): + logger.info(f" Score: {result.score}") + if hasattr(result, 'duration_seconds'): + logger.info(f" Duration: {result.duration_seconds}s") + if hasattr(result, 'metadata'): + logger.info(f" Metadata: {result.metadata}") + logger.info(f" Full result object: {result}") + + # Process result + if result.status == EvaluationStatus.SUCCESS: + score = result.score + logger.info(f"Problem {problem_id}: Score = {score}") + + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": float(score), + "runs_successfully": 1.0, + "status": "success", + "message": result.message or "Evaluation successful", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + "metadata": result.metadata if hasattr(result, 'metadata') else None, + } + elif result.status == EvaluationStatus.TIMEOUT: + logger.warning(f"Problem {problem_id}: Evaluation timed out") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "timeout", + "message": f"Evaluation timed out: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + elif result.status == EvaluationStatus.COMPILATION_ERROR: + logger.warning(f"Problem {problem_id}: Compilation error") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "compilation_error", + "message": f"Compilation error: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + else: + logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}") + return { + "problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": str(result.status), + "message": f"Evaluation failed: {result.message}", + "duration_seconds": result.duration_seconds, + "judger_output": str(result), + } + + except Exception as e: + logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}") + logger.error(f"Exception traceback: {type(e).__name__}") + import traceback + logger.error(traceback.format_exc()) + + return { + 
"problem_id": problem_id, + "program_path": str(program_path), + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "exception", + "message": str(e), + } + + def run_all_evaluations(self) -> List[Dict]: + """ + Run evaluations for all best_program.cpp files sequentially (one at a time). + + Returns: + List of evaluation results + """ + logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}") + + best_programs = self.find_best_programs() + logger.info(f"Found {len(best_programs)} best_program.cpp files") + + if not best_programs: + logger.warning("No best_program.cpp files found!") + return [] + + # Sort problems by ID for consistent ordering + sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0])) + + # Evaluate each program sequentially (no parallelization) + results = [] + total = len(sorted_problems) + for idx, (problem_id, program_path) in enumerate(sorted_problems, 1): + logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})") + try: + result = self.evaluate_program(problem_id, program_path) + + # CRITICAL: Ensure problem_id matches + if result.get("problem_id") != problem_id: + logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}") + result["problem_id"] = problem_id # Force correct problem_id + + results.append(result) + self.results.append(result) + + logger.info(f"[SAVE] Saving problem {problem_id} result to file") + # Save result immediately after evaluation + self.save_problem_result(result) + + except Exception as e: + logger.error(f"Exception evaluating problem {problem_id}: {str(e)}") + import traceback + logger.error(traceback.format_exc()) + + error_result = { + "problem_id": problem_id, + "combined_score": 0.0, + "runs_successfully": 0.0, + "status": "exception", + "message": str(e), + } + results.append(error_result) + self.results.append(error_result) + self.save_problem_result(error_result) + + return results + + def save_results(self, output_file: str = "evaluation_results.json"): + """ + Save evaluation results to a JSON file. + + Args: + output_file: Path to save the results + """ + output_path = Path(output_file) + with open(output_path, 'w') as f: + json.dump(self.results, f, indent=2) + logger.info(f"Results saved to {output_path}") + + def save_problem_result(self, result: Dict): + """ + Save individual problem result to a separate file. 
+ + Args: + result: The evaluation result for a single problem + """ + problem_id = result.get("problem_id", "unknown") + result_file = self.results_dir / f"problem_{problem_id}.json" + + with open(result_file, 'w') as f: + json.dump(result, f, indent=2) + logger.info(f"Problem {problem_id} result saved to {result_file}") + + def print_summary(self): + """Print a summary of the evaluation results.""" + if not self.results: + logger.info("No results to summarize") + return + + logger.info("\n" + "="*80) + logger.info("EVALUATION SUMMARY") + logger.info("="*80) + + successful = [r for r in self.results if r.get("status") == "success"] + timeout = [r for r in self.results if r.get("status") == "timeout"] + compilation_error = [r for r in self.results if r.get("status") == "compilation_error"] + other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]] + + logger.info(f"Total problems evaluated: {len(self.results)}") + logger.info(f"Successful: {len(successful)}") + logger.info(f"Timeouts: {len(timeout)}") + logger.info(f"Compilation errors: {len(compilation_error)}") + logger.info(f"Other errors: {len(other_error)}") + + if successful: + scores = [r["combined_score"] for r in successful] + logger.info(f"\nSuccessful evaluation scores:") + logger.info(f" Average score: {sum(scores) / len(scores):.2f}") + logger.info(f" Min score: {min(scores):.2f}") + logger.info(f" Max score: {max(scores):.2f}") + + logger.info(f"\nTop 5 problems by score:") + top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5] + for i, result in enumerate(top_5, 1): + logger.info(f" {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}") + + logger.info("="*80 + "\n") + + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser( + description="Evaluate all best_program.cpp files in the outputs directory" + ) + + # Default outputs directory is two levels up from this script + default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs" + + parser.add_argument( + "--outputs-dir", + type=str, + default=str(default_outputs_dir), + help="Path to the outputs directory (default: ../../outputs from script location)" + ) + parser.add_argument( + "--judge-url", + type=str, + default="http://localhost:8081", + help="URL of the judge server (default: http://localhost:8081)" + ) + parser.add_argument( + "--output-file", + type=str, + default="evaluation_results.json", + help="Path to save the evaluation results (default: evaluation_results.json)" + ) + parser.add_argument( + "--workers", + type=int, + default=8, + help="Number of parallel workers for evaluation (default: 8)" + ) + + args = parser.parse_args() + + # Run evaluations + evaluator = BestProgramEvaluator( + outputs_dir=args.outputs_dir, + judge_url=args.judge_url, + num_workers=args.workers + ) + + results = evaluator.run_all_evaluations() + evaluator.save_results(args.output_file) + evaluator.print_summary() + + logger.info(f"Evaluation complete. 
Results saved to {args.output_file}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/gpu_mode/mla_decode/config.yaml b/benchmarks/gpu_mode/mla_decode/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..87418e34edf8eef667a6b85b7a499c8a7e239a84 --- /dev/null +++ b/benchmarks/gpu_mode/mla_decode/config.yaml @@ -0,0 +1,355 @@ +# GPU Mode: MLA Decode (Multi-Head Latent Attention) Triton Kernel + +max_iterations: 100 +checkpoint_interval: 1 +log_level: "INFO" + +llm: + models: + - name: "gpt-5" + weight: 1.0 + api_base: https://api.openai.com/v1 + temperature: 1.0 + # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p + max_tokens: 32000 + timeout: 600 + +prompt: + system_message: | + You are an expert Triton engineer tasked with translating PyTorch code into highly optimized Triton kernel code. + + Below is a pytorch implementation of the multi-head latent attention (MLA) module. You will want to implement a Triton kernel for the operations in the forward call: + + ```python + import math + from dataclasses import dataclass + import torch + from torch import nn + import torch.nn.functional as F + + class RoPE(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.d_model = d_model + theta = 10000 ** (-torch.arange(0, d_model//2,dtype=torch.bfloat16) / (d_model//2)) + self.register_buffer("theta", theta) + + def rotate_half(self, x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor: + seq_len = x.size(-2) + d_model = x.size(-1) + assert d_model == self.d_model + seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device) + idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta) + idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1) + cos = idx_theta2.cos().to(torch.bfloat16) + sin = idx_theta2.sin().to(torch.bfloat16) + return x * cos + self.rotate_half(x) * sin + + class KVCache(nn.Module): + def __init__(self, kv_cache_shape: tuple) -> None: + super().__init__() + self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16, device='cuda')) + self.seq_len = 0 + self.zero() + + def zero(self) -> None: + self.data.zero_() + + def get_data(self) -> torch.Tensor: + return self.data + + def forward(self, c_kv: torch.Tensor) -> torch.Tensor: + assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded" + + self.data = self.data.to(c_kv.dtype) + self.data[ + :, self.seq_len : self.seq_len + c_kv.size(1), : + ] = c_kv + self.seq_len += c_kv.size(1) + + return self.data[:, :self.seq_len], self.seq_len + + @dataclass + class Config: + batch_size: int + dim: int + n_heads: int + q_lora_rank: int + kv_lora_rank: int + qk_nope_head_dim: int + qk_rope_head_dim: int + v_head_dim: int + seq_len: int + max_seq_len: int + kv_cache_shape: tuple + Q_proj_down_weight: torch.Tensor + Q_proj_up_weight: torch.Tensor + KV_proj_down_weight: torch.Tensor + KV_proj_up_weight: torch.Tensor + wo_weight: torch.Tensor + + class MLA(nn.Module): + def __init__(self, config: Config): + super().__init__() + self.dim = config.dim + self.n_heads = config.n_heads + self.q_lora_rank = config.q_lora_rank + self.kv_lora_rank = config.kv_lora_rank + self.nope_head_dim = config.qk_nope_head_dim + self.rope_head_dim = config.qk_rope_head_dim + self.v_head_dim = config.v_head_dim + # Down-projection matrices + self.Q_proj_down = nn.Linear(self.dim, 
self.q_lora_rank, bias=False, dtype=torch.bfloat16) + self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, bias=False, dtype=torch.bfloat16) + + # Up-projection and rope projection matrices + self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16) + self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16) + + # RoPE on half embeddings + self.q_rope = RoPE(self.rope_head_dim) + self.k_rope = RoPE(self.rope_head_dim) + + # Output projection + self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False) + self.eps = 1e-6 + + def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor: + # seq_len = 1 always here + batch_size, seq_len, model_dim = x.size() + + ## Step 1: Handle down-projection + KV cache ## + + q_lora = self.Q_proj_down(x) + kv_lora = self.KV_proj_down(x) + kv_lora, kv_len = kv_cache(kv_lora) + query_pos = kv_len - 1 + + ## Step 2: Up-project and prepare NoPE + RoPE ## + + # Handle queries Q first + q_nope_and_rope = self.Q_proj_up(q_lora).view( + batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim) + q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1) + + # Handle keys and values K/V. V does not need RoPE + kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1) + kv_nope = self.KV_proj_up(kv_nope).view( + batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim) + k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1) + + ## Step 3: Handle RoPE Stream ## + + # Compute RoPE for queries and combine with no-RoPE part + q_rope = q_rope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim + q_rope = self.q_rope(q_rope, start_pos=query_pos) + + q_nope = q_nope.permute(0, 2, 1, 3) # bs x n_heads x seq_len x rope_head_dim + q = torch.concat([q_nope, q_rope], dim=-1) + + # Compute RoPE for keys and combine with no-RoPE part + k_rope = k_rope[:, None, :, :] + k_rope = self.k_rope(k_rope).expand(-1,self.n_heads,-1,-1) + k_nope = k_nope.permute(0, 2, 1, 3) # bs x kv_len x n_heads x rope_head_dim + k = torch.concat([k_nope, k_rope], dim=-1) + + ## Step 4: Compute Multi-head Attention ## + + v = v.permute(0, 2, 1, 3) # bs x n_heads x kv_len x v_head_dim + scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim) + attn = F.softmax(scores, dim=-1).to(torch.bfloat16) + y = torch.matmul(attn, v).view(batch_size, 1, -1) + y = self.wo(y) + + return y, kv_cache.get_data() + ``` + + Your function should be defined as 'custom_kernel' (skeleton provided below) + + ```python + ### DO NOT CHANGE THIS IMPORT STATEMENTS BLOCK ### + import os + import math + from typing import Tuple + import torch + import torch.nn.functional as F + import triton + from reference import KVCache, Config # Definition of KVCache and Config classes are shown above. Must import this way. Do not rewrite yourself. 
+    ### END OF IMPORT STATEMENTS BLOCK ###
+
+    ### Import other packages here if needed
+
+    def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, torch.Tensor]:
+        config, x, kv_cache = data
+
+        bs = config.batch_size
+        sl = config.seq_len
+        pl = kv_cache.seq_len
+        msl = config.max_seq_len
+        nh = config.n_heads
+        d = config.dim
+        dq = config.q_lora_rank
+        dkv = config.kv_lora_rank
+        dnope = config.qk_nope_head_dim
+        drope = config.qk_rope_head_dim
+        dv = config.v_head_dim
+
+        wDQ = config.Q_proj_down_weight
+        wDKV = config.KV_proj_down_weight
+        wUQ = config.Q_proj_up_weight
+        wUKV = config.KV_proj_up_weight
+        wO = config.wo_weight
+
+        # Perform MLA operations to process data into output and updated kv_cache
+
+        return output, kv_cache.data
+    ```
+
+    with the following signature:
+
+    Input:
+    - `data`: Tuple of (config: Config, x: torch.Tensor, kv_cache: KVCache)
+      - config: An instance of class `Config` containing model configurations and weights
+      - x: Input tensor of shape [batch_size, seq_len, dim]
+      - kv_cache: An instance of KVCache class for caching the keys and values
+
+    Output:
+    - output: Output tensor [batch_size, seq_len, dim]
+    - kv_cache.data: The data field of the updated `KVCache` instance with the new keys and values added
+
+    To warm you up for writing optimized Triton code, here is example code that is correct for your task but very unoptimized. Your code should be as optimized as possible but still correct.
+
+    ```python
+    import os
+    import math
+    from typing import Tuple
+    import torch
+    import torch.nn.functional as F
+    import triton
+    import triton.language as tl
+    from reference import KVCache, Config
+
+    @triton.jit
+    def rope_swap_halves_kernel(
+        x_ptr,
+        cos_ptr, sin_ptr,
+        B: tl.constexpr,
+        T: tl.constexpr,
+        D: tl.constexpr,
+        stride_xb, stride_xt, stride_xd,
+        stride_cos_t, stride_cos_d,
+        stride_sin_t, stride_sin_d,
+        BLOCK_HALF: tl.constexpr,
+    ):
+        pid = tl.program_id(0)
+        bt = pid
+        b = bt // T
+        t = bt - b * T
+        half = D // 2
+        off = tl.arange(0, BLOCK_HALF)
+        mask = off < half
+        x_base = x_ptr + b * stride_xb + t * stride_xt
+        x0_ptr = x_base + off * stride_xd
+        x1_ptr = x_base + (half + off) * stride_xd
+        cos_base = cos_ptr + t * stride_cos_t
+        sin_base = sin_ptr + t * stride_sin_t
+        c_ptr = cos_base + off * stride_cos_d
+        s_ptr = sin_base + off * stride_sin_d
+        x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32)
+        x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32)
+        c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32)
+        s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32)
+        out0 = x0 * c - x1 * s
+        out1 = x1 * c + x0 * s
+        tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask)
+        tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask)
+
+    # ... (see initial_program.py for full working baseline)
+    ```
+
+    Below are the different configs that your kernel will be tested on:
+
+    Common configs:
+    - {"batch_size": 128, "seq_len": 1, "kv_lora_rank": 512, "qk_rope_head_dim": 64, "v_head_dim": 128, "n_heads": 128, "dim": 7168, "q_lora_rank": 1536, "max_seq_len": 8192}
+
+    For correctness check:
+    - {"prefill": 128}
+    - {"prefill": 512}
+    - {"prefill": 1024}
+    - {"prefill": 2048}
+
+    For performance benchmark (optimize runtime for these):
+    - {"prefill": 6144}
+
+    Rules:
+    - The tensor arguments passed in will already be on your CUDA device.
+    - The weights for all parameters in the MLA will be given as input.
+    - All weights and data will be in `torch.bfloat16` format.
+    - Define all of your code in one final ```python ``` block.
+    - The entrypoint to your code must be named 'custom_kernel'.
+    - You will be using triton 3.4.0 and your kernels will be run on an Nvidia H200 GPU.
+    - Consider optimizing multiple operations with triton, not just softmax, e.g., rope, attention, etc.
+    - You are allowed to use torch.compile().
+
+    Important rules in triton 3.4.0:
+    - `tl.load` does not have an argument called `dtype`. Never use it like `tl.load(..., dtype=...)`.
+    - Triton dtypes are not callable, so never use them like `tl.float16(1.0)`, `tl.float32(0.0)`.
+    - `tl.arange(start, end)`:
+      - range length (end - start) must be power-of-2
+      - start, end must be of type `tl.constexpr`
+    - `tl.range(start, end, step, num_stages)`:
+      - keep loop index type stable, don't reassign it
+      - start, end, step do not have to be `tl.constexpr` but must stay scalar integer types
+      - num_stages must be `tl.constexpr`
+    - Do not use scalar indexing like x[0] or offs[0] inside a Triton kernel. Triton tensors are SIMD vectors; scalar indexing like [0] is not generally supported.
+
+    Here's a simple example that correctly follows these rules:
+
+    ```python
+    import torch
+    import triton
+    import triton.language as tl
+
+    @triton.jit
+    def kernel_right(
+        x_ptr, y_ptr, out_ptr,
+        n_elements: tl.constexpr,
+        BLOCK: tl.constexpr,
+        ROW_STEP: tl.constexpr,
+        NUM_STAGES: tl.constexpr,
+    ):
+        pid = tl.program_id(axis=0)
+        offs = pid * BLOCK + tl.arange(0, BLOCK)
+        mask = offs < n_elements
+        x = tl.load(x_ptr + offs, mask=mask, other=0.0)
+        y = tl.load(y_ptr + offs, mask=mask, other=0.0)
+        one_f32 = tl.full([], 1.0, tl.float32)
+        acc = tl.zeros((BLOCK,), dtype=tl.float32)
+        acc = tl.cast(x, tl.float32) + tl.cast(y, tl.float32) + one_f32
+        base = tl.full([], pid * BLOCK, tl.int32)
+        x0 = tl.load(x_ptr + base, mask=(base < n_elements), other=0.0)
+        x0_vec = tl.full((BLOCK,), x0, tl.float32)
+        out_vec = acc + x0_vec
+        n_rows = tl.full([], 4, tl.int32)
+        extra = tl.zeros((BLOCK,), dtype=tl.float32)
+        for r in tl.range(0, n_rows, ROW_STEP, num_stages=NUM_STAGES):
+            shift = r * tl.full([], 1, tl.int32)
+            offs_r = offs + shift
+            xr = tl.load(x_ptr + offs_r, mask=(offs_r < n_elements), other=0.0)
+            extra += tl.cast(xr, tl.float32)
+        out_vec = out_vec + extra
+        tl.store(out_ptr + offs, tl.cast(out_vec, tl.float16), mask=mask)
+    ```
+evaluator:
+  timeout: 600
+  max_retries: 3
+  cascade_evaluation: true
+  cascade_thresholds: [0.4, 0.3]
+
+diff_based_generation: true
+max_solution_length: 60000
+random_seed: 42
diff --git a/benchmarks/gpu_mode/mla_decode/initial_program.py b/benchmarks/gpu_mode/mla_decode/initial_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe89b054ca44a85b5e86eaf717d608257dff4e39
--- /dev/null
+++ b/benchmarks/gpu_mode/mla_decode/initial_program.py
@@ -0,0 +1,245 @@
+# EVOLVE-BLOCK-START
+"""
+Initial MLA Decode submission — optimised baseline with Triton softmax and RoPE kernels.
+""" + +import os +import math +from typing import Tuple +import torch +import torch.nn.functional as F +import triton +import triton.language as tl +from reference import KVCache, Config + + +@triton.jit +def rope_swap_halves_kernel( + x_ptr, + cos_ptr, sin_ptr, + B: tl.constexpr, + T: tl.constexpr, + D: tl.constexpr, + stride_xb, stride_xt, stride_xd, + stride_cos_t, stride_cos_d, + stride_sin_t, stride_sin_d, + BLOCK_HALF: tl.constexpr, +): + pid = tl.program_id(0) + bt = pid + b = bt // T + t = bt - b * T + + half = D // 2 + + off = tl.arange(0, BLOCK_HALF) + mask = off < half + + x_base = x_ptr + b * stride_xb + t * stride_xt + x0_ptr = x_base + off * stride_xd + x1_ptr = x_base + (half + off) * stride_xd + + cos_base = cos_ptr + t * stride_cos_t + sin_base = sin_ptr + t * stride_sin_t + + c_ptr = cos_base + off * stride_cos_d + s_ptr = sin_base + off * stride_sin_d + + x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32) + x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32) + c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32) + s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32) + + out0 = x0 * c - x1 * s + out1 = x1 * c + x0 * s + + tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask) + tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask) + + +def rope_inplace_query(q_rope: torch.Tensor, cos_q: torch.Tensor, sin_q: torch.Tensor): + assert q_rope.is_cuda + assert q_rope.shape[-1] % 2 == 0 + bs, nh, d_rope = q_rope.shape + + half = d_rope // 2 + BLOCK_HALF = 1 << (half - 1).bit_length() + + grid = (bs * nh,) + + rope_swap_halves_kernel[grid]( + q_rope, + cos_q, sin_q, + B=bs, T=nh, D=d_rope, + stride_xb=q_rope.stride(0), + stride_xt=q_rope.stride(1), + stride_xd=q_rope.stride(2), + stride_cos_t=0, stride_cos_d=cos_q.stride(0), + stride_sin_t=0, stride_sin_d=sin_q.stride(0), + BLOCK_HALF=BLOCK_HALF, + num_warps=4, + ) + + +_rope_cache = {} + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + half = x.shape[-1] // 2 + return torch.cat((-x[..., half:], x[..., :half]), dim=-1) + + +def _get_rope_tables(dim: int, max_seq_len: int, device: torch.device): + key = (dim, max_seq_len, device) + if key not in _rope_cache: + half = dim // 2 + theta = (10000.0 ** (-torch.arange(half, dtype=torch.float32, device=device) / half)).to( + torch.bfloat16 + ) + pos = torch.arange(max_seq_len, dtype=torch.int64, device=device).unsqueeze_(1) + idx = pos * theta[None, :] + idx = torch.cat([idx, idx], dim=-1) + _rope_cache[key] = (idx.cos().to(torch.bfloat16), idx.sin().to(torch.bfloat16)) + return _rope_cache[key] + + +@triton.jit +def _softmax_kernel( + out_ptr, in_ptr, + stride_out, stride_in, + n_cols, + BLOCK_SIZE: tl.constexpr, + NUM_STAGES: tl.constexpr, +): + row = tl.program_id(0) + row_off_in = row * stride_in + row_off_out = row * stride_out + + max_val = tl.full([BLOCK_SIZE], -float("inf"), tl.float32) + col = tl.arange(0, BLOCK_SIZE) + for start in range(0, n_cols, BLOCK_SIZE): + cur = start + col + mask = cur < n_cols + val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf')) + max_val = tl.maximum(max_val, tl.cast(val, tl.float32)) + row_max = tl.max(max_val) + + sum_val = tl.full([BLOCK_SIZE], 0.0, tl.float32) + for start in range(0, n_cols, BLOCK_SIZE): + cur = start + col + mask = cur < n_cols + val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf')) + exp_val = tl.exp(tl.cast(val, tl.float32) - row_max) + tl.store(out_ptr + row_off_out + cur, tl.cast(exp_val, tl.bfloat16), mask=mask) + sum_val += exp_val + row_sum = tl.sum(sum_val) + + 
for start in range(0, n_cols, BLOCK_SIZE): + cur = start + col + mask = cur < n_cols + val = tl.load(out_ptr + row_off_out + cur, mask=mask, other=0.0) + norm = tl.cast(val, tl.float32) / row_sum + tl.store(out_ptr + row_off_out + cur, tl.cast(norm, tl.bfloat16), mask=mask) + + +def _triton_softmax(x: torch.Tensor) -> torch.Tensor: + assert x.is_cuda and x.dtype == torch.bfloat16 + n_rows, n_cols = x.shape + + if n_cols <= 32: + BLOCK_SIZE = 32 + elif n_cols <= 64: + BLOCK_SIZE = 64 + elif n_cols <= 128: + BLOCK_SIZE = 128 + else: + BLOCK_SIZE = 1 << (n_cols - 1).bit_length() + BLOCK_SIZE = min(BLOCK_SIZE, 1024) + + out = torch.empty_like(x) + grid = (n_rows,) + _softmax_kernel[grid]( + out, x, + out.stride(0), x.stride(0), + n_cols, + BLOCK_SIZE=BLOCK_SIZE, + NUM_STAGES=2, + num_warps=4, + ) + return out + + +def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Optimised forward step of the Multi-head Latent Attention (MLA) module. + """ + config, x, kv_cache = data + + bs = config.batch_size + sl = config.seq_len + nh = config.n_heads + dq = config.q_lora_rank + dkv = config.kv_lora_rank + d_nope = config.qk_nope_head_dim + d_rope = config.qk_rope_head_dim + dv = config.v_head_dim + msl = config.max_seq_len + + wDQ = config.Q_proj_down_weight + wDKV = config.KV_proj_down_weight + wUQ = config.Q_proj_up_weight + wUKV = config.KV_proj_up_weight + wO = config.wo_weight + + q_lora = F.linear(x, wDQ) + kv_lora_input = F.linear(x, wDKV) + + kv_lora, kv_len = kv_cache(kv_lora_input) + query_pos = kv_len - 1 + + q_up = F.linear(q_lora.squeeze(1), wUQ) + q_up = q_up.view(bs, nh, d_nope + d_rope) + q_nope = q_up[..., :d_nope] + q_rope = q_up[..., d_nope:] + + kv_nope_input = kv_lora[..., :dkv] + k_rope_input = kv_lora[..., dkv:] + + cos_table, sin_table = _get_rope_tables(d_rope, msl, x.device) + + cos_q = cos_table[query_pos].view(d_rope).contiguous() + sin_q = sin_table[query_pos].view(d_rope).contiguous() + rope_inplace_query(q_rope, cos_q, sin_q) + + cos_k = cos_table[:kv_len] + sin_k = sin_table[:kv_len] + k_rope = k_rope_input * cos_k + _rotate_half(k_rope_input) * sin_k + + wUKV_view = wUKV.view(nh, d_nope + dv, dkv) + wK = wUKV_view[:, :d_nope, :] + q_nope_latent = torch.einsum('bhd,hdk->bhk', q_nope, wK) + + kv_nope_T = kv_nope_input.transpose(1, 2) + scores_nope = torch.matmul(q_nope_latent, kv_nope_T) + + scores_rope = torch.matmul(q_rope, k_rope.transpose(-2, -1)) + + scale = 1.0 / math.sqrt(d_nope + d_rope) + scores = (scores_nope + scores_rope) * scale + + scores_flat = scores.reshape(bs * nh, kv_len) + attn_flat = _triton_softmax(scores_flat) + attn = attn_flat.view(bs, nh, kv_len) + + M = torch.matmul(attn, kv_nope_input) + + wV = wUKV_view[:, d_nope:, :] + wV_T = wV.permute(0, 2, 1) + y_head = torch.einsum('bhd,hdk->bhk', M, wV_T) + + y = y_head.reshape(bs, nh * dv) + y = y.unsqueeze(1) + output = F.linear(y, wO) + + return output, kv_cache.data +# EVOLVE-BLOCK-END diff --git a/benchmarks/gpu_mode/mla_decode/reference.py b/benchmarks/gpu_mode/mla_decode/reference.py new file mode 100644 index 0000000000000000000000000000000000000000..a32f8da9cddc98b2395db6c28538863f76fb589d --- /dev/null +++ b/benchmarks/gpu_mode/mla_decode/reference.py @@ -0,0 +1,520 @@ +""" +Reference implementation for MLA Decode (Multi-Head Latent Attention) Triton kernel. +Same test cases, benchmarks, generate_input, ref_kernel, and check_implementation. 
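+Also exposes the scoring constants read by shared_eval.py and the KVCache and
+Config classes that submissions import from this module.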
+""" + +import math +from dataclasses import dataclass +import torch +from torch import nn +import torch.nn.functional as F + +# --------------------------------------------------------------------------- +# Scoring and benchmark configuration (read by shared_eval.py) +# --------------------------------------------------------------------------- + +SCORE_SCALE = 3000.0 + +# MLA uses wall-clock timing, 1% rel error, no wall clock timeout, torch.no_grad() +BENCH_USE_CUDA_EVENTS = False +BENCH_REL_ERROR = 0.01 +BENCH_WALL_TIMEOUT_NS = None +BENCH_NO_GRAD = True +BENCH_MAX_REPEATS = 100 +BENCH_MAX_TIME_NS = 10e9 +BENCH_WARMUP_STYLE = 'timed_calls' + +# --------------------------------------------------------------------------- +# Model classes (needed by both reference and submissions) +# --------------------------------------------------------------------------- + + +class RoPE(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.d_model = d_model + theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2)) + self.register_buffer("theta", theta) + + def rotate_half(self, x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor: + seq_len = x.size(-2) + d_model = x.size(-1) + assert d_model == self.d_model + seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device) + idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta) + idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1) + cos = idx_theta2.cos().to(torch.bfloat16) + sin = idx_theta2.sin().to(torch.bfloat16) + return x * cos + self.rotate_half(x) * sin + + +class KVCache(nn.Module): + def __init__(self, kv_cache_shape: tuple, **kwargs) -> None: + super().__init__(**kwargs) + self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16)) + self.seq_len = 0 + self.zero() + + def zero(self) -> None: + self.data.zero_() + + def get_data(self) -> torch.Tensor: + return self.data + + def forward(self, c_kv: torch.Tensor) -> torch.Tensor: + assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded" + + self.data = self.data.to(c_kv.dtype) + self.data[ + :, self.seq_len: self.seq_len + c_kv.size(1), : + ] = c_kv + self.seq_len += c_kv.size(1) + + return self.data[:, :self.seq_len], self.seq_len + + +@dataclass +class Config: + batch_size: int + dim: int + n_heads: int + q_lora_rank: int + kv_lora_rank: int + qk_nope_head_dim: int + qk_rope_head_dim: int + v_head_dim: int + seq_len: int + max_seq_len: int + kv_cache_shape: tuple + Q_proj_down_weight: torch.Tensor + Q_proj_up_weight: torch.Tensor + KV_proj_down_weight: torch.Tensor + KV_proj_up_weight: torch.Tensor + wo_weight: torch.Tensor + + +class MLA(nn.Module): + def __init__(self, config: Config): + super().__init__() + self.dim = config.dim + self.n_heads = config.n_heads + self.q_lora_rank = config.q_lora_rank + self.kv_lora_rank = config.kv_lora_rank + self.nope_head_dim = config.qk_nope_head_dim + self.rope_head_dim = config.qk_rope_head_dim + self.v_head_dim = config.v_head_dim + self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False) + self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False) + self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False) + self.KV_proj_up = nn.Linear(self.kv_lora_rank, 
(self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False) + self.q_rope = RoPE(self.rope_head_dim) + self.k_rope = RoPE(self.rope_head_dim) + self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False) + self.eps = 1e-6 + + def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor: + batch_size, seq_len, model_dim = x.size() + + q_lora = self.Q_proj_down(x) + kv_lora = self.KV_proj_down(x) + kv_lora, kv_len = kv_cache(kv_lora) + query_pos = kv_len - 1 + + q_nope_and_rope = self.Q_proj_up(q_lora).view( + batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim) + q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1) + + kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1) + kv_nope = self.KV_proj_up(kv_nope).view( + batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim) + k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1) + + q_rope = q_rope.permute(0, 2, 1, 3) + q_rope = self.q_rope(q_rope, start_pos=query_pos) + + q_nope = q_nope.permute(0, 2, 1, 3) + q = torch.concat([q_nope, q_rope], dim=-1) + + k_rope = k_rope[:, None, :, :] + k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1) + k_nope = k_nope.permute(0, 2, 1, 3) + k = torch.concat([k_nope, k_rope], dim=-1) + + v = v.permute(0, 2, 1, 3) + scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim) + attn = F.softmax(scores, dim=-1).to(torch.bfloat16) + y = torch.matmul(attn, v).view(batch_size, 1, -1) + y = self.wo(y) + + return y, kv_cache.get_data() + + +# --------------------------------------------------------------------------- +# Test / benchmark cases — from discover task.yml +# --------------------------------------------------------------------------- + +TEST_CASES = [ + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 128, "seed": 9247}, + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 512, "seed": 2197}, + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 1024, "seed": 9107}, + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 2048, "seed": 5291}, +] + +BENCHMARK_CASES = [ + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 4096, "seed": 9817}, + {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 6144, "seed": 5291}, +] + + +# --------------------------------------------------------------------------- +# Input generation +# --------------------------------------------------------------------------- + + +def generate_input(batchsize, dim, dq, prefill, seed): + gen = torch.Generator(device='cuda') + gen.manual_seed(seed) + + Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim) + KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim) + Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq) + KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512) + wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128) + + config = Config( + batch_size=batchsize, + dim=dim, + q_lora_rank=dq, + n_heads=128, + kv_lora_rank=512, + qk_nope_head_dim=128, + qk_rope_head_dim=64, + v_head_dim=128, + seq_len=1, + max_seq_len=8192, + 
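+        # Cache feature dim = kv_lora_rank (512) + qk_rope_head_dim (64):
+        # the compressed KV latent concatenated with the shared RoPE key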
kv_cache_shape=(batchsize, 8192, 512 + 64), + Q_proj_down_weight=Q_proj_down_weight, + Q_proj_up_weight=Q_proj_up_weight, + KV_proj_down_weight=KV_proj_down_weight, + KV_proj_up_weight=KV_proj_up_weight, + wo_weight=wo_weight, + ) + x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda') + + kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda') + pre_filled_cache = torch.randn( + (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), + dtype=torch.bfloat16, generator=gen, device='cuda') + kv_cache(pre_filled_cache) + + return config, x, kv_cache + + +# --------------------------------------------------------------------------- +# Reference kernel +# --------------------------------------------------------------------------- + + +def ref_kernel(data): + config, x, kv_cache = data + + model = MLA(config).to('cuda') + model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight) + model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight) + model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight) + model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight) + model.wo.weight = nn.Parameter(config.wo_weight) + + output, kv_data = model(x, kv_cache) + return output, kv_data + + +# --------------------------------------------------------------------------- +# Correctness checking +# --------------------------------------------------------------------------- + + +@torch.no_grad() +def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5): + if received.shape != expected.shape: + return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"] + + diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32)) + tolerance = atol + rtol * torch.abs(expected.to(torch.float32)) + tol_mismatched = diff > tolerance + nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected)) + posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected)) + neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected)) + mismatched = torch.logical_or( + torch.logical_or(tol_mismatched, nan_mismatched), + torch.logical_or(posinf_mismatched, neginf_mismatched), + ) + + mismatched_indices = torch.nonzero(mismatched) + num_mismatched = mismatched.count_nonzero().item() + + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.") + return False, mismatch_details + + return True, [f"Maximum error: {torch.max(diff)}"] + + +def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3): + """Check submission output against reference. 
Returns (passed: bool, msg: str).""" + import gc + output_mla, output_kv = submission_output + + # Move submission output to CPU and free GPU memory before running ref kernel + output_mla_cpu = output_mla.cpu() + output_kv_cpu = output_kv.cpu() + del output_mla, output_kv + gc.collect() + torch.cuda.empty_cache() + + config, x, kv_cache = data + with torch.no_grad(): + expected_mla, expected_kv = ref_kernel((config, x, kv_cache)) + + # Move ref output to CPU and free GPU memory before comparison + expected_mla_cpu = expected_mla.cpu() + expected_kv_cpu = expected_kv.cpu() + del expected_mla, expected_kv + gc.collect() + torch.cuda.empty_cache() + + good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol) + good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol) + + if not good_mla: + return False, "MLA output mismatch: " + " ".join(reasons_mla) + if not good_kv: + return False, "KV cache mismatch: " + " ".join(reasons_kv) + + return True, "Match" + + +# --------------------------------------------------------------------------- +# Self-contained reference code for Modal remote execution +# --------------------------------------------------------------------------- + +MODAL_REFERENCE_CODE = r''' +import math +from dataclasses import dataclass +import torch +from torch import nn +import torch.nn.functional as F + + +class RoPE(nn.Module): + def __init__(self, d_model: int): + super().__init__() + self.d_model = d_model + theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2)) + self.register_buffer("theta", theta) + + def rotate_half(self, x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor: + seq_len = x.size(-2) + d_model = x.size(-1) + assert d_model == self.d_model + seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device) + idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta) + idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1) + cos = idx_theta2.cos().to(torch.bfloat16) + sin = idx_theta2.sin().to(torch.bfloat16) + return x * cos + self.rotate_half(x) * sin + + +class KVCache(nn.Module): + def __init__(self, kv_cache_shape: tuple, **kwargs) -> None: + super().__init__(**kwargs) + self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16)) + self.seq_len = 0 + self.zero() + + def zero(self) -> None: + self.data.zero_() + + def get_data(self) -> torch.Tensor: + return self.data + + def forward(self, c_kv: torch.Tensor) -> torch.Tensor: + assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded" + self.data = self.data.to(c_kv.dtype) + self.data[:, self.seq_len: self.seq_len + c_kv.size(1), :] = c_kv + self.seq_len += c_kv.size(1) + return self.data[:, :self.seq_len], self.seq_len + + +@dataclass +class Config: + batch_size: int + dim: int + n_heads: int + q_lora_rank: int + kv_lora_rank: int + qk_nope_head_dim: int + qk_rope_head_dim: int + v_head_dim: int + seq_len: int + max_seq_len: int + kv_cache_shape: tuple + Q_proj_down_weight: torch.Tensor + Q_proj_up_weight: torch.Tensor + KV_proj_down_weight: torch.Tensor + KV_proj_up_weight: torch.Tensor + wo_weight: torch.Tensor + + +class MLA(nn.Module): + def __init__(self, config: Config): + super().__init__() + self.dim = config.dim + self.n_heads = config.n_heads + self.q_lora_rank = config.q_lora_rank + self.kv_lora_rank = config.kv_lora_rank 
+ self.nope_head_dim = config.qk_nope_head_dim + self.rope_head_dim = config.qk_rope_head_dim + self.v_head_dim = config.v_head_dim + self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False) + self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False) + self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False) + self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False) + self.q_rope = RoPE(self.rope_head_dim) + self.k_rope = RoPE(self.rope_head_dim) + self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False) + self.eps = 1e-6 + + def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor: + batch_size, seq_len, model_dim = x.size() + q_lora = self.Q_proj_down(x) + kv_lora = self.KV_proj_down(x) + kv_lora, kv_len = kv_cache(kv_lora) + query_pos = kv_len - 1 + q_nope_and_rope = self.Q_proj_up(q_lora).view( + batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim) + q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1) + kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1) + kv_nope = self.KV_proj_up(kv_nope).view( + batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim) + k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1) + q_rope = q_rope.permute(0, 2, 1, 3) + q_rope = self.q_rope(q_rope, start_pos=query_pos) + q_nope = q_nope.permute(0, 2, 1, 3) + q = torch.concat([q_nope, q_rope], dim=-1) + k_rope = k_rope[:, None, :, :] + k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1) + k_nope = k_nope.permute(0, 2, 1, 3) + k = torch.concat([k_nope, k_rope], dim=-1) + v = v.permute(0, 2, 1, 3) + scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim) + attn = F.softmax(scores, dim=-1).to(torch.bfloat16) + y = torch.matmul(attn, v).view(batch_size, 1, -1) + y = self.wo(y) + return y, kv_cache.get_data() + + +def ref_kernel(data): + config, x, kv_cache = data + model = MLA(config).to('cuda') + model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight) + model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight) + model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight) + model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight) + model.wo.weight = nn.Parameter(config.wo_weight) + output, kv_data = model(x, kv_cache) + return output, kv_data + + +def generate_input(batchsize, dim, dq, prefill, seed): + gen = torch.Generator(device='cuda') + gen.manual_seed(seed) + Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim) + KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim) + Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq) + KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512) + wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128) + config = Config( + batch_size=batchsize, dim=dim, q_lora_rank=dq, n_heads=128, + kv_lora_rank=512, qk_nope_head_dim=128, 
qk_rope_head_dim=64, + v_head_dim=128, seq_len=1, max_seq_len=8192, + kv_cache_shape=(batchsize, 8192, 512 + 64), + Q_proj_down_weight=Q_proj_down_weight, Q_proj_up_weight=Q_proj_up_weight, + KV_proj_down_weight=KV_proj_down_weight, KV_proj_up_weight=KV_proj_up_weight, + wo_weight=wo_weight, + ) + x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda') + kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda') + pre_filled_cache = torch.randn( + (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim), + dtype=torch.bfloat16, generator=gen, device='cuda') + kv_cache(pre_filled_cache) + return config, x, kv_cache + + +@torch.no_grad() +def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5): + if received.shape != expected.shape: + return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"] + diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32)) + tolerance = atol + rtol * torch.abs(expected.to(torch.float32)) + tol_mismatched = diff > tolerance + nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected)) + posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected)) + neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected)) + mismatched = torch.logical_or( + torch.logical_or(tol_mismatched, nan_mismatched), + torch.logical_or(posinf_mismatched, neginf_mismatched), + ) + mismatched_indices = torch.nonzero(mismatched) + num_mismatched = mismatched.count_nonzero().item() + if num_mismatched >= 1: + mismatch_details = [f"Number of mismatched elements: {num_mismatched}"] + for index in mismatched_indices[:max_print]: + i = tuple(index.tolist()) + mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}") + if num_mismatched > max_print: + mismatch_details.append(f"... 
and {num_mismatched - max_print} more mismatched elements.") + return False, mismatch_details + return True, [f"Maximum error: {torch.max(diff)}"] + + +def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3): + import gc + output_mla, output_kv = submission_output + # Move submission output to CPU and free GPU memory before running ref kernel + output_mla_cpu = output_mla.cpu() + output_kv_cpu = output_kv.cpu() + del output_mla, output_kv + gc.collect() + torch.cuda.empty_cache() + config, x, kv_cache = data + with torch.no_grad(): + expected_mla, expected_kv = ref_kernel((config, x, kv_cache)) + # Move ref output to CPU and free GPU memory before comparison + expected_mla_cpu = expected_mla.cpu() + expected_kv_cpu = expected_kv.cpu() + del expected_mla, expected_kv + gc.collect() + torch.cuda.empty_cache() + good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol) + good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol) + if not good_mla: + return False, "MLA output mismatch: " + " ".join(reasons_mla) + if not good_kv: + return False, "KV cache mismatch: " + " ".join(reasons_kv) + return True, "Match" +''' diff --git a/benchmarks/gpu_mode/mla_decode/requirements.txt b/benchmarks/gpu_mode/mla_decode/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4876b15bb374f6259999dfd2c41458ba5d18e329 --- /dev/null +++ b/benchmarks/gpu_mode/mla_decode/requirements.txt @@ -0,0 +1,2 @@ +triton +torch diff --git a/benchmarks/gpu_mode/trimul/initial_program.py b/benchmarks/gpu_mode/trimul/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..d334005e0070a41d5309a1d116cfbd12763ac82b --- /dev/null +++ b/benchmarks/gpu_mode/trimul/initial_program.py @@ -0,0 +1,84 @@ +# EVOLVE-BLOCK-START +""" +Initial TriMul submission — PyTorch baseline with dummy Triton kernel. +""" + +import torch +from torch import nn, einsum +import triton +import triton.language as tl + + +@triton.jit +def _dummy_kernel(x_ptr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + pass + + +class TriMul(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + ): + super().__init__() + + self.norm = nn.LayerNorm(dim) + + self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32) + self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32) + + self.left_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32) + self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32) + self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32) + + self.to_out_norm = nn.LayerNorm(hidden_dim) + self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32) + + def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + batch_size, seq_len, _, dim = x.shape + + x = self.norm(x) + x = x.to(torch.float32) + + left = self.left_proj(x.to(torch.float32)) + right = self.right_proj(x.to(torch.float32)) + + mask = mask.unsqueeze(-1) + left = left * mask + right = right * mask + + left_gate = self.left_gate(x.to(torch.float32)).sigmoid() + right_gate = self.right_gate(x.to(torch.float32)).sigmoid() + out_gate = self.out_gate(x.to(torch.float32)).sigmoid() + + left = left * left_gate + right = right * right_gate + + out = einsum('... i k d, ... j k d -> ... 
i j d', left.to(torch.bfloat16), right.to(torch.bfloat16))
+
+        out = out.to(torch.float32)
+        out = self.to_out_norm(out)
+        out = out * out_gate
+        return self.to_out(out)
+
+
+def custom_kernel(data):
+    input_tensor, mask, weights, config = data
+    trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
+
+    trimul.norm.weight = nn.Parameter(weights['norm.weight'].to(torch.float32))
+    trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'].to(torch.float32))
+    trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'].to(torch.float32))
+    trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'].to(torch.float32))
+    trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'].to(torch.float32))
+    trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
+    trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
+    trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+    trimul.norm.bias = nn.Parameter(weights['norm.bias'].to(torch.float32))
+    trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'].to(torch.float32))
+
+    output = trimul(input_tensor, mask).to(torch.float32)
+
+    return output
+# EVOLVE-BLOCK-END
diff --git a/benchmarks/image_gen/README.md b/benchmarks/image_gen/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..23b0b87cd91b65c3bc9e0d0ba9ad238a4736c44d
--- /dev/null
+++ b/benchmarks/image_gen/README.md
@@ -0,0 +1,40 @@
+# Image Generation Benchmark
+
+This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via LLM, crossover via context images drawn from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives the actual parent image and context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed.
+
+## Benchmark: Sky Festival
+
+**Directory:** `sky_festival/`
+
+The system must generate a floating sky-festival image in which many details match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`.
+
+**Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points.
The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image. + +## Setup + +1. **Set your API key:** + + ```bash + export OPENAI_API_KEY=... + ``` + + Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API. + +## Run + +```bash +cd benchmarks/image_gen/sky_festival + +# AdaEvolve +uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output + +# EvoX +uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output +``` + +## Files + +| File | Description | +|------|-------------| +| `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric | +| `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` | diff --git a/benchmarks/image_gen/sky_festival/evaluator.py b/benchmarks/image_gen/sky_festival/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..bd138c5cf80924c62e1407749de96f7e3b1d4d92 --- /dev/null +++ b/benchmarks/image_gen/sky_festival/evaluator.py @@ -0,0 +1,220 @@ +""" +Sky Festival evaluator — GPT-5 LLM-as-a-judge. + +Scores VLM-generated images against a 100-point rubric using GPT-5 vision. +Returns combined_score normalized to [0, 1]. + +The framework passes the image path via a sidecar file: + .image_path -> absolute path to the generated image + +Requirements: + pip install openai + Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5) +""" + +import base64 +import json +import logging +import os +import re +from typing import Dict, Union + +logger = logging.getLogger(__name__) + +JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5") + +SYSTEM_PROMPT = """\ +You are an extremely strict image evaluation judge. You score images against a precise rubric. +You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON. +Be harsh — most AI-generated images fail these criteria. Award points only when clearly met. +If you cannot verify a requirement (e.g., too small to see), award 0 for that item.""" + +RUBRIC_PROMPT = """\ +Score this image against the following rubric for a "Floating Sky Festival" scene. +Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met. 
+ +## Category 1: Cloud Counting and Shapes (15 pts) +- Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0) +- At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10) + +## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts) +- Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0) +- Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern) +- Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count) +- Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0) + +## Category 3: Floating Island and Trees (10 pts) +- Floating island visible suspended in air: 3 pts +- Exactly 4 different trees on the island: 4 pts (3 or 5 = 0) +- Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts + +## Category 4: Party Table Items — Counting and Arrangement (20 pts) +- 3-tier cake with candle present: 3 pts +- Cake text "100 YEARS" legible on middle tier: 3 pts +- Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts +- Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts +- Stack of exactly 8 golden plates: 3 pts +- Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts + +## Category 5: Characters — Count, Identity, and Details (15 pts) +- Exactly 6 characters seated at the table (3 per side): 5 pts +- Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends) +- Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail) + +## Category 6: Decorations and Flying Creatures (10 pts) +- Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts +- Exactly 7 paper lanterns in different colors: 3 pts +- Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures) + +## Category 7: Rainbow, Lighting, and Overall Composition (10 pts) +- Complete semicircular rainbow with 7 color bands in correct order: 4 pts +- Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts +- Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts + +Respond with ONLY this JSON (no other text): +{ + "cloud_shapes": <0-15>, + "balloons": <0-20>, + "floating_island": <0-10>, + "table_items": <0-20>, + "characters": <0-15>, + "decorations_creatures": <0-10>, + "rainbow_lighting": <0-10>, + "reasoning": "" +}""" + +# Category maximum scores for validation +CATEGORY_MAXES = { + "cloud_shapes": 15, + "balloons": 20, + "floating_island": 10, + "table_items": 20, + "characters": 15, + "decorations_creatures": 10, + "rainbow_lighting": 10, +} + +_client = None + + +def _get_client(): + global _client + if _client is None: + from openai import OpenAI + _client = OpenAI() + return _client + + +def _encode_image(image_path: str) -> str: + with open(image_path, "rb") as f: + return base64.b64encode(f.read()).decode("utf-8") + + +def _judge_image(image_path: str) -> Dict[str, Union[float, str]]: + """Call 
GPT-5 to score the image. Retries once on failure.""" + client = _get_client() + b64 = _encode_image(image_path) + + ext = os.path.splitext(image_path)[1].lstrip(".").lower() + mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png") + data_url = f"data:{mime};base64,{b64}" + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}}, + {"type": "text", "text": RUBRIC_PROMPT}, + ], + }, + ] + + last_error = None + for attempt in range(2): + try: + response = client.chat.completions.create( + model=JUDGE_MODEL, + messages=messages, + max_completion_tokens=16384, + ) + content = response.choices[0].message.content or "" + raw = content.strip() + logger.info(f"Judge raw response (first 300 chars): {raw[:300]}") + + # Extract JSON from markdown code block if present + if "```" in raw: + m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL) + if m: + raw = m.group(1).strip() + + # Find JSON object in response + start = raw.find("{") + end = raw.rfind("}") + 1 + if start >= 0 and end > start: + raw = raw[start:end] + + result = json.loads(raw) + + # Validate and clamp scores + scores = {} + for cat, max_val in CATEGORY_MAXES.items(): + val = result.get(cat, 0) + if not isinstance(val, (int, float)): + val = 0 + scores[cat] = max(0, min(max_val, float(val))) + + scores["reasoning"] = str(result.get("reasoning", "")) + return scores + + except Exception as e: + last_error = e + logger.warning(f"Judge attempt {attempt + 1} failed: {e}") + + logger.error(f"GPT-5 judge failed after retries: {last_error}") + return {cat: 0.0 for cat in CATEGORY_MAXES} + + +def evaluate(program_path: str) -> Dict[str, Union[float, str]]: + """Score a VLM-generated image using GPT-5 as judge. + + Args: + program_path: Path to the text file (VLM reasoning). + A sidecar file ``.image_path`` contains the + absolute path to the generated image. + + Returns: + Dictionary with combined_score (0-1), per-category scores, and image_path. + """ + # Read image path from sidecar + sidecar = program_path + ".image_path" + image_path = None + if os.path.exists(sidecar): + with open(sidecar) as f: + image_path = f.read().strip() + + if not image_path or not os.path.exists(image_path): + logger.warning("No image found for scoring") + return {"combined_score": 0.0, "error": "No image to score"} + + # Score with GPT-5 + scores = _judge_image(image_path) + + # Compute total out of 100, normalize to 0-1 + total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES) + combined = round(total / 100.0, 4) + + result = {"combined_score": combined, "image_path": image_path} + + # Add per-category scores (normalized to 0-1 for each category) + for cat, max_val in CATEGORY_MAXES.items(): + result[cat] = round(scores.get(cat, 0) / max_val, 4) + + # Also store raw scores + result["raw_total"] = round(total, 1) + + reasoning = scores.get("reasoning", "") + if reasoning: + result["judge_reasoning"] = reasoning + + return result diff --git a/benchmarks/math/README.md b/benchmarks/math/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4ae4cab48f8fb94fcc390592c023f72b29ea13fc --- /dev/null +++ b/benchmarks/math/README.md @@ -0,0 +1,43 @@ +# Math Benchmarks + +Mathematical optimization and algorithm evolution problems. 
+ +## Problems + +### Signal processing & geometry (from SkyDiscover demos) + +- [signal_processing](signal_processing/) — Real-time adaptive filtering for non-stationary time series +- [circle_packing](circle_packing/) — Pack 26 circles in a unit square to maximize sum of radii (AlphaEvolve B.12) + +### AlphaEvolve mathematical problems + +12 problems from [AlphaEvolve Appendices A and B](https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/AlphaEvolve.pdf). All evaluators are normalized to **maximize** the target metric. + +**Appendix A:** +- [matmul](matmul/) — Faster algorithm for matrix multiplication (A) + +**Appendix B:** +1. [first_autocorr_ineq](first_autocorr_ineq/) — Upper bound on autoconvolution constant (B.1) +2. [second_autocorr_ineq](second_autocorr_ineq/) — Lower bound on autoconvolution norm constant (B.2) +3. [third_autocorr_ineq](third_autocorr_ineq/) — Upper bound on absolute autoconvolution constant (B.3) +4. [uncertainty_ineq](uncertainty_ineq/) — Upper bound on Fourier uncertainty constant (B.4) +5. [erdos_min_overlap](erdos_min_overlap/) — Upper bound on Erdos minimum overlap constant (B.5) +6. [sums_diffs_finite_sets](sums_diffs_finite_sets/) — Lower bound on sums/differences of finite sets (B.6) +7. [hexagon_packing](hexagon_packing/) — Pack unit hexagons in a regular hexagon, n=11,12 (B.7) +8. [minimizing_max_min_dist](minimizing_max_min_dist/) — Minimize max/min distance ratio, n=16 d=2 and n=14 d=3 (B.8) +9. [heilbronn_triangle](heilbronn_triangle/) — Heilbronn problem for triangles, n=11 (B.9) +10. [heilbronn_convex](heilbronn_convex/) — Heilbronn problem for convex regions, n=13,14 (B.10) +11. [circle_packing_rect](circle_packing_rect/) — Pack circles in a rectangle of perimeter 4 (B.13) + +## Run + +```bash +uv run skydiscover-run \ + benchmarks/math/signal_processing/initial_program.py \ + benchmarks/math/signal_processing/evaluator.py \ + -c benchmarks/math/signal_processing/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +Each problem directory contains `initial_program.py`, `evaluator.py`, and either `config.yaml` or per-search configs. Some multi-variant problems have numbered subdirectories (e.g., `heilbronn_convex/13/`, `hexagon_packing/11/`). diff --git a/benchmarks/math/circle_packing/README.md b/benchmarks/math/circle_packing/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9c9847399bcdfd282161ec44f50300d0550a61ca --- /dev/null +++ b/benchmarks/math/circle_packing/README.md @@ -0,0 +1,38 @@ +# Circle Packing + +Pack 26 non-overlapping circles in a unit square to maximize the sum of their radii (AlphaEvolve B.12). Target: 2.635. + +## Problem + +- Pack exactly 26 circles inside a unit square +- No circles may overlap +- Each circle must lie entirely within the square +- Maximize the sum of all radii + +## Run + +```bash +# From repo root +uv run skydiscover-run \ + benchmarks/math/circle_packing/initial_program.py \ + benchmarks/math/circle_packing/evaluator.py \ + -c benchmarks/math/circle_packing/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +A `codebase/reference/` directory is provided with geometric insights (hex grids, optimization patterns, packing strategies) that can be used with agentic mode (`--agentic`). 
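+As a quick illustration of the constraints listed under Problem, validity amounts to a containment test plus a pairwise non-overlap test. The sketch below is illustrative only — the shipped `evaluator.py` is authoritative and also handles NaN checks and diagnostic reporting. `is_valid_packing` and `score` are hypothetical names, and `centers`/`radii` are assumed to be NumPy arrays of shape `(26, 2)` and `(26,)`:
+
+```python
+import numpy as np
+
+def is_valid_packing(centers: np.ndarray, radii: np.ndarray, tol: float = 1e-6) -> bool:
+    """Containment in [0, 1]^2 and pairwise non-overlap, with a small numerical slack."""
+    # Containment: x - r >= 0 and x + r <= 1 (same for y), up to tol
+    if np.any(centers - radii[:, None] < -tol) or np.any(centers + radii[:, None] > 1 + tol):
+        return False
+    # Non-overlap: distance between centers >= sum of radii for every pair
+    diffs = centers[:, None, :] - centers[None, :, :]   # (n, n, 2)
+    dists = np.sqrt((diffs ** 2).sum(axis=-1))          # (n, n)
+    rsum = radii[:, None] + radii[None, :]
+    off_diag = ~np.eye(len(radii), dtype=bool)          # skip self-comparisons
+    return bool(np.all(dists[off_diag] >= rsum[off_diag] - tol))
+
+def score(radii: np.ndarray) -> float:
+    """combined_score as defined below: sum of radii relative to the 2.635 target."""
+    return float(radii.sum() / 2.635)
+```
+
+The `tol` slack mirrors the evaluator's allowance for tiny numerical errors in its own constraint checks.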
+ +## Scoring + +- **combined_score**: `sum_of_radii / 2.635` (ratio to AlphaEvolve target) +- Evaluator validates no overlaps and boundary constraints + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Seed: simple ring-based circle arrangement | +| `evaluator.py` | Validates constraints, computes sum-of-radii ratio to target | +| `config.yaml` | LLM and evaluator settings | +| `codebase/reference/` | Geometric reference material for agentic mode | diff --git a/benchmarks/math/circle_packing/codebase/reference/hex_grid.py b/benchmarks/math/circle_packing/codebase/reference/hex_grid.py new file mode 100644 index 0000000000000000000000000000000000000000..7466ba1dac3bdab65fa9d93b390aa23edc5d6afc --- /dev/null +++ b/benchmarks/math/circle_packing/codebase/reference/hex_grid.py @@ -0,0 +1,43 @@ +""" +Hexagonal grid initialization for circle packing. + +A hexagonal (offset) grid provides a good starting arrangement +because it's the densest regular packing pattern. Even rows are +offset by half the spacing, which reduces wasted space. +""" + +import numpy as np + + +def hexagonal_grid(n, margin=0.1): + """ + Generate n points on a hexagonal grid inside [margin, 1-margin]^2. + + Args: + n: number of points to generate + margin: distance from edges to keep clear + + Returns: + np.array of shape (n, 2) with (x, y) coordinates + """ + usable = 1.0 - 2 * margin + cols = int(np.ceil(np.sqrt(n * 2 / np.sqrt(3)))) + rows = int(np.ceil(n / cols)) + + dx = usable / max(cols - 1, 1) + dy = usable / max(rows - 1, 1) + + points = [] + for row in range(rows): + for col in range(cols): + if len(points) >= n: + break + x = margin + col * dx + if row % 2 == 1: + x += dx / 2 # offset for hex pattern + y = margin + row * dy + x = np.clip(x, margin, 1 - margin) + y = np.clip(y, margin, 1 - margin) + points.append([x, y]) + + return np.array(points[:n]) diff --git a/benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py b/benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py new file mode 100644 index 0000000000000000000000000000000000000000..b144623fc6a49405c85051cb128221c527648e4c --- /dev/null +++ b/benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py @@ -0,0 +1,94 @@ +""" +Common patterns for constrained geometric optimization using scipy. + +This module shows how to use scipy.optimize.minimize with inequality +constraints and the SLSQP solver — useful for any problem where you +need to maximize/minimize an objective subject to geometric constraints. +""" + +import numpy as np +from scipy.optimize import minimize + + +def example_constrained_optimization(): + """ + Template: pack n objects by optimizing positions + sizes jointly. 
+ + Decision vector: x = [pos_0, pos_1, ..., pos_{n-1}, size_0, ..., size_{n-1}] + Objective: maximize sum(sizes) => minimize -sum(sizes) + Constraints: non-overlap + boundary containment (all >= 0) + """ + n = 10 # number of objects + + # --- Objective: negative sum of sizes (we minimize, so negate to maximize) --- + def objective(x): + sizes = x[2 * n:] + return -np.sum(sizes) + + # --- Constraints as a single function returning array of values >= 0 --- + def constraints_fn(x): + positions = x[:2 * n].reshape(n, 2) + sizes = x[2 * n:] + + c = [] + # Pairwise non-overlap: dist(i,j) - size_i - size_j >= 0 + for i in range(n): + for j in range(i + 1, n): + dist = np.linalg.norm(positions[i] - positions[j]) + c.append(dist - sizes[i] - sizes[j]) + + # Boundary: each object stays inside [0, 1] x [0, 1] + for i in range(n): + c.append(positions[i, 0] - sizes[i]) # left + c.append(1 - positions[i, 0] - sizes[i]) # right + c.append(positions[i, 1] - sizes[i]) # bottom + c.append(1 - positions[i, 1] - sizes[i]) # top + + return np.array(c) + + # --- Initial guess --- + x0_pos = np.random.rand(n, 2) * 0.6 + 0.2 # avoid edges + x0_sizes = np.full(n, 0.05) + x0 = np.concatenate([x0_pos.flatten(), x0_sizes]) + + # --- Bounds --- + pos_bounds = [(0, 1)] * (2 * n) + size_bounds = [(0.01, 0.25)] * n + bounds = pos_bounds + size_bounds + + # --- Solve --- + result = minimize( + objective, + x0, + method="SLSQP", + bounds=bounds, + constraints={"type": "ineq", "fun": constraints_fn}, + options={"maxiter": 1000, "ftol": 1e-9}, + ) + + opt_positions = result.x[:2 * n].reshape(n, 2) + opt_sizes = result.x[2 * n:] + return opt_positions, opt_sizes, -result.fun # return positive sum + + +def multi_start_optimization(objective, constraint_fn, bounds, n_starts=5): + """ + Run SLSQP from multiple random starts and keep the best. + + This helps escape local optima — the solver is gradient-based + and sensitive to the initial guess. + """ + best_result = None + for _ in range(n_starts): + x0 = np.array([np.random.uniform(lo, hi) for lo, hi in bounds]) + result = minimize( + objective, + x0, + method="SLSQP", + bounds=bounds, + constraints={"type": "ineq", "fun": constraint_fn}, + options={"maxiter": 500, "ftol": 1e-8}, + ) + if best_result is None or result.fun < best_result.fun: + best_result = result + return best_result diff --git a/benchmarks/math/circle_packing/codebase/reference/packing_strategies.md b/benchmarks/math/circle_packing/codebase/reference/packing_strategies.md new file mode 100644 index 0000000000000000000000000000000000000000..ed28326077e66e30e07db4dff58337a1c3a18747 --- /dev/null +++ b/benchmarks/math/circle_packing/codebase/reference/packing_strategies.md @@ -0,0 +1,45 @@ +# Circle Packing Strategies for n=26 in a Unit Square + +## Key Insight +Naive geometric placement (rings, grids) gives sum_radii ~ 1.0. +Using numerical optimization (scipy.optimize) with proper constraint formulation +can push sum_radii above 2.5. + +## Why Optimization Works Better Than Manual Placement + +Manual placement fixes circle positions, then computes maximum radii. +This leaves gaps because positions aren't optimized for the radii they produce. + +**Joint optimization** treats both positions (x,y for each circle) AND radii +as decision variables, optimizing them simultaneously. This is the key insight. + +Decision vector: [x0, y0, x1, y1, ..., x25, y25, r0, r1, ..., r25] +Total variables: 26*2 + 26 = 78 + +## Constraint Formulation + +1. 
**Non-overlap**: For every pair (i,j): distance(center_i, center_j) >= r_i + r_j +2. **Boundary**: For every circle i: x_i - r_i >= 0, x_i + r_i <= 1, y_i - r_i >= 0, y_i + r_i <= 1 +3. **Positive radii**: r_i > 0 for all i (use bounds, not constraints) + +## Recommended Solver + +scipy.optimize.minimize with method="SLSQP": +- Handles inequality constraints natively +- Works with bounds on variables +- Good for smooth, continuous problems like circle packing +- Sensitive to initial guess — use multiple starts or a good heuristic + +## Initial Guess Strategy + +A hexagonal grid initial guess works well: +- Place circles on offset rows (hex pattern) +- Start with equal small radii (e.g., 0.05) +- Let the optimizer adjust both positions and radii + +## Performance Tips + +- Set maxiter=1000 or higher for 26 circles +- Use ftol=1e-8 or smaller for precise solutions +- Radii bounds: (0.01, 0.2) is a reasonable range for n=26 +- The objective is -sum(radii) (minimize negative to maximize) diff --git a/benchmarks/math/circle_packing/config.yaml b/benchmarks/math/circle_packing/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5702c06da43fc9fc0cccba102f48e6c5be98dfc --- /dev/null +++ b/benchmarks/math/circle_packing/config.yaml @@ -0,0 +1,54 @@ +# Math benchmark: circle_packing +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 16384 + timeout: 600 +prompt: + system_message: 'You are an expert mathematician specializing in circle packing problems and computational geometry. Your + task is to improve a constructor function that directly produces a specific arrangement of 26 circles in a unit square, + maximizing the sum of their radii. The AlphaEvolve paper achieved a sum of 2.635 for n=26. + + + Key geometric insights: + + - Circle packings often follow hexagonal patterns in the densest regions + + - Maximum density for infinite circle packing is pi/(2*sqrt(3)) ≈ 0.9069 + + - Edge effects make square container packing harder than infinite packing + + - Circles can be placed in layers or shells when confined to a square + + - Similar radius circles often form regular patterns, while varied radii allow better space utilization + + - Perfect symmetry may not yield the optimal packing due to edge effects + + + Focus on designing an explicit constructor that places each circle in a specific position, rather than an iterative search + algorithm. 
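+
+    For calibration: a plain 5 x 5 grid of 25 equal circles of radius 0.1 already achieves a radii
+    sum of 2.5, so approaching 2.635 with 26 circles requires both squeezing in an extra circle and,
+    in practice, letting radii vary near the edges and corners.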
+ + ' +evaluator: + timeout: 360 + cascade_evaluation: true + cascade_thresholds: + - 0.3 + - 0.6 + +# Live monitor dashboard +monitor: + enabled: true + port: 8765 + host: "127.0.0.1" + +# Human feedback +human_feedback_enabled: true diff --git a/benchmarks/math/circle_packing/evaluator.py b/benchmarks/math/circle_packing/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b917d56f304faece9cfe9cbba92e005cc2c861 --- /dev/null +++ b/benchmarks/math/circle_packing/evaluator.py @@ -0,0 +1,338 @@ +""" +Evaluator for circle packing example (n=26) with improved timeout handling +""" + +import numpy as np +import time +import os +import subprocess +import tempfile +import traceback +import sys +import pickle + + +class TimeoutError(Exception): + pass + + +def timeout_handler(signum, frame): + """Handle timeout signal""" + raise TimeoutError("Function execution timed out") + + +def validate_packing(centers, radii): + """ + Validate that circles don't overlap and are inside the unit square + + Args: + centers: np.array of shape (n, 2) with (x, y) coordinates + radii: np.array of shape (n) with radius of each circle + + Returns: + True if valid, False otherwise + """ + n = centers.shape[0] + + # Check for NaN values + if np.isnan(centers).any(): + print("NaN values detected in circle centers") + return False + + if np.isnan(radii).any(): + print("NaN values detected in circle radii") + return False + + # Check if radii are nonnegative and not nan + for i in range(n): + if radii[i] < 0: + print(f"Circle {i} has negative radius {radii[i]}") + return False + elif np.isnan(radii[i]): + print(f"Circle {i} has nan radius") + return False + + # Check if circles are inside the unit square + for i in range(n): + x, y = centers[i] + r = radii[i] + if x - r < -1e-6 or x + r > 1 + 1e-6 or y - r < -1e-6 or y + r > 1 + 1e-6: + print(f"Circle {i} at ({x}, {y}) with radius {r} is outside the unit square") + return False + + # Check for overlaps + for i in range(n): + for j in range(i + 1, n): + dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2)) + if dist < radii[i] + radii[j] - 1e-6: # Allow for tiny numerical errors + print(f"Circles {i} and {j} overlap: dist={dist}, r1+r2={radii[i]+radii[j]}") + return False + + return True + + +def run_with_timeout(program_path, timeout_seconds=20): + """ + Run the program in a separate process with timeout + using a simple subprocess approach + + Args: + program_path: Path to the program file + timeout_seconds: Maximum execution time in seconds + + Returns: + centers, radii, sum_radii tuple from the program + """ + # Create a temporary file to execute + with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file: + # Write a script that executes the program and saves results + script = f""" +import sys +import numpy as np +import os +import pickle +import traceback + +# Add the directory to sys.path +sys.path.insert(0, os.path.dirname('{program_path}')) + +# Debugging info +print(f"Running in subprocess, Python version: {{sys.version}}") +print(f"Program path: {program_path}") + +try: + # Import the program + spec = __import__('importlib.util').util.spec_from_file_location("program", '{program_path}') + program = __import__('importlib.util').util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Run the packing function + print("Calling run_packing()...") + centers, radii, sum_radii = program.run_packing() + print(f"run_packing() returned successfully: sum_radii = {{sum_radii}}") + + # Save results to a file + 
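+    # (A pickle file is used instead of a pipe so large numpy arrays survive
+    # the process boundary even when the child also prints to stdout/stderr.)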
results = {{ + 'centers': centers, + 'radii': radii, + 'sum_radii': sum_radii + }} + + with open('{temp_file.name}.results', 'wb') as f: + pickle.dump(results, f) + print(f"Results saved to {temp_file.name}.results") + +except Exception as e: + # If an error occurs, save the error instead + print(f"Error in subprocess: {{str(e)}}") + traceback.print_exc() + with open('{temp_file.name}.results', 'wb') as f: + pickle.dump({{'error': str(e)}}, f) + print(f"Error saved to {temp_file.name}.results") +""" + temp_file.write(script.encode()) + temp_file_path = temp_file.name + + results_path = f"{temp_file_path}.results" + + try: + # Run the script with timeout + process = subprocess.Popen( + [sys.executable, temp_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + try: + stdout, stderr = process.communicate(timeout=timeout_seconds) + exit_code = process.returncode + + # Always print output for debugging purposes + print(f"Subprocess stdout: {stdout.decode()}") + if stderr: + print(f"Subprocess stderr: {stderr.decode()}") + + # Still raise an error for non-zero exit codes, but only after printing the output + if exit_code != 0: + raise RuntimeError(f"Process exited with code {exit_code}") + + # Load the results + if os.path.exists(results_path): + with open(results_path, "rb") as f: + results = pickle.load(f) + + # Check if an error was returned + if "error" in results: + raise RuntimeError(f"Program execution failed: {results['error']}") + + return results["centers"], results["radii"], results["sum_radii"] + else: + raise RuntimeError("Results file not found") + + except subprocess.TimeoutExpired: + # Kill the process if it times out + process.kill() + process.wait() + raise TimeoutError(f"Process timed out after {timeout_seconds} seconds") + + finally: + # Clean up temporary files + if os.path.exists(temp_file_path): + os.unlink(temp_file_path) + if os.path.exists(results_path): + os.unlink(results_path) + + +def evaluate(program_path): + """ + Evaluate the program by running it once and checking the sum of radii + + Args: + program_path: Path to the program file + + Returns: + Dictionary of metrics + """ + # Target value from the paper + TARGET_VALUE = 2.635 # AlphaEvolve result for n=26 + + try: + # For constructor-based approaches, a single evaluation is sufficient + # since the result is deterministic + start_time = time.time() + + # Use subprocess to run with timeout + centers, radii, reported_sum = run_with_timeout( + program_path, timeout_seconds=600 # Single timeout + ) + + end_time = time.time() + eval_time = end_time - start_time + + # Ensure centers and radii are numpy arrays + if not isinstance(centers, np.ndarray): + centers = np.array(centers) + if not isinstance(radii, np.ndarray): + radii = np.array(radii) + + # Check for NaN values before validation + if np.isnan(centers).any() or np.isnan(radii).any(): + print("NaN values detected in solution") + return { + "sum_radii": 0.0, + "target_ratio": 0.0, + "validity": 0.0, + "eval_time": float(time.time() - start_time), + "combined_score": 0.0, + } + + # Validate solution + valid = validate_packing(centers, radii) + + # Check shape and size + shape_valid = centers.shape == (26, 2) and radii.shape == (26,) + if not shape_valid: + print( + f"Invalid shapes: centers={centers.shape}, radii={radii.shape}, expected (26, 2) and (26,)" + ) + valid = False + + # Calculate sum + sum_radii = np.sum(radii) if valid else 0.0 + + # Make sure reported_sum matches the calculated sum + if abs(sum_radii - reported_sum) > 1e-6: + 
print(f"Warning: Reported sum {reported_sum} doesn't match calculated sum {sum_radii}") + + # Target ratio (how close we are to the target) + target_ratio = sum_radii / TARGET_VALUE if valid else 0.0 + + # Validity score + validity = 1.0 if valid else 0.0 + + # Combined score - higher is better + combined_score = target_ratio * validity + + print( + f"Evaluation: valid={valid}, sum_radii={sum_radii:.6f}, target={TARGET_VALUE}, ratio={target_ratio:.6f}, time={eval_time:.2f}s" + ) + + return { + "sum_radii": float(sum_radii), + "target_ratio": float(target_ratio), + "validity": float(validity), + "eval_time": float(eval_time), + "combined_score": float(combined_score), + } + + except Exception as e: + print(f"Evaluation failed completely: {str(e)}") + traceback.print_exc() + return { + "sum_radii": 0.0, + "target_ratio": 0.0, + "validity": 0.0, + "eval_time": 0.0, + "combined_score": 0.0, + } + + +# Stage-based evaluation for cascade evaluation +def evaluate_stage1(program_path): + """ + First stage evaluation - quick validation check + """ + try: + # Use the simplified subprocess approach + try: + centers, radii, sum_radii = run_with_timeout(program_path, timeout_seconds=600) + + # Ensure centers and radii are numpy arrays + if not isinstance(centers, np.ndarray): + centers = np.array(centers) + if not isinstance(radii, np.ndarray): + radii = np.array(radii) + + # Validate solution (shapes and constraints) + shape_valid = centers.shape == (26, 2) and radii.shape == (26,) + if not shape_valid: + print(f"Invalid shapes: centers={centers.shape}, radii={radii.shape}") + return {"validity": 0.0, "error": "Invalid shapes"} + + valid = validate_packing(centers, radii) + + # Calculate sum + actual_sum = np.sum(radii) if valid else 0.0 + + # Target from paper + target = 2.635 + + # Simple combined score for stage 1 + combined_score = (actual_sum / target) if valid else 0.0 + + # Return evaluation metrics + return { + "validity": 1.0 if valid else 0.0, + "sum_radii": float(actual_sum), + "target_ratio": float(actual_sum / target if valid else 0.0), + "combined_score": float(combined_score), + } + + except TimeoutError as e: + print(f"Stage 1 evaluation timed out: {e}") + return {"validity": 0.0, "combined_score": 0.0, "error": "Timeout"} + except Exception as e: + print(f"Stage 1 evaluation failed: {e}") + print(traceback.format_exc()) + return {"validity": 0.0, "combined_score": 0.0, "error": str(e)} + + except Exception as e: + print(f"Stage 1 evaluation failed completely: {e}") + print(traceback.format_exc()) + return {"validity": 0.0, "combined_score": 0.0, "error": str(e)} + + +def evaluate_stage2(program_path): + """ + Second stage evaluation - full evaluation + """ + # Full evaluation as in the main evaluate function + return evaluate(program_path) diff --git a/benchmarks/math/circle_packing/evaluator/Dockerfile b/benchmarks/math/circle_packing/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ae2cc7ee1e4182f1bfab7c3e8d7870350c17348a --- /dev/null +++ b/benchmarks/math/circle_packing/evaluator/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY evaluator.py . +COPY evaluate.sh . 
+RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/circle_packing/evaluator/evaluate.sh b/benchmarks/math/circle_packing/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..946880edb22ffa1397d3ef777b929f1440067852 --- /dev/null +++ b/benchmarks/math/circle_packing/evaluator/evaluate.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) is accepted but ignored — pure optimization has no data split. + +echo "[$(date '+%H:%M:%S')] eval start: $PROGRAM" >> /tmp/eval.log +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/circle_packing/evaluator/evaluator.py b/benchmarks/math/circle_packing/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..9f961809a487114446ddb14d70b3b3d99147297a --- /dev/null +++ b/benchmarks/math/circle_packing/evaluator/evaluator.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +""" +Evaluate a circle-packing candidate program (n=26 circles in a unit square). + +Usage: run.py + +The candidate must define: run_packing() -> (centers, radii, sum_radii) + +Writes a single JSON object to stdout following the SkyDiscover evaluator schema. +""" + +import importlib.util +import json +import signal +import sys +import time +import traceback + +import numpy as np + +N = 26 +TIMEOUT_SECONDS = 300 + + +def _alarm_handler(signum, frame): + raise TimeoutError(f"Program timed out after {TIMEOUT_SECONDS}s") + + +def run_program(program_path): + """Import and call run_packing() from the candidate program.""" + spec = importlib.util.spec_from_file_location("program", program_path) + prog = importlib.util.module_from_spec(spec) + signal.signal(signal.SIGALRM, _alarm_handler) + signal.alarm(TIMEOUT_SECONDS) + try: + spec.loader.exec_module(prog) + return prog.run_packing() + finally: + signal.alarm(0) + + +def validate_packing(centers, radii): + if np.isnan(centers).any() or np.isnan(radii).any(): + return False, "NaN values in output" + for i in range(len(radii)): + if radii[i] < 0: + return False, f"Circle {i} has negative radius" + for i in range(len(radii)): + x, y = centers[i] + r = radii[i] + if x - r < -1e-6 or x + r > 1 + 1e-6 or y - r < -1e-6 or y + r > 1 + 1e-6: + return False, f"Circle {i} outside unit square" + for i in range(len(radii)): + for j in range(i + 1, len(radii)): + dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2)) + if dist < radii[i] + radii[j] - 1e-6: + return False, f"Circles {i} and {j} overlap" + return True, "" + + +def fail(status, reason, elapsed=0.0): + print(json.dumps({ + "status": status, + "combined_score": 0.0, + "metrics": {"combined_score": 0.0, "sum_radii": 0.0, "validity": 0.0, + "eval_time": elapsed}, + "artifacts": {"error": reason}, + })) + + +def main(): + if len(sys.argv) != 2: + print("Usage: run.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + def log(msg): + with open("/tmp/eval.log", "a") as f: + f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n") + + start = time.time() + try: + centers, radii, _ = run_program(program_path) + except TimeoutError as e: + log(f"timeout: {e}") + fail("timeout", str(e)) + return + except Exception as e: + log(f"error: {e}") + fail("error", f"{e}\n{traceback.format_exc()}") + return + elapsed = time.time() - start + + centers = np.asarray(centers) + radii = np.asarray(radii) + + if centers.shape != (N, 2) or radii.shape != (N,): + log(f"bad shapes: centers={centers.shape}, radii={radii.shape}") + fail("error", f"Wrong 
shapes: centers={centers.shape}, radii={radii.shape}", elapsed) + return + + valid, reason = validate_packing(centers, radii) + sum_radii = float(np.sum(radii)) if valid else 0.0 + log(f"done in {elapsed:.3f}s — sum_radii={sum_radii:.6f} valid={valid}" + + (f" ({reason})" if not valid else "")) + + print(json.dumps({ + "status": "success", + "combined_score": sum_radii, + "metrics": { + "combined_score": sum_radii, + "sum_radii": sum_radii, + "validity": 1.0 if valid else 0.0, + "eval_time": elapsed, + }, + "artifacts": ({} if valid else {"error": reason}), + })) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/math/circle_packing/evaluator/requirements.txt b/benchmarks/math/circle_packing/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..24ce15ab7ead32f98c7ac3edcd34bb2010ff4326 --- /dev/null +++ b/benchmarks/math/circle_packing/evaluator/requirements.txt @@ -0,0 +1 @@ +numpy diff --git a/benchmarks/math/circle_packing/initial_program.py b/benchmarks/math/circle_packing/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..cb4ea397e840733dd0e4823548c5be8fbd1e1a56 --- /dev/null +++ b/benchmarks/math/circle_packing/initial_program.py @@ -0,0 +1,133 @@ +# EVOLVE-BLOCK-START +"""Constructor-based circle packing for n=26 circles""" +import numpy as np + + +def construct_packing(): + """ + Construct a specific arrangement of 26 circles in a unit square + that attempts to maximize the sum of their radii. + + Returns: + Tuple of (centers, radii, sum_of_radii) + centers: np.array of shape (26, 2) with (x, y) coordinates + radii: np.array of shape (26) with radius of each circle + sum_of_radii: Sum of all radii + """ + # Initialize arrays for 26 circles + n = 26 + centers = np.zeros((n, 2)) + + # Place circles in a structured pattern + # This is a simple pattern - evolution will improve this + + # First, place a large circle in the center + centers[0] = [0.5, 0.5] + + # Place 8 circles around it in a ring + for i in range(8): + angle = 2 * np.pi * i / 8 + centers[i + 1] = [0.5 + 0.3 * np.cos(angle), 0.5 + 0.3 * np.sin(angle)] + + # Place 16 more circles in an outer ring + for i in range(16): + angle = 2 * np.pi * i / 16 + centers[i + 9] = [0.5 + 0.7 * np.cos(angle), 0.5 + 0.7 * np.sin(angle)] + + # Additional positioning adjustment to make sure all circles + # are inside the square and don't overlap + # Clip to ensure everything is inside the unit square + centers = np.clip(centers, 0.01, 0.99) + + # Compute maximum valid radii for this configuration + radii = compute_max_radii(centers) + + # Calculate the sum of radii + sum_radii = np.sum(radii) + + return centers, radii, sum_radii + + +def compute_max_radii(centers): + """ + Compute the maximum possible radii for each circle position + such that they don't overlap and stay within the unit square. 
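+
+    This is a conservative heuristic: when two circles would overlap, both
+    radii are scaled down proportionally, which guarantees a valid packing
+    but can leave slack that other pairs cannot reclaim.
+
+    Example:
+        >>> compute_max_radii(np.array([[0.25, 0.5], [0.75, 0.5]]))
+        array([0.25, 0.25])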
+ + Args: + centers: np.array of shape (n, 2) with (x, y) coordinates + + Returns: + np.array of shape (n) with radius of each circle + """ + n = centers.shape[0] + radii = np.ones(n) + + # First, limit by distance to square borders + for i in range(n): + x, y = centers[i] + # Distance to borders + radii[i] = min(x, y, 1 - x, 1 - y) + + # Then, limit by distance to other circles + # Each pair of circles with centers at distance d can have + # sum of radii at most d to avoid overlap + for i in range(n): + for j in range(i + 1, n): + dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2)) + + # If current radii would cause overlap + if radii[i] + radii[j] > dist: + # Scale both radii proportionally + scale = dist / (radii[i] + radii[j]) + radii[i] *= scale + radii[j] *= scale + + return radii + + +# EVOLVE-BLOCK-END + + +# This part remains fixed (not evolved) +def run_packing(): + """Run the circle packing constructor for n=26""" + centers, radii, sum_radii = construct_packing() + return centers, radii, sum_radii + + +def visualize(centers, radii): + """ + Visualize the circle packing + + Args: + centers: np.array of shape (n, 2) with (x, y) coordinates + radii: np.array of shape (n) with radius of each circle + """ + import matplotlib.pyplot as plt + from matplotlib.patches import Circle + + fig, ax = plt.subplots(figsize=(8, 8)) + + # Draw unit square + ax.set_xlim(0, 1) + ax.set_ylim(0, 1) + ax.set_aspect("equal") + ax.grid(True) + + # Draw circles + for i, (center, radius) in enumerate(zip(centers, radii)): + circle = Circle(center, radius, alpha=0.5) + ax.add_patch(circle) + ax.text(center[0], center[1], str(i), ha="center", va="center") + + plt.title(f"Circle Packing (n={len(centers)}, sum={sum(radii):.6f})") + plt.show() + + +if __name__ == "__main__": + centers, radii, sum_radii = run_packing() + print(f"Sum of radii: {sum_radii}") + # AlphaEvolve improved this to 2.635 + + # Uncomment to visualize: + visualize(centers, radii) diff --git a/benchmarks/math/circle_packing/requirements.txt b/benchmarks/math/circle_packing/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bad10388ecb1eefd890a797d833976a5e631541 --- /dev/null +++ b/benchmarks/math/circle_packing/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/benchmarks/math/circle_packing_rect/config.yaml b/benchmarks/math/circle_packing_rect/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..527c634a14741d929f2bd7e41f10fb23d6867f08 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/config.yaml @@ -0,0 +1,33 @@ +# Math benchmark: circle_packing_rect +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist with deep expertise in circle\ + \ packing problems, geometric optimization algorithms, and constraint satisfaction.\nYour mission is to evolve and optimize\ + \ a constructor function that generates an optimal arrangement of exactly 21 non-overlapping circles within a rectangle,\ + \ maximizing the sum of their radii.\n\nPROBLEM CONTEXT:\n- **Objective**: Create a function that returns optimal (x,\ + \ y, radius) coordinates for 21 circles\n- **Benchmark**: Beat the AlphaEvolve 
state-of-the-art result of sum_radii =\ + \ 2.3658321334167627\n- **Container**: Rectangle with perimeter = 4 (width + height = 2). You may choose optimal width/height\ + \ ratio\n- **Constraints**: \n * All circles must be fully contained within rectangle boundaries\n * No circle overlaps\ + \ (distance between centers ≥ sum of their radii)\n * Exactly 21 circles required\n * All radii must be positive\n\n\ + PERFORMANCE METRICS:\n1. **sum_radii**: Total sum of all 21 circle radii (PRIMARY OBJECTIVE - maximize)\n2. **combined_score**:\ + \ sum_radii / 2.3658321334167627 (progress toward beating benchmark) \n3. **eval_time**: Execution time in seconds (keep\ + \ reasonable, prefer accuracy over speed)\n\nTECHNICAL REQUIREMENTS:\n- **Determinism**: Use fixed random seeds if employing\ + \ stochastic methods for reproducibility\n- **Error handling**: Graceful handling of optimization failures or infeasible\ + \ configurations\n- **Memory efficiency**: Avoid excessive memory allocation for distance matrix computations\n- **Scalability**:\ + \ Design with potential extension to different circle counts in mind\n" +evaluator: + timeout: 360 + max_retries: 3 + cascade_evaluation: false diff --git a/benchmarks/math/circle_packing_rect/evaluator/Dockerfile b/benchmarks/math/circle_packing_rect/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/circle_packing_rect/evaluator/evaluate.sh b/benchmarks/math/circle_packing_rect/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/circle_packing_rect/evaluator/evaluator.py b/benchmarks/math/circle_packing_rect/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a12b218b019e6bdb363afdc47a6cfdc5c9bfe4 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/evaluator.py @@ -0,0 +1,119 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the circle packing problem on a rectangle +# of perimeter 4. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. 
+# +# ===--------------------------------------------------------------------------------------===# + +import time +import numpy as np +import sys +import os +from importlib import __import__ + +BENCHMARK = 2.3658321334167627 +NUM_CIRCLES = 21 +TOL = 1e-6 + + +def minimum_circumscribing_rectangle(circles: np.ndarray): + """Returns the width and height of the minimum circumscribing rectangle. + + Args: + circles: A numpy array of shape (num_circles, 3), where each row is of the + form (x, y, radius), specifying a circle. + + Returns: + A tuple (width, height) of the minimum circumscribing rectangle. + """ + min_x = np.min(circles[:, 0] - circles[:, 2]) + max_x = np.max(circles[:, 0] + circles[:, 2]) + min_y = np.min(circles[:, 1] - circles[:, 2]) + max_y = np.max(circles[:, 1] + circles[:, 2]) + return max_x - min_x, max_y - min_y + + +def validate_packing_radii(radii: np.ndarray) -> None: + n = len(radii) + for i in range(n): + if radii[i] < 0: + raise ValueError(f"Circle {i} has negative radius {radii[i]}") + elif np.isnan(radii[i]): + raise ValueError(f"Circle {i} has nan radius") + + +def validate_packing_overlap_wtol(circles: np.ndarray, tol: float = 1e-6) -> None: + n = len(circles) + for i in range(n): + for j in range(i + 1, n): + dist = np.sqrt(np.sum((circles[i, :2] - circles[j, :2]) ** 2)) + if dist < circles[i, 2] + circles[j, 2] - tol: + raise ValueError( + f"Circles {i} and {j} overlap: dist={dist}, r1+r2={circles[i,2]+circles[j,2]}" + ) + + +def validate_packing_inside_rect_wtol(circles: np.array, tol: float = 1e-6) -> None: + width, height = minimum_circumscribing_rectangle(circles) + if width + height > (2 + tol): + raise ValueError("Circles are not contained inside a rectangle of perimeter 4.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + circles = None + eval_time = 0 + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + + start_time = time.time() + circles = program.circle_packing21() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(circles, np.ndarray): + circles = np.array(circles) + + if circles.shape != (NUM_CIRCLES, 3): + raise ValueError( + f"Invalid shapes: circles = {circles.shape}, expected {(NUM_CIRCLES,3)}" + ) + + validate_packing_radii(circles[:, -1]) + validate_packing_overlap_wtol(circles, TOL) + validate_packing_inside_rect_wtol(circles, TOL) + + radii_sum = np.sum(circles[:, -1]) + + return { + "radii_sum": float(radii_sum), + "combined_score": float(radii_sum / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
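+    # run(evaluate) emits a single JSON object on stdout
+    # ({status, combined_score, metrics, artifacts}) for the container harness.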
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/circle_packing_rect/evaluator/requirements.txt b/benchmarks/math/circle_packing_rect/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bad10388ecb1eefd890a797d833976a5e631541 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/benchmarks/math/circle_packing_rect/evaluator/wrapper.py b/benchmarks/math/circle_packing_rect/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
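+    # bool is tested before (int, float) because bool subclasses int in Python;
+    # list/dict values are JSON-encoded so artifacts are always plain strings.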
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/circle_packing_rect/initial_program.py b/benchmarks/math/circle_packing_rect/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..601ae3866931fa9c0baf6a6ff7b273ba29468710 --- /dev/null +++ b/benchmarks/math/circle_packing_rect/initial_program.py @@ -0,0 +1,22 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def circle_packing21() -> np.ndarray: + """ + Places 21 non-overlapping circles inside a rectangle of perimeter 4 in order to maximize the sum of their radii. + + Returns: + circles: np.array of shape (21,3), where the i-th row (x,y,r) stores the (x,y) coordinates of the i-th circle of radius r. + """ + n = 21 + circles = np.zeros((n, 3)) + + return circles + + +# EVOLVE-BLOCK-END + +if __name__ == "__main__": + circles = circle_packing21() + print(f"Radii sum: {np.sum(circles[:,-1])}") diff --git a/benchmarks/math/erdos_min_overlap/config.yaml b/benchmarks/math/erdos_min_overlap/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d15966b4b056962769db1ed82e48e951b548a1a5 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/config.yaml @@ -0,0 +1,41 @@ +# Math benchmark: erdos_min_overlap +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery. + Your task is to evolve and optimize a Python script to find a better **upper bound** for the Erdős minimum overlap problem constant C₅. + + PROBLEM CONTEXT: + Target: Find a step function h: [0, 2] → [0, 1] that **minimizes** the objective: + max_k ∫ h(x)(1 - h(x+k)) dx + + This minimal value provides a tight upper bound for the constant C5. + + Current best known upper bound: C5 ≤ 0.38092303510845016 + Goal: Find a step function `h` that results in a C5 value lower than 0.38092303510845016. + + CONSTRAINTS: + 1. The function `h` must have values in the range [0, 1]. + 2. The integral of h(x) over [0, 2] must be exactly 1. + + PERFORMANCE METRICS: + - c5_bound: The bound found by the program. + - combined_score: 0.38092303510845016 / c5_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record). + - n_points: number of points used in the discretization. + - eval_time: evaluation time of the program. 
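+
+    DISCRETIZATION HINT (one workable approach): represent h as n equal-width steps on [0, 2] with
+    dx = 2/n; the objective over all shifts k can then be evaluated at once as
+    np.max(np.correlate(h, 1.0 - h, mode="full") * dx), which is also how the evaluator re-checks
+    the reported bound.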
+evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile b/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/erdos_min_overlap/evaluator/evaluate.sh b/benchmarks/math/erdos_min_overlap/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/erdos_min_overlap/evaluator/evaluator.py b/benchmarks/math/erdos_min_overlap/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..cd2d8c0ca4ff676fff7ca07ac17d042aaa72cb31 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/evaluator.py @@ -0,0 +1,84 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the erdos minimum overlap problem. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +# Known bounds +BENCHMARK = 0.38092303510845016 + + +def verify_c5_solution(h_values: np.ndarray, c5_achieved: float, n_points: int): + """Verifies the C5 upper bound solution.""" + + if h_values.shape != (n_points,): + raise ValueError(f"Expected h shape ({n_points},), got {h_values.shape}") + + # Verify h(x) in [0, 1] constraint + if np.any(h_values < 0) or np.any(h_values > 1): + raise ValueError(f"h(x) is not in [0, 1]. Range: [{h_values.min()}, {h_values.max()}]") + + # Verify integral of h = 1 constraint + dx = 2.0 / n_points + integral_h = np.sum(h_values) * dx + if not np.isclose(integral_h, 1.0, atol=1e-3): + raise ValueError(f"Integral of h is not close to 1. 
Got: {integral_h:.6f}") + + # Re-calculate the C5 bound using np.correlate + j_values = 1.0 - h_values + correlation = np.correlate(h_values, j_values, mode="full") * dx + computed_c5 = np.max(correlation) + + # Check for consistency + if not np.isclose(computed_c5, c5_achieved, atol=1e-4): + raise ValueError(f"C5 mismatch: reported {c5_achieved:.6f}, computed {computed_c5:.6f}") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + h_values, c5_bound, n_points = program.run() + end_time = time.time() + eval_time = end_time - start_time + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_c5_solution(h_values, c5_bound, n_points) + + return { + "c5_bound": float(c5_bound), + "combined_score": BENCHMARK / float(c5_bound), + "n_points": int(n_points), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt b/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/erdos_min_overlap/evaluator/wrapper.py b/benchmarks/math/erdos_min_overlap/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
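+    # Only the single JSON object printed after restoring stdout reaches the
+    # harness; everything the evaluator itself prints lands on stderr.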
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/erdos_min_overlap/initial_program.py b/benchmarks/math/erdos_min_overlap/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..7d07d8cd584e0ec37f112db83a14db9bc708b9a5 --- /dev/null +++ b/benchmarks/math/erdos_min_overlap/initial_program.py @@ -0,0 +1,96 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass +import tqdm + + +@dataclass +class Hyperparameters: + num_intervals: int = 200 + learning_rate: float = 0.005 + num_steps: int = 20000 + penalty_strength: float = 1000000.0 + + +class ErdosOptimizer: + """ + Finds a step function h that minimizes the maximum overlap integral. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.domain_width = 2.0 + self.dx = self.domain_width / self.hypers.num_intervals + + def _objective_fn(self, latent_h_values: jnp.ndarray) -> jnp.ndarray: + """ + The loss function includes the objective and a penalty for the constraint. 
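+
+        Concretely: loss = max_k [(h corr (1 - h))(k) * dx]
+                           + penalty_strength * (sum(h) * dx - 1) ** 2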
+ """ + # Enforce h(x) in [0, 1] via sigmoid (hard constraint) + h = jax.nn.sigmoid(latent_h_values) + + # Calculate the primary objective (max correlation) + j = 1.0 - h + N = self.hypers.num_intervals + h_padded = jnp.pad(h, (0, N)) + j_padded = jnp.pad(j, (0, N)) + corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded)) + correlation = jnp.fft.ifft(corr_fft).real + scaled_correlation = correlation * self.dx + objective_loss = jnp.max(scaled_correlation) + + # Calculate the penalty for the integral constraint + integral_h = jnp.sum(h) * self.dx + constraint_loss = (integral_h - 1.0) ** 2 + + # Combine the objective with the penalty + total_loss = objective_loss + self.hypers.penalty_strength * constraint_loss + return total_loss + + def run_optimization(self): + optimizer = optax.adam(self.hypers.learning_rate) + + key = jax.random.PRNGKey(42) + latent_h_values = jax.random.normal(key, (self.hypers.num_intervals,)) + + opt_state = optimizer.init(latent_h_values) + + @jax.jit + def train_step(latent_h_values, opt_state): + loss, grads = jax.value_and_grad(self._objective_fn)(latent_h_values) + updates, opt_state = optimizer.update(grads, opt_state) + latent_h_values = optax.apply_updates(latent_h_values, updates) + return latent_h_values, opt_state, loss + + print(f"Optimizing a step function with {self.hypers.num_intervals} intervals...") + for step in tqdm.tqdm(range(self.hypers.num_steps), desc="Optimizing"): + latent_h_values, opt_state, loss = train_step(latent_h_values, opt_state) + + # Final h is just the sigmoid of the latent values + final_h = jax.nn.sigmoid(latent_h_values) + + # Re-calculate final objective loss without the penalty for the report + j = 1.0 - final_h + N = self.hypers.num_intervals + h_padded = jnp.pad(final_h, (0, N)) + j_padded = jnp.pad(j, (0, N)) + corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded)) + correlation = jnp.fft.ifft(corr_fft).real + c5_bound = jnp.max(correlation * self.dx) + + print(f"Optimization complete. Final C5 upper bound: {c5_bound:.8f}") + return np.array(final_h), float(c5_bound) + + +def run(): + hypers = Hyperparameters() + optimizer = ErdosOptimizer(hypers) + final_h_values, c5_bound = optimizer.run_optimization() + + return final_h_values, c5_bound, hypers.num_intervals + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/first_autocorr_ineq/config.yaml b/benchmarks/math/first_autocorr_ineq/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b097f3730e1a3903f288a67f1b3d4e5fac2b871e --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/config.yaml @@ -0,0 +1,102 @@ +# Math benchmark: first_autocorr_ineq +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: 'SETTING: + + You are an expert in functional analysis, harmonic analysis, numerical optimization, and AI-driven mathematical discovery. + + Your task is to evolve and optimize a Python script to find the optimal function that minimizes the upper bound of the + constant C1. + + + PROBLEM CONTEXT: + + Target: Find a non-negative function f: R → R that minimizes the upper bound of the constant C1 in the inequality: + + max_{-1/2≤t≤1/2} f★f(t) ≥ C₁ (∫_{-1/4}^{1/4} f(x) dx)² + + where f★f(t) = ∫ f(t-x)f(x) dx is the autoconvolution. 
+ + + Current best known bounds: + + * literature: 1.28 ≤ C1 ≤ 1.5098 + + * alphaevolve: C1 ≤ 1.5052939684401607 + + Goal: Beat the current upper bound of 1.5052939684401607 discovered by step functions and alphaevolve. + + + Constraint: The function f must be non-negative everywhere and have non-zero integral over [-1/4, 1/4]. + + + MATHEMATICAL FORMULATION: + + Given: Discretized domain [-1/4, 1/4] with n_points equally-spaced grid points. + + Objective: Minimize min_{t∈[-1/2,1/2]} (f★f)(t) / (∫f dx)² over all non-negative functions f. + + Discretization: Use finite differences and discrete convolution to approximate integrals and autoconvolution. + + + PERFORMANCE METRICS: + + combined_score: The 1.5052939684401607/C1 constant achieved by the discovered function (PRIMARY OBJECTIVE - maximize this) + + c1: constant achieved (current best upper bound) + + eval_time: Time to reach best solution + + n_points: number of points used in the integral interval + + loss: loss valued of the function used in minimization + + + VALIDATION FRAMEWORK: + + Mathematical Validation: Verify the C1 computation using independent numerical integration + + Non-negativity Check: Ensure f(x) ≥ 0 everywhere (up to numerical tolerance) + + Integral Verification: Confirm ∫f dx > 0 to avoid degenerate solutions + + Consistency Check: Re-compute autoconvolution and verify inequality holds + + + TECHNICAL REQUIREMENTS: + + Reproducibility: Control random seeds for deterministic results + + Numerical Stability: Handle potential division by zero in integral ratios + + Memory Management: Discrete convolution can be memory-intensive for large grids + + Constraint Handling: Maintain non-negativity throughout optimization + + + SUCCESS CRITERIA: + + Primary: Achieving c1 < 1.5052939684401607 (beating current record) + + Secondary: Finding interpretable functions that achieve high C1 values + + Robustness: Solutions that work across multiple runs and parameter settings + + Efficiency: Fast convergence to high-quality solutions + + ' +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/first_autocorr_ineq/evaluator/Dockerfile b/benchmarks/math/first_autocorr_ineq/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/first_autocorr_ineq/evaluator/evaluator.py b/benchmarks/math/first_autocorr_ineq/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..cea82cf5aaf80ce544b38822bb3b567165dc9e28 --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/evaluator/evaluator.py @@ -0,0 +1,95 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the first autocorrelation inequality problem. 
+# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +# known bounds +BENCHMARK = 1.5052939684401607 + + +def verify_autocorrelation_solution(f_values: np.ndarray, c1_achieved: float, n_points: int): + """Verify the autocorrelation solution for UPPER BOUND optimization""" + + # Check shape + if f_values.shape != (n_points,): + raise ValueError(f"Expected function values shape {(n_points,)}. Got {f_values.shape}.") + + # Check non-negativity + if np.any(f_values < 0.0): + raise ValueError("Function must be non-negative.") + + # Recompute C1 to verify + dx = 0.5 / n_points + f_nonneg = np.maximum(f_values, 0.0) + + # Compute the FULL autoconvolution + autoconv = np.convolve(f_nonneg, f_nonneg, mode="full") * dx + + # The rest of the calculation can be simplified as we now take the max over the whole result + integral_sq = (np.sum(f_nonneg) * dx) ** 2 + + if integral_sq < 1e-8: + raise ValueError("Function integral is too small.") + + # The max of the full autoconv is the correct value + computed_c1 = float(np.max(autoconv / integral_sq)) + + # Verify consistency + delta = abs(computed_c1 - c1_achieved) + if delta > 1e-6: + raise ValueError( + f"C1 mismatch: reported {c1_achieved:.6f}, computed {computed_c1:.6f}, delta: {delta:.6f}" + ) + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + f_values, c1_achieved, loss, n_points = program.run() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_autocorrelation_solution(f_values, c1_achieved, n_points) + return { + "c1": float(c1_achieved), + "combined_score": BENCHMARK / float(c1_achieved), + "loss": float(loss), + "n_points": int(n_points), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/first_autocorr_ineq/evaluator/requirements.txt b/benchmarks/math/first_autocorr_ineq/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/first_autocorr_ineq/evaluator/wrapper.py b/benchmarks/math/first_autocorr_ineq/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. 
+ +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/first_autocorr_ineq/initial_program.py b/benchmarks/math/first_autocorr_ineq/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..94a7aa58903cd1eb50d94510f947b2eebd5330f7 --- /dev/null +++ b/benchmarks/math/first_autocorr_ineq/initial_program.py @@ -0,0 +1,117 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass + + +@dataclass +class Hyperparameters: + """Hyperparameters for the optimization process.""" + + num_intervals: int = 600 + learning_rate: float = 0.005 + end_lr_factor: float = 1e-4 + num_steps: int = 40000 + warmup_steps: int = 2000 + + +class AutocorrelationOptimizer: + """ + Optimizes a discretized function to find the minimal C1 constant. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.domain_width = 0.5 + self.dx = self.domain_width / self.hypers.num_intervals + + def _objective_fn(self, f_values: jnp.ndarray) -> jnp.ndarray: + """ + Computes the objective function, which is the C1 ratio. + We minimize this ratio to find a tight upper bound. 
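+
+        The FFT here computes a linear (not circular) autoconvolution:
+        f is zero-padded to length 2N first, so wrap-around terms vanish.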
+ """ + f_non_negative = jax.nn.relu(f_values) + integral_f = jnp.sum(f_non_negative) * self.dx + + eps = 1e-9 + integral_f_safe = jnp.maximum(integral_f, eps) + + N = self.hypers.num_intervals + padded_f = jnp.pad(f_non_negative, (0, N)) + + fft_f = jnp.fft.fft(padded_f) + fft_conv = fft_f * fft_f + conv_f_f = jnp.fft.ifft(fft_conv).real + + # Scale by dx. + scaled_conv_f_f = conv_f_f * self.dx + + max_conv = jnp.max(scaled_conv_f_f) + c1_ratio = max_conv / (integral_f_safe**2) + + # Return the value to be MINIMIZED. + return c1_ratio + + def train_step(self, f_values: jnp.ndarray, opt_state: optax.OptState) -> tuple: + """Performs a single training step.""" + loss, grads = jax.value_and_grad(self._objective_fn)(f_values) + updates, opt_state = self.optimizer.update(grads, opt_state, f_values) + f_values = optax.apply_updates(f_values, updates) + + return f_values, opt_state, loss + + def run_optimization(self): + """Sets up and runs the full optimization process.""" + schedule = optax.warmup_cosine_decay_schedule( + init_value=0.0, + peak_value=self.hypers.learning_rate, + warmup_steps=self.hypers.warmup_steps, + decay_steps=self.hypers.num_steps - self.hypers.warmup_steps, + end_value=self.hypers.learning_rate * self.hypers.end_lr_factor, + ) + self.optimizer = optax.adam(learning_rate=schedule) + + key = jax.random.PRNGKey(42) + N = self.hypers.num_intervals + f_values = jnp.zeros((N,)) + start_idx, end_idx = N // 4, 3 * N // 4 + f_values = f_values.at[start_idx:end_idx].set(1.0) + f_values += 0.05 * jax.random.uniform(key, (N,)) + + opt_state = self.optimizer.init(f_values) + + print( + f"Number of intervals (N): {self.hypers.num_intervals}, Steps: {self.hypers.num_steps}" + ) + + train_step_jit = jax.jit(self.train_step) + + loss = jnp.inf # Initialize loss + for step in range(self.hypers.num_steps): + f_values, opt_state, loss = train_step_jit(f_values, opt_state) + if step % 2000 == 0 or step == self.hypers.num_steps - 1: + # CORRECTED PRINTING: Show the positive loss value directly. 
+ print(f"Step {step:5d} | C1 ≈ {loss:.8f}") + + print(f"Final C1 found: {loss:.8f}") + + return jax.nn.relu(f_values), loss + + +def run(): + """Entry point for running the optimization and returning results.""" + hypers = Hyperparameters() + optimizer = AutocorrelationOptimizer(hypers) + + optimized_f, final_loss_val = optimizer.run_optimization() + + final_c1 = float(final_loss_val) + + f_values_np = np.array(optimized_f) + + return f_values_np, final_c1, final_loss_val, hypers.num_intervals + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/heilbronn_convex/13/config.yaml b/benchmarks/math/heilbronn_convex/13/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90852da85c32680501da0377fc9ebf9c9d7a6611 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/config.yaml @@ -0,0 +1,68 @@ +# Math benchmark: heilbronn_convex/13 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist with deep expertise in the\ + \ Heilbronn triangle problem - a fundamental challenge in discrete geometry first posed by Hans Heilbronn in 1957.\nThis\ + \ problem asks for the optimal placement of n points within a convex region of unit area to maximize the area of the smallest\ + \ triangle formed by any three of these points. \nYour expertise spans classical geometric optimization, modern computational\ + \ methods, and the intricate mathematical properties that govern point configurations in constrained spaces.\n\nPROBLEM\ + \ SPECIFICATION:\nDesign and implement a constructor function that generates an optimal arrangement of exactly 13 points\ + \ within or on the boundary of a unit-area convex region. The solution must:\n- Place all 13 points within or on a convex\ + \ boundary\n- Maximize the minimum triangle area among all C(13,3) = 286 possible triangles\n- Return deterministic, reproducible\ + \ results\n- Execute efficiently within computational constraints\n\nPERFORMANCE METRICS:\n1. **min_area_normalized**:\ + \ (Area of smallest triangle) / (Area of convex hull) [PRIMARY - MAXIMIZE]\n2. **combined_score**: min_area_normalized\ + \ / 0.030936889034895654 [BENCHMARK COMPARISON - TARGET > 1.0]\n3. 
**eval_time**: Execution time in seconds [EFFICIENCY\ + \ - secondary priority]\n\nTECHNICAL REQUIREMENTS:\n- **Determinism**: Use fixed random seeds if employing stochastic\ + \ methods for reproducibility\n- **Error handling**: Graceful handling of optimization failures or infeasible configurations\n\ + \nMATHEMATICAL CONTEXT & THEORETICAL BACKGROUND:\n- **PROBLEM COMPLEXITY**: The Heilbronn problem is among the most challenging\ + \ in discrete geometry, with optimal configurations rigorously known only for n ≤ 4 points\n- **ASYMPTOTIC BEHAVIOR**:\ + \ For large n, the optimal value approaches O(1/n²) with logarithmic corrections, but the exact constant remains unknown\n\ + - **GEOMETRIC CONSTRAINTS**: Points must balance competing objectives:\n * Interior points can form larger triangles\ + \ but create crowding\n * Boundary points avoid area penalties but limit triangle formation\n * Edge cases arise when\ + \ three points become nearly collinear\n- **SYMMETRY CONSIDERATIONS**: Optimal configurations often exhibit rotational\ + \ symmetries (particularly 3-fold due to triangular geometry)\n- **SCALING INVARIANCE**: The problem is scale-invariant;\ + \ solutions can be normalized to any convex region\n- **CRITICAL GEOMETRIC PROPERTIES**:\n * Delaunay triangulation properties\ + \ and angle optimization\n * Voronoi diagram regularity as indicator of point distribution quality\n * Relationship\ + \ between circumradius and triangle area\n * Connection to sphere packing and energy minimization principles\n\nADVANCED\ + \ OPTIMIZATION STRATEGIES:\n- **MULTI-SCALE APPROACH**: Coarse global search → fine local refinement with adaptive step\ + \ sizes\n- **CONSTRAINT HANDLING**: Penalty methods, barrier functions, or projection operators for convexity\n- **INITIALIZATION\ + \ STRATEGIES**:\n * Perturbed regular grids (triangular, square, hexagonal lattices)\n * Random points with force-based\ + \ relaxation\n * Symmetry-constrained configurations (3-fold, 6-fold rotational)\n * Hybrid boundary/interior distributions\n\ + \ * Low-discrepancy sequences (Sobol, Halton) for uniform coverage\n- **OBJECTIVE FUNCTION DESIGN**:\n * Smooth approximations\ + \ to min() function (LogSumExp, p-norms with p→∞)\n * Barrier methods for boundary constraints\n * Multi-objective formulations\ + \ balancing multiple triangle areas\n * Weighted combinations of smallest k triangle areas\n- **ADVANCED TECHNIQUES**:\n\ + \ * Riemannian optimization on manifolds\n * Variational methods treating point density as continuous field\n * Machine\ + \ learning-guided search using learned geometric priors\n * Topological optimization considering point connectivity graphs\n\ + \ * Continuation methods with parameter homotopy\n\nGEOMETRIC INSIGHTS & HEURISTICS:\n- **BOUNDARY CONSIDERATIONS**:\ + \ Points on boundary contribute to convex hull but may form smaller triangles\n- **TRIANGLE DEGENERACY**: Avoid near-collinear\ + \ configurations that create arbitrarily small triangles\n- **LOCAL VS GLOBAL**: Balance between locally optimal triangle\ + \ sizes and global configuration harmony\n- **SYMMETRY EXPLOITATION**: 3-fold rotational symmetry often appears in optimal\ + \ configurations\n- **VORONOI RELATIONSHIPS**: Points should have roughly equal Voronoi cell areas for optimal distribution\n\ + - **ENERGY ANALOGIES**: Treat as electrostatic repulsion or gravitational equilibrium problem\n- **HISTORICAL APPROACHES**:\n\ + \ * Regular lattice arrangements (suboptimal but provide baselines)\n * Hexagonal close-packing 
adaptations\n * Force-based\ + \ relaxation (treating points as mutually repelling particles)\n * Simulated annealing and evolutionary computation\n\ + \ * Gradient descent with carefully designed objective functions\n\nVALIDATION FRAMEWORK:\n- **Geometric constraint verification**:\n\ + \ * Point count validation: Exactly 13 points required\n * Convexity check: All points within or on boundary of convex\ + \ hull\n- **Data integrity checks**:\n * Coordinate bounds: All coordinates are finite real numbers\n * Point uniqueness:\ + \ No duplicate points (within numerical tolerance)\n * Geometric consistency: Points form valid geometric configuration\n\ + - **Solution quality assessment**:\n * Local optimality testing through small perturbations\n * Symmetry analysis: Detection\ + \ of rotational/reflectional symmetries\n * Distribution quality: Voronoi cell area variance, nearest neighbor statistics\n\ + \ * Convergence verification: For iterative methods, check convergence criteria\n- **Determinism verification**:\n *\ + \ Multiple execution consistency: Same results across multiple runs\n * Seed effectiveness: Proper random seed implementation\n\ + \ * Platform independence: Results stable across different computing environments\n" +evaluator: + timeout: 360 + max_retries: 3 + cascade_evaluation: false diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/Dockerfile b/benchmarks/math/heilbronn_convex/13/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh b/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/evaluator.py b/benchmarks/math/heilbronn_convex/13/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..27dd8c1c5376c8ebbd3ff9b6c9c36aaca53ae26b --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/evaluator.py @@ -0,0 +1,85 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the heilbronn problem for convex regions, with +# 13 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. 
+# +# ===--------------------------------------------------------------------------------------===# + +import time +import numpy as np +import itertools +from scipy.spatial import ConvexHull +import sys +import os +from importlib import __import__ + +BENCHMARK = 0.030936889034895654 +NUM_POINTS = 13 + +# Scoring: min(triangle_area) / ConvexHull_area over all C(13,3) = 286 triangles. +# Decision space: 13 points in 2D = 26 continuous coordinate variables. +# No constraints on point coordinates; the evaluator scores any (13,2) array. +# Objective computed via differentiable operations (area = determinant formula). + + +def triangle_area(p1: np.ndarray, p2: np.ndarray, p3: np.ndarray) -> float: + """Calculates the area of a triangle given its vertices p1, p2, and p3.""" + return abs(p1[0] * (p2[1] - p3[1]) + p2[0] * (p3[1] - p1[1]) + p3[0] * (p1[1] - p2[1])) / 2 + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + points = None + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + + start_time = time.time() + points = program.heilbronn_convex13() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + # validate + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, 2): + raise ValueError(f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,2)}") + # metrics + min_triangle_area = min( + [triangle_area(p1, p2, p3) for p1, p2, p3 in itertools.combinations(points, 3)] + ) + convex_hull_area = ConvexHull(points).volume + min_area_normalized = min_triangle_area / convex_hull_area + + return { + "min_area_normalized": float(min_area_normalized), + "combined_score": float(min_area_normalized / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/requirements.txt b/benchmarks/math/heilbronn_convex/13/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f31e2ae9321b63854113d2a9d1569b0a25049219 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py b/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. 
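+
+    On success the wrapper emits a single JSON line on stdout, for example
+    (illustrative values only)::
+
+        {"status": "success", "combined_score": 1.0,
+         "metrics": {"combined_score": 1.0, "eval_time": 0.5}}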
+ + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/heilbronn_convex/13/initial_program.py b/benchmarks/math/heilbronn_convex/13/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..0f612acfca5d54fa80baa143af91ed2a25916ce3 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/13/initial_program.py @@ -0,0 +1,19 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def heilbronn_convex13() -> np.ndarray: + """ + Construct an arrangement of n points on or inside a convex region in order to maximize the area of the + smallest triangle formed by these points. Here n = 13. + + Returns: + points: np.ndarray of shape (13,2) with the x,y coordinates of the points. 
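+
+    Note: this seed program simply samples 13 uniform random points in the
+    unit square with a fixed seed; it is a valid but weak baseline intended
+    to be improved by evolution.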
+ """ + n = 13 + rng = np.random.default_rng(seed=42) + points = rng.random((n, 2)) + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/heilbronn_convex/14/config.yaml b/benchmarks/math/heilbronn_convex/14/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b9b5ca878e0c0781e671aa3f5a4eca9234602aa --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/config.yaml @@ -0,0 +1,45 @@ +# Math benchmark: heilbronn_convex/14 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert computational geometer and optimization specialist with deep expertise in the Heilbronn triangle problem - a fundamental challenge in discrete geometry first posed by Hans Heilbronn in 1957. + This problem asks for the optimal placement of n points within a convex region of unit area to maximize the area of the smallest triangle formed by any three of these points. + Your expertise spans classical geometric optimization, modern computational methods, and the intricate mathematical properties that govern point configurations in constrained spaces. + + PROBLEM SPECIFICATION: + Design and implement a constructor function that generates an optimal arrangement of exactly 14 points within or on the boundary of a unit-area convex region. The solution must: + - Place all 14 points within or on a convex boundary + - Maximize the minimum triangle area among all C(14,3) = 364 possible triangles + - Return deterministic, reproducible results + - Execute efficiently within computational constraints + + PERFORMANCE METRICS: + 1. **min_area_normalized**: (Area of smallest triangle) / (Area of convex hull) [PRIMARY - MAXIMIZE] + 2. **combined_score**: min_area_normalized / 0.027835571458482138 [BENCHMARK COMPARISON - TARGET > 1.0] + 3. **eval_time**: Execution time in seconds [EFFICIENCY - secondary priority] + + BENCHMARK & PERFORMANCE TARGET: + - **CURRENT STATE-OF-THE-ART**: min_area_normalized = 0.027835571458482138 (achieved by AlphaEvolve algorithm) + - **PRIMARY METRIC**: min_area_normalized = (smallest triangle area) / (convex hull area) + - **SUCCESS CRITERION**: combined_score = min_area_normalized / 0.027835571458482138 > 1.0 + - **SIGNIFICANCE**: Even marginal improvements (combined_score > 1.01) represent meaningful advances in this notoriously difficult problem + + TECHNICAL REQUIREMENTS: + - **Determinism**: Use fixed random seeds if employing stochastic methods for reproducibility + - **Error handling**: Graceful handling of optimization failures or infeasible configurations +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/heilbronn_convex/14/evaluator/Dockerfile b/benchmarks/math/heilbronn_convex/14/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. 
Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/heilbronn_convex/14/evaluator/evaluate.sh b/benchmarks/math/heilbronn_convex/14/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/heilbronn_convex/14/evaluator/evaluator.py b/benchmarks/math/heilbronn_convex/14/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..bbfb41274d9956464a4fe0e3aea679ba3154178b --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/evaluator/evaluator.py @@ -0,0 +1,79 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the heilbronn problem for convex regions, with +# 14 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import time +import numpy as np +import itertools +from scipy.spatial import ConvexHull +import sys +import os +from importlib import __import__ + +BENCHMARK = 0.027835571458482138 +NUM_POINTS = 14 + + +def triangle_area(p1: np.ndarray, p2: np.ndarray, p3: np.ndarray) -> float: + """Calculates the area of a triangle given its vertices p1, p2, and p3.""" + return abs(p1[0] * (p2[1] - p3[1]) + p2[0] * (p3[1] - p1[1]) + p3[0] * (p1[1] - p2[1])) / 2 + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + points = None + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + + start_time = time.time() + points = program.heilbronn_convex14() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, 2): + raise ValueError(f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,2)}") + + min_triangle_area = min( + [triangle_area(p1, p2, p3) for p1, p2, p3 in itertools.combinations(points, 3)] + ) + convex_hull_area = ConvexHull(points).volume + min_area_normalized = min_triangle_area / convex_hull_area + + return { + "min_area_normalized": float(min_area_normalized), + "combined_score": float(min_area_normalized / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/heilbronn_convex/14/evaluator/requirements.txt b/benchmarks/math/heilbronn_convex/14/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/heilbronn_convex/14/evaluator/wrapper.py b/benchmarks/math/heilbronn_convex/14/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
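+    # For example (illustrative): {"combined_score": 1.2, "error": "msg"}
+    # becomes metrics={"combined_score": 1.2} and artifacts={"error": "msg"},
+    # and the presence of "error" in artifacts marks the status as "error" below.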
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/heilbronn_convex/14/initial_program.py b/benchmarks/math/heilbronn_convex/14/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..527f6feb611c8a3bc3d31a7f5752f48bbd3e9832 --- /dev/null +++ b/benchmarks/math/heilbronn_convex/14/initial_program.py @@ -0,0 +1,19 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def heilbronn_convex14() -> np.ndarray: + """ + Construct an arrangement of n points on or inside a convex region in order to maximize the area of the + smallest triangle formed by these points. Here n = 14. + + Returns: + points: np.ndarray of shape (14,2) with the x,y coordinates of the points. + """ + n = 14 + rng = np.random.default_rng(seed=42) + points = rng.random((n, 2)) + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/heilbronn_triangle/config.yaml b/benchmarks/math/heilbronn_triangle/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dcfdc0a07fa97d8a57a6583bf457cd6d3d844731 --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/config.yaml @@ -0,0 +1,47 @@ +# Math benchmark: heilbronn_triangle +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: 'SETTING: + + You are an expert computational geometer and optimization specialist with deep expertise in the Heilbronn triangle problem + - a classical problem in discrete geometry that asks for the optimal placement of n points to maximize the minimum triangle + area formed by any three points. + + + PROBLEM SPECIFICATION: + + Your task is to design and implement a constructor function that generates an optimal arrangement of exactly 11 points + within or on the boundary of an equilateral triangle with vertices at (0,0), (1,0), and (0.5, sqrt(3)/2). + + + PERFORMANCE METRICS: + + 1. **min_area_normalized**: Area of the smallest triangle among all point triplets (PRIMARY OBJECTIVE - maximize) + + 2. **combined_score**: min_area_normalized / 0.036529889880030156 (BENCHMARK COMPARISON - maximize above 1.0) + + 3. 
**eval_time**: Function execution time in seconds (EFFICIENCY - minimize, but secondary to quality) + + + TECHNICAL REQUIREMENTS: + + - **Determinism**: Use fixed random seeds if employing stochastic methods for reproducibility + + - **Error handling**: Graceful handling of optimization failures or infeasible configurations + + ' +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/heilbronn_triangle/evaluator/Dockerfile b/benchmarks/math/heilbronn_triangle/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/heilbronn_triangle/evaluator/evaluate.sh b/benchmarks/math/heilbronn_triangle/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/heilbronn_triangle/evaluator/evaluator.py b/benchmarks/math/heilbronn_triangle/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..71d395fe38ecfe7143aedf7d692bec980ebe2d01 --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/evaluator/evaluator.py @@ -0,0 +1,100 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the heilbronn problem for triangles, with +# 11 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import time +import numpy as np +import sys +import os +from importlib import __import__ +import itertools + +BENCHMARK = 0.036529889880030156 +TOL = 1e-6 +NUM_POINTS = 11 + + +def check_inside_triangle_wtol(points: np.ndarray, tol: float = 1e-6): + """Checks that all points are inside the triangle with vertices (0,0), (1,0), (0.5, sqrt(3)/2). + + Args: + points: Array of 2D points to check + tol: Tolerance for numerical errors + """ + for x, y in points: + cond1 = y >= -tol + cond2 = np.sqrt(3) * x <= np.sqrt(3) - y + tol + cond3 = y <= np.sqrt(3) * x + tol + + if not (cond1 and cond2 and cond3): + raise ValueError( + f"Point ({x}, {y}) is outside the equilateral triangle (tolerance: {tol})." 
+ ) + + +def triangle_area(a: np.array, b: np.array, c: np.array) -> float: + return np.abs(a[0] * (b[1] - c[1]) + b[0] * (c[1] - a[1]) + c[0] * (a[1] - b[1])) / 2 + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + points = None + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + + start_time = time.time() + points = program.heilbronn_triangle11() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, 2): + raise ValueError(f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,2)}") + + check_inside_triangle_wtol(points, TOL) + + a = np.array([0, 0]) + b = np.array([1, 0]) + c = np.array([0.5, np.sqrt(3) / 2]) + min_triangle_area = min( + [triangle_area(p1, p2, p3) for p1, p2, p3 in itertools.combinations(points, 3)] + ) + min_area_normalized = min_triangle_area / triangle_area(a, b, c) + + return { + "min_area_normalized": float(min_area_normalized), + "combined_score": float(min_area_normalized / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/heilbronn_triangle/evaluator/requirements.txt b/benchmarks/math/heilbronn_triangle/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2213afdf01c86cd734459b55f686a63d1816aaef --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy \ No newline at end of file diff --git a/benchmarks/math/heilbronn_triangle/evaluator/wrapper.py b/benchmarks/math/heilbronn_triangle/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
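+    # stdout is restored before the single result (or error) JSON line is printed.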
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/heilbronn_triangle/initial_program.py b/benchmarks/math/heilbronn_triangle/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..05c6e577c8cfb0a843f7b28b1e0733b77debea20 --- /dev/null +++ b/benchmarks/math/heilbronn_triangle/initial_program.py @@ -0,0 +1,18 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def heilbronn_triangle11() -> np.ndarray: + """ + Construct an arrangement of n points on or inside a convex region in order to maximize the area of the + smallest triangle formed by these points. Here n = 11. + + Returns: + points: np.ndarray of shape (11,2) with the x,y coordinates of the points. + """ + n = 11 + points = np.zeros((n, 2)) + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/hexagon_packing/11/config.yaml b/benchmarks/math/hexagon_packing/11/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02e43553d3a7433baa4ba1b1e8b8868912a84dc1 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/config.yaml @@ -0,0 +1,35 @@ +# Math benchmark: hexagon_packing/11 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert computational geometer and optimization specialist focusing on hexagon packing problems. + Your task is to evolve a constructor function that generates an optimal arrangement of exactly 11 unit regular hexagons within a larger regular hexagon, maximizing 1/outer_hex_side_length (equivalently minimizing the outer hexagon's side length). 
+ + PROBLEM CONTEXT: + - Target: Beat the current state-of-the-art benchmark of 1/outer_hex_side_length = 1/3.930092 ≈ 0.2544 + - Constraint: All 11 inner hexagons must be unit regular hexagons (side length = 1) that are fully contained within the outer hexagon with no overlaps + - Mathematical formulation: For hexagon i at position (xi, yi) with rotation θi: + * Non-overlap: All pairs of inner hexagons must be disjoint + * Containment: All vertices of inner hexagons must lie within the outer hexagon + * Objective: maximize 1/R where R is the outer hexagon side length + + PERFORMANCE METRICS: + 1. **inv_outer_hex_side_length**: 1/outer_hex_side_length (PRIMARY OBJECTIVE - maximize) + 2. **combined_score**: inverse_side_length / 0.2544 (progress toward beating SOTA) + 3. **eval_time**: Execution time for full evaluation +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/hexagon_packing/11/evaluator/Dockerfile b/benchmarks/math/hexagon_packing/11/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh b/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/hexagon_packing/11/evaluator/evaluator.py b/benchmarks/math/hexagon_packing/11/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..7d898b112e8d7db87984ecb89c96bbc2978ce5f0 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/evaluator.py @@ -0,0 +1,242 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for problem of packing unit regular hexagons inside +# a regular hexagon, with 11 unit hexagons. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time + +import numpy as np +import math + +N_HEX = 11 +BENCHMARK = 1 / 3.930092 + + +def hexagon_vertices( + center_x: float, + center_y: float, + side_length: float, + angle_degrees: float, +) -> list[tuple[float, float]]: + """Generates the vertices of a regular hexagon. + Args: + center_x: x-coordinate of the center. + center_y: y-coordinate of the center. + side_length: Length of each side. 
+ angle_degrees: Rotation angle in degrees (clockwise from horizontal). + Returns: + A list of tuples, where each tuple (x, y) represents the vertex location. + """ + vertices = [] + angle_radians = math.radians(angle_degrees) + for i in range(6): + angle = angle_radians + 2 * math.pi * i / 6 + x = center_x + side_length * math.cos(angle) + y = center_y + side_length * math.sin(angle) + vertices.append((x, y)) + return vertices + + +def normalize_vector(v: tuple[float, float]) -> tuple[float, float]: + """Normalizes a 2D vector.""" + magnitude = math.sqrt(v[0] ** 2 + v[1] ** 2) + return (v[0] / magnitude, v[1] / magnitude) if magnitude != 0 else (0.0, 0.0) + + +def get_normals(vertices: list[tuple[float, float]]) -> list[tuple[float, float]]: + """Gets the outward normals of a polygon's edges.""" + normals = [] + for i in range(len(vertices)): + p1 = vertices[i] + p2 = vertices[(i + 1) % len(vertices)] # Wrap around to the first vertex. + edge = (p2[0] - p1[0], p2[1] - p1[1]) + normal = normalize_vector((-edge[1], edge[0])) # Rotate edge by 90 degrees. + normals.append(normal) + return normals + + +def project_polygon( + vertices: list[tuple[float, float]], + axis: tuple[float, float], +) -> tuple[float, float]: + """Projects a polygon onto an axis and returns the min/max values.""" + min_proj = float("inf") + max_proj = float("-inf") + for vertex in vertices: + projection = vertex[0] * axis[0] + vertex[1] * axis[1] # Dot product. + min_proj = min(min_proj, projection) + max_proj = max(max_proj, projection) + return min_proj, max_proj + + +def overlap_1d(min1: float, max1: float, min2: float, max2: float, tol: float = 1e-6) -> bool: + """Determines whether two 1D intervals overlap, allowing for numerical tolerance.""" + return max1 >= min2 - tol and max2 >= min1 - tol + + +def polygons_intersect( + vertices1: list[tuple[float, float]], + vertices2: list[tuple[float, float]], + tol: float = 1e-6, +) -> bool: + """Determines if two polygons intersect using the Separating Axis Theorem.""" + normals1 = get_normals(vertices1) + normals2 = get_normals(vertices2) + axes = normals1 + normals2 + for axis in axes: + min1, max1 = project_polygon(vertices1, axis) + min2, max2 = project_polygon(vertices2, axis) + if not overlap_1d(min1, max1, min2, max2, tol): + return False # Separating axis found, polygons are disjoint. + return True # No separating axis found, polygons intersect. 
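+
+# Illustrative sanity checks for the SAT test above (not executed; a unit
+# regular hexagon at angle 0 has flat-to-flat width sqrt(3) ~ 1.732 along y):
+#   polygons_intersect(hexagon_vertices(0, 0, 1, 0), hexagon_vertices(0, 1.8, 1, 0))  # False: separated along y
+#   polygons_intersect(hexagon_vertices(0, 0, 1, 0), hexagon_vertices(0, 1.5, 1, 0))  # True: 1.5 < sqrt(3)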
+ + +def hexagons_are_disjoint( + hex1_params: tuple[float, float, float, float], + hex2_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Determines if two hexagons are disjoint given their parameters.""" + hex1_vertices = hexagon_vertices(*hex1_params) + hex2_vertices = hexagon_vertices(*hex2_params) + return not polygons_intersect(hex1_vertices, hex2_vertices, tol) + + +def is_inside_hexagon( + point: tuple[float, float], + hex_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Checks if a point is inside a hexagon (given its parameters).""" + hex_vertices = hexagon_vertices(*hex_params) + for i in range(len(hex_vertices)): + p1 = hex_vertices[i] + p2 = hex_vertices[(i + 1) % len(hex_vertices)] + edge_vector = (p2[0] - p1[0], p2[1] - p1[1]) + point_vector = (point[0] - p1[0], point[1] - p1[1]) + cross_product = edge_vector[0] * point_vector[1] - edge_vector[1] * point_vector[0] + if cross_product < -tol: # Allow small numerical errors + return False + return True + + +def all_hexagons_contained( + inner_hex_params_list: list[tuple[float, float, float, float]], + outer_hex_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Checks if all inner hexagons are contained within the outer hexagon.""" + for inner_hex_params in inner_hex_params_list: + inner_hex_vertices = hexagon_vertices(*inner_hex_params) + for vertex in inner_hex_vertices: + if not is_inside_hexagon(vertex, outer_hex_params, tol): + return False + return True + + +def verify_construction( + inner_hex_data: tuple[float, float, float], + outer_hex_center: tuple[float, float], + outer_hex_side_length: float, + outer_hex_angle_degrees: float, + tol: float = 1e-6, +): + """Verifies the hexagon packing construction with a rotated outer hexagon. + Args: + inner_hex_data: List of (x, y, angle_degrees) tuples for inner hexagons. + outer_hex_center: (x, y) tuple for the outer hexagon center. + outer_hex_side_length: Side length of the outer hexagon. + outer_hex_angle_degrees: Rotation angle of the outer hexagon in degrees. + tol: Numerical tolerance for geometric checks (default: 1e-6). + Raises: + AssertionError if the construction is not valid. + """ + inner_hex_params_list = [ + (x, y, 1, angle) for x, y, angle in inner_hex_data + ] # Sets the side length to 1. + outer_hex_params = ( + outer_hex_center[0], + outer_hex_center[1], + outer_hex_side_length, + outer_hex_angle_degrees, + ) + # Disjointness check. + for i in range(len(inner_hex_params_list)): + for j in range(i + 1, len(inner_hex_params_list)): + if not hexagons_are_disjoint(inner_hex_params_list[i], inner_hex_params_list[j], tol): + raise AssertionError(f"Hexagons {i+1} and {j+1} intersect!") + # Containment check. 
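+    # Checking vertices suffices: hexagons are convex, so if every vertex of an
+    # inner hexagon lies in the (convex) outer hexagon, the whole hexagon does.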
+ if not all_hexagons_contained(inner_hex_params_list, outer_hex_params, tol): + raise AssertionError("Not all inner hexagons are contained in the outer hexagon!") + print("Construction is valid.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + inner_hex_data, outer_hex_data, outer_hex_side_length = program.hexagon_packing_11() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(inner_hex_data, np.ndarray): + inner_hex_data = np.array(inner_hex_data) + if not isinstance(outer_hex_data, np.ndarray): + outer_hex_data = np.array(outer_hex_data) + + if inner_hex_data.shape != (N_HEX, 3): + raise ValueError( + f"Invalid shapes: inner_hex_data = {inner_hex_data.shape}, expected {(N_HEX,3)}" + ) + + if outer_hex_data.shape != (3,): + raise ValueError( + f"Invalid shapes: outer_hex_data = {outer_hex_data.shape}, expected {(3,)}" + ) + + outer_hex_center = outer_hex_data[:2] + outer_hex_angle_degrees = outer_hex_data[-1] + verify_construction( + inner_hex_data, outer_hex_center, outer_hex_side_length, outer_hex_angle_degrees + ) + + inv_outer_hex_side_length = float(1 / outer_hex_side_length) + + return { + "inv_outer_hex_side_length": inv_outer_hex_side_length, + "combined_score": float(inv_outer_hex_side_length / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/hexagon_packing/11/evaluator/requirements.txt b/benchmarks/math/hexagon_packing/11/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bad10388ecb1eefd890a797d833976a5e631541 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/benchmarks/math/hexagon_packing/11/evaluator/wrapper.py b/benchmarks/math/hexagon_packing/11/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. 
+ """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/hexagon_packing/11/initial_program.py b/benchmarks/math/hexagon_packing/11/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ff28280e0ed5cdec29500605c918b5e2bfc328 --- /dev/null +++ b/benchmarks/math/hexagon_packing/11/initial_program.py @@ -0,0 +1,37 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def hexagon_packing_11(): + """ + Constructs a packing of 11 disjoint unit regular hexagons inside a larger regular hexagon, maximizing 1/outer_hex_side_length. + Returns + inner_hex_data: np.ndarray of shape (11,3), where each row is of the form (x, y, angle_degrees) containing the (x,y) coordinates and angle_degree of the respective inner hexagon. + outer_hex_data: np.ndarray of shape (3,) of form (x,y,angle_degree) containing the (x,y) coordinates and angle_degree of the outer hexagon. + outer_hex_side_length: float representing the side length of the outer hexagon. 
+ """ + n = 11 + # Simple grid arrangement of inner hexagons + inner_hex_data = np.array( + [ + [0, 0, 0], # center + [-2.5, 0, 0], # left + [2.5, 0, 0], # right + [-1.25, 2.17, 0], # top-left + [1.25, 2.17, 0], # top-right + [-1.25, -2.17, 0], # bottom-left + [1.25, -2.17, 0], # bottom-right + [-3.75, 2.17, 0], # far top-left + [3.75, 2.17, 0], # far top-right + [-3.75, -2.17, 0], # far bottom-left + [3.75, -2.17, 0], # far bottom-right + ] + ) + + outer_hex_data = np.array([0, 0, 0]) # centered at origin + outer_hex_side_length = 8 # large enough to contain all inner hexagons + + return inner_hex_data, outer_hex_data, outer_hex_side_length + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/hexagon_packing/12/config.yaml b/benchmarks/math/hexagon_packing/12/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a8afef970fc51151d5d87553165412dbd685a87 --- /dev/null +++ b/benchmarks/math/hexagon_packing/12/config.yaml @@ -0,0 +1,35 @@ +# Math benchmark: hexagon_packing/12 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert computational geometer and optimization specialist focusing on hexagon packing problems. + Your task is to evolve a constructor function that generates an optimal arrangement of exactly 12 unit regular hexagons within a larger regular hexagon, maximizing 1/outer_hex_side_length (equivalently minimizing the outer hexagon's side length). + + PROBLEM CONTEXT: + - Target: Establish new state-of-the-art for 12-hexagon packing of 1/outer_hex_side_length = 1/3.9419123 ≈ 0.2537 + - Constraint: All 12 inner hexagons must be unit regular hexagons (side length = 1) that are fully contained within the outer hexagon with no overlaps + - Mathematical formulation: For hexagon i at position (xi, yi) with rotation θi: + * Non-overlap: All pairs of inner hexagons must be disjoint + * Containment: All vertices of inner hexagons must lie within the outer hexagon + * Objective: maximize 1/R where R is the outer hexagon side length + + PERFORMANCE METRICS: + 1. **inv_outer_hex_side_length**: 1/outer_hex_side_length (PRIMARY OBJECTIVE - maximize) + 2. **combined_score**: inverse_side_length / 0.2537 (progress toward beating SOTA) + 3. **eval_time**: Execution time for full evaluation +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/hexagon_packing/12/evaluator/Dockerfile b/benchmarks/math/hexagon_packing/12/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/hexagon_packing/12/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . 
+RUN chmod +x evaluate.sh
+
+ENTRYPOINT ["./evaluate.sh"]
diff --git a/benchmarks/math/hexagon_packing/12/evaluator/evaluate.sh b/benchmarks/math/hexagon_packing/12/evaluator/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634
--- /dev/null
+++ b/benchmarks/math/hexagon_packing/12/evaluator/evaluate.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROGRAM="$1"
+# MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+python /benchmark/evaluator.py "$PROGRAM"
diff --git a/benchmarks/math/hexagon_packing/12/evaluator/evaluator.py b/benchmarks/math/hexagon_packing/12/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f89de7dbab8a31849bb5c1d1d91788dfcd8b9eaa
--- /dev/null
+++ b/benchmarks/math/hexagon_packing/12/evaluator/evaluator.py
@@ -0,0 +1,242 @@
+# ===--------------------------------------------------------------------------------------===#
+#
+# This file implements the evaluator for the problem of packing unit regular hexagons inside
+# a regular hexagon, with 12 unit hexagons.
+#
+# ===--------------------------------------------------------------------------------------===#
+#
+# Some of the code in this file is adapted from:
+#
+# google-deepmind/alphaevolve_results:
+# Licensed under the Apache License v2.0.
+#
+# ===--------------------------------------------------------------------------------------===#
+
+import sys
+import os
+from importlib import __import__
+import time
+
+import numpy as np
+import math
+
+N_HEX = 12
+BENCHMARK = 1 / 3.9419123
+
+
+def hexagon_vertices(
+    center_x: float,
+    center_y: float,
+    side_length: float,
+    angle_degrees: float,
+) -> list[tuple[float, float]]:
+    """Generates the vertices of a regular hexagon.
+    Args:
+        center_x: x-coordinate of the center.
+        center_y: y-coordinate of the center.
+        side_length: Length of each side.
+        angle_degrees: Rotation angle in degrees (counterclockwise from the positive x-axis).
+    Returns:
+        A list of tuples, where each tuple (x, y) represents the vertex location.
+    """
+    vertices = []
+    angle_radians = math.radians(angle_degrees)
+    for i in range(6):
+        angle = angle_radians + 2 * math.pi * i / 6
+        x = center_x + side_length * math.cos(angle)
+        y = center_y + side_length * math.sin(angle)
+        vertices.append((x, y))
+    return vertices
+
+
+def normalize_vector(v: tuple[float, float]) -> tuple[float, float]:
+    """Normalizes a 2D vector."""
+    magnitude = math.sqrt(v[0] ** 2 + v[1] ** 2)
+    return (v[0] / magnitude, v[1] / magnitude) if magnitude != 0 else (0.0, 0.0)
+
+
+def get_normals(vertices: list[tuple[float, float]]) -> list[tuple[float, float]]:
+    """Gets the edge normals of a polygon (their sign/orientation is irrelevant for SAT)."""
+    normals = []
+    for i in range(len(vertices)):
+        p1 = vertices[i]
+        p2 = vertices[(i + 1) % len(vertices)]  # Wrap around to the first vertex.
+        edge = (p2[0] - p1[0], p2[1] - p1[1])
+        normal = normalize_vector((-edge[1], edge[0]))  # Rotate edge by 90 degrees.
+        normals.append(normal)
+    return normals
+
+
+def project_polygon(
+    vertices: list[tuple[float, float]],
+    axis: tuple[float, float],
+) -> tuple[float, float]:
+    """Projects a polygon onto an axis and returns the min/max values."""
+    min_proj = float("inf")
+    max_proj = float("-inf")
+    for vertex in vertices:
+        projection = vertex[0] * axis[0] + vertex[1] * axis[1]  # Dot product.
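+        # [min_proj, max_proj] accumulated below is the polygon's 1D shadow on this axis.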
+ min_proj = min(min_proj, projection) + max_proj = max(max_proj, projection) + return min_proj, max_proj + + +def overlap_1d(min1: float, max1: float, min2: float, max2: float, tol: float = 1e-6) -> bool: + """Determines whether two 1D intervals overlap, allowing for numerical tolerance.""" + return max1 >= min2 - tol and max2 >= min1 - tol + + +def polygons_intersect( + vertices1: list[tuple[float, float]], + vertices2: list[tuple[float, float]], + tol: float = 1e-6, +) -> bool: + """Determines if two polygons intersect using the Separating Axis Theorem.""" + normals1 = get_normals(vertices1) + normals2 = get_normals(vertices2) + axes = normals1 + normals2 + for axis in axes: + min1, max1 = project_polygon(vertices1, axis) + min2, max2 = project_polygon(vertices2, axis) + if not overlap_1d(min1, max1, min2, max2, tol): + return False # Separating axis found, polygons are disjoint. + return True # No separating axis found, polygons intersect. + + +def hexagons_are_disjoint( + hex1_params: tuple[float, float, float, float], + hex2_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Determines if two hexagons are disjoint given their parameters.""" + hex1_vertices = hexagon_vertices(*hex1_params) + hex2_vertices = hexagon_vertices(*hex2_params) + return not polygons_intersect(hex1_vertices, hex2_vertices, tol) + + +def is_inside_hexagon( + point: tuple[float, float], + hex_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Checks if a point is inside a hexagon (given its parameters).""" + hex_vertices = hexagon_vertices(*hex_params) + for i in range(len(hex_vertices)): + p1 = hex_vertices[i] + p2 = hex_vertices[(i + 1) % len(hex_vertices)] + edge_vector = (p2[0] - p1[0], p2[1] - p1[1]) + point_vector = (point[0] - p1[0], point[1] - p1[1]) + cross_product = edge_vector[0] * point_vector[1] - edge_vector[1] * point_vector[0] + if cross_product < -tol: # Allow small numerical errors + return False + return True + + +def all_hexagons_contained( + inner_hex_params_list: list[tuple[float, float, float, float]], + outer_hex_params: tuple[float, float, float, float], + tol: float = 1e-6, +) -> bool: + """Checks if all inner hexagons are contained within the outer hexagon.""" + for inner_hex_params in inner_hex_params_list: + inner_hex_vertices = hexagon_vertices(*inner_hex_params) + for vertex in inner_hex_vertices: + if not is_inside_hexagon(vertex, outer_hex_params, tol): + return False + return True + + +def verify_construction( + inner_hex_data: tuple[float, float, float], + outer_hex_center: tuple[float, float], + outer_hex_side_length: float, + outer_hex_angle_degrees: float, + tol: float = 1e-6, +): + """Verifies the hexagon packing construction with a rotated outer hexagon. + Args: + inner_hex_data: List of (x, y, angle_degrees) tuples for inner hexagons. + outer_hex_center: (x, y) tuple for the outer hexagon center. + outer_hex_side_length: Side length of the outer hexagon. + outer_hex_angle_degrees: Rotation angle of the outer hexagon in degrees. + tol: Numerical tolerance for geometric checks (default: 1e-6). + Raises: + AssertionError if the construction is not valid. + """ + inner_hex_params_list = [ + (x, y, 1, angle) for x, y, angle in inner_hex_data + ] # Sets the side length to 1. + outer_hex_params = ( + outer_hex_center[0], + outer_hex_center[1], + outer_hex_side_length, + outer_hex_angle_degrees, + ) + # Disjointness check. 
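+    # Pairwise SAT over all C(12, 2) = 66 unordered hexagon pairs.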
+ for i in range(len(inner_hex_params_list)): + for j in range(i + 1, len(inner_hex_params_list)): + if not hexagons_are_disjoint(inner_hex_params_list[i], inner_hex_params_list[j], tol): + raise AssertionError(f"Hexagons {i+1} and {j+1} intersect!") + # Containment check. + if not all_hexagons_contained(inner_hex_params_list, outer_hex_params, tol): + raise AssertionError("Not all inner hexagons are contained in the outer hexagon!") + print("Construction is valid.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + inner_hex_data, outer_hex_data, outer_hex_side_length = program.hexagon_packing_12() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(inner_hex_data, np.ndarray): + inner_hex_data = np.array(inner_hex_data) + if not isinstance(outer_hex_data, np.ndarray): + outer_hex_data = np.array(outer_hex_data) + + if inner_hex_data.shape != (N_HEX, 3): + raise ValueError( + f"Invalid shapes: inner_hex_data = {inner_hex_data.shape}, expected {(N_HEX,3)}" + ) + + if outer_hex_data.shape != (3,): + raise ValueError( + f"Invalid shapes: outer_hex_data = {outer_hex_data.shape}, expected {(3,)}" + ) + + outer_hex_center = outer_hex_data[:2] + outer_hex_angle_degrees = outer_hex_data[-1] + verify_construction( + inner_hex_data, outer_hex_center, outer_hex_side_length, outer_hex_angle_degrees + ) + + inv_outer_hex_side_length = float(1 / outer_hex_side_length) + + return { + "inv_outer_hex_side_length": inv_outer_hex_side_length, + "combined_score": float(inv_outer_hex_side_length / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/hexagon_packing/12/evaluator/requirements.txt b/benchmarks/math/hexagon_packing/12/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bad10388ecb1eefd890a797d833976a5e631541 --- /dev/null +++ b/benchmarks/math/hexagon_packing/12/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy diff --git a/benchmarks/math/hexagon_packing/12/evaluator/wrapper.py b/benchmarks/math/hexagon_packing/12/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/hexagon_packing/12/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. 
+ * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/hexagon_packing/12/initial_program.py b/benchmarks/math/hexagon_packing/12/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c5aa3da4a2980308358bf73d80c100d93b062e07 --- /dev/null +++ b/benchmarks/math/hexagon_packing/12/initial_program.py @@ -0,0 +1,38 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def hexagon_packing_12(): + """ + Constructs a packing of 12 disjoint unit regular hexagons inside a larger regular hexagon, maximizing 1/outer_hex_side_length. + Returns + inner_hex_data: np.ndarray of shape (12,3), where each row is of the form (x, y, angle_degrees) containing the (x,y) coordinates and angle_degree of the respective inner hexagon. + outer_hex_data: np.ndarray of shape (3,) of form (x,y,angle_degree) containing the (x,y) coordinates and angle_degree of the outer hexagon. + outer_hex_side_length: float representing the side length of the outer hexagon. 
+ """ + n = 12 + # Simple grid arrangement of inner hexagons + inner_hex_data = np.array( + [ + [0, 0, 0], # center + [-2.5, 0, 0], # left + [2.5, 0, 0], # right + [-1.25, 2.17, 0], # top-left + [1.25, 2.17, 0], # top-right + [-1.25, -2.17, 0], # bottom-left + [1.25, -2.17, 0], # bottom-right + [-3.75, 2.17, 0], # far top-left + [3.75, 2.17, 0], # far top-right + [-3.75, -2.17, 0], # far bottom-left + [3.75, -2.17, 0], # far bottom-right, + [0, -4, 0], # far bottom-center + ] + ) + + outer_hex_data = np.array([0, 0, 0]) # centered at origin + outer_hex_side_length = 8 # large enough to contain all inner hexagons + + return inner_hex_data, outer_hex_data, outer_hex_side_length + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/matmul/config.yaml b/benchmarks/math/matmul/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dea11bd883db95211da8d6c89daaf36e0b63b4a4 --- /dev/null +++ b/benchmarks/math/matmul/config.yaml @@ -0,0 +1,61 @@ +# Math benchmark: matmul +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in computational linear algebra, numerical optimization, and AI-driven algorithm discovery. + Your task is to evolve and optimize a Python script to find the lowest-rank decomposition of the matrix multiplication tensor for a specific instance with variables (n=2,m=4,p=5) fixed. + + PROBLEM CONTEXT: + Target: Find the minimal rank R for the tensor decomposition T_ijk = ∑_r=1^R U_ir V_jr W_kr. + The goal is to beat the best algorithm and discover a state-of-the-art algorithm with the lowest rank possible. + Constraint: The reconstructed tensor from the learned factors (U, V, W) must be EXACTLY EQUAL to the ground-truth matrix multiplication tensor T_ijk after its composition to a tensor form. + This can be enforced in two ways: + * by minimizing the loss function to near-zero, and do a "rounding algorithm" to make the continuous approximate solution converge to a exactly one where its elements are integer multiples of a constant. + * by making an algorithm that search in the space of the set of possible elements, or a grid of elements that can compose the final decomposition. + You can be creative and choose the best possible algorithm to solve this problem, for example incorporating constraints that force the solution to be in the right space, or by enforcing this from the start. + + MATHEMATICAL FORMULATION: + Given: The standard matrix multiplication tensor T for n, m, p fixed. + Objective: Find the smallest integer R such that there exist real or complex valued matrices U, V, W of shapes (n*p, R), (n*m, R), and (m*p, R) that compose T_ijk. + + PERFORMANCE METRICS: + combined_score: The minimal inverse rank 32/R for which the optimization was successful, where 32 is the best know decomposition found by Google. A value of 1.0 means you have matched the state-of-the-art. (PRIMARY OBJECTIVE - maximize 1/R). + loss: The final loss function result if applicable to the method used. + rank: rank of the best decomposition found. + eval_time: time of the evaluation. + + VALIDATION FRAMEWORK: + Numerical Validation: The final loss for a successful run must be below the success_threshold (e.g., 1e-6). 
+ Equality validation: The final decomposition must be exactly equal to the tensor T_ijk: + matmul_tensor = np.zeros((n * m, m * p, p * n), dtype=np.int32) + for i in range(n): + for j in range(m): + for k in range(p): + matmul_tensor[i * m + j][j * p + k][k * n + i] = 1 + Rank Validation: The discovered_rank must be an integer. + + TECHNICAL REQUIREMENTS: + Reproducibility: Ensure the JAX PRNGKey is handled correctly (or any lib with random numbers) to get reproducible results for a given set of initial conditions and hyperparameters. + Numerical Stability: Be aware of potential floating-point precision issues and the possibility of exploding or vanishing gradients, suggesting remedies like gradient clipping if necessary. + + PROBLEM-SPECIFIC CONSIDERATIONS: + Initialization is Key: Due to the non-convex landscape, the success of a run is highly dependent on the random initialization. A robust solution should work from multiple different random seeds. + Steps vs. Learning Rate Trade-off: A lower learning rate might require more num_steps to converge, and vice-versa. Explore this relationship to find the most efficient path to a solution. + From Discovery to Algorithm: The end goal is not just the factors U, V, W, but the algorithm they represent. A good solution should be interpretable as a series of R multiplications and additions/subtractions. + The robustness and efficiency of the proposed code and hyperparameter configuration (i.e., it should converge reliably and quickly). +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/matmul/evaluator/Dockerfile b/benchmarks/math/matmul/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/matmul/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/matmul/evaluator/evaluate.sh b/benchmarks/math/matmul/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/matmul/evaluator/evaluator.py b/benchmarks/math/matmul/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..331efefdcd80afd32aaa5ca362bc231bb0216970 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/evaluator.py @@ -0,0 +1,115 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the matrix multiplication problem with tensor size +# of <2,4,5> +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. 
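+#
+# Worked example for <n,m,p> = <2,4,5>: the target tensor T has shape
+# (n*m, m*p, n*p) = (8, 20, 10) and contains exactly n*m*p = 40 ones, so the
+# naive algorithm corresponds to a rank-40 decomposition; the best known rank is 32.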
+# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +BENCHMARK = 32 + + +def verify_tensor_decomposition( + decomposition: tuple[np.ndarray, np.ndarray, np.ndarray], n: int, m: int, p: int, rank: int +): + """Verifies the correctness of the tensor decomposition.""" + + # Add robustness for cases where the optimizer might fail + if not all(isinstance(arr, np.ndarray) for arr in decomposition) or not decomposition: + raise ValueError("Decomposition must be a tuple of NumPy arrays.") + if any(arr.size == 0 for arr in decomposition): + print("Warning: One or more decomposition arrays are empty. Verification skipped.") + return + + # Check that each factor matrix has the correct shape. + factor_matrix_1, factor_matrix_2, factor_matrix_3 = decomposition + if factor_matrix_1.shape != (n * m, rank): + raise ValueError( + f"Expected shape of factor matrix 1 is {(n * m, rank)}. Actual shape is {factor_matrix_1.shape}." + ) + if factor_matrix_2.shape != (m * p, rank): + raise ValueError( + f"Expected shape of factor matrix 2 is {(m * p, rank)}. Actual shape is {factor_matrix_2.shape}." + ) + if factor_matrix_3.shape != (n * p, rank): + raise ValueError( + f"Expected shape of factor matrix 3 is {(n * p, rank)}. Actual shape is {factor_matrix_3.shape}." + ) + + # Form the matrix multiplication tensor . + matmul_tensor = np.zeros((n * m, m * p, n * p), dtype=np.float32) + for i in range(n): + for j in range(m): + for k in range(p): + # Use the standard k*n+i indexing for the third dimension + matmul_tensor[i * m + j, j * p + k, k * n + i] = 1 + + # Check that the tensor is correctly constructed. + constructed_tensor = np.einsum("ir,jr,kr -> ijk", *decomposition) + + # Exact check + if not np.array_equal(constructed_tensor, matmul_tensor): + # If the exact check fails, report the floating-point difference for diagnostics. + diff = np.max(np.abs(constructed_tensor - matmul_tensor)) + raise ValueError( + f"Tensor constructed by decomposition does not exactly match the target tensor. Maximum difference is {diff:.6e}." + ) + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + decomposition, n, m, p, loss, rank = program.run() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_tensor_decomposition(decomposition, n, m, p, rank) + + success_threshold = 1e-6 + if loss > success_threshold: + print( + f"\nWarning: Final loss {loss:.2e} is above the success threshold of {success_threshold:.2e}." + ) + + inverse_rank = BENCHMARK / rank + + return { + "combined_score": inverse_rank, + "loss": loss, + "rank": rank, + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/matmul/evaluator/requirements.txt b/benchmarks/math/matmul/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/matmul/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/matmul/evaluator/wrapper.py b/benchmarks/math/matmul/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/matmul/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
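+    # Bools and numbers become float metrics; strings pass through as artifacts;
+    # lists/dicts are JSON-encoded into artifacts. Other value types are dropped.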
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/matmul/initial_program.py b/benchmarks/math/matmul/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c30df06ba319dbc7bc9f1880aa735aaf43dc79 --- /dev/null +++ b/benchmarks/math/matmul/initial_program.py @@ -0,0 +1,199 @@ +# Disable progress bar for cleaner output logs +import os + +os.environ["TQDM_DISABLE"] = "1" + +# Fixed parameters +n, m, p = 2, 4, 5 + +# EVOLVE-BLOCK-START +import numpy as np +import jax +import jax.numpy as jnp +import optax +from dataclasses import dataclass +import tqdm + + +# --- Straight-Through Estimator for Rounding --- +@jax.custom_vjp +def round_to_half_ste(x): + """Forward pass: snaps values to the nearest half-integer.""" + return jnp.round(x * 2) / 2 + + +def round_ste_fwd(x): + """Standard forward pass and identity for backward pass.""" + return round_to_half_ste(x), None + + +def round_ste_bwd(res, g): + """Backward pass: Identity function, passes gradient straight through.""" + return (g,) + + +round_to_half_ste.defvjp(round_ste_fwd, round_ste_bwd) +# --- End of STE definition --- + + +# --- Loss Functions --- +def weighted_l2_loss(reconstructed: jnp.ndarray, target: jnp.ndarray) -> jnp.ndarray: + error = reconstructed - target + weights = jnp.where(target != 0, 100.0, 1.0) + return jnp.mean(weights * (error**2)) + + +def l2_loss_real(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray: + return jnp.mean((x - y) ** 2) + + +# --- Hyperparameters --- +@dataclass +class Hyperparameters: + rank: int = 55 + # Phase 1: Continuous Search + num_restarts: int = 10 + phase1_steps: int = 80000 + phase1_lr: float = 0.01 + init_scale: float = 0.1 + l1_strength: float = 1e-6 + clamp_range: float = 4.0 + # Phase 2: Discrete Fine-tuning + phase2_steps: int = 20000 + phase2_lr: float = 1e-4 # A much smaller learning rate for fine-tuning + + +# --- Optimizer Classes --- +class ContinuousOptimizer: + """Finds a high-quality approximate continuous solution.""" + + def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters): + self.target_tensor = target_tensor + self.hypers = hypers + self.opt = optax.adam(hypers.phase1_lr) + + def _get_constrained_decomposition(self, latent_decomposition: tuple) -> tuple: + """Applies a scaled tanh to map latent parameters to the desired range.""" + return jax.tree_util.tree_map( + lambda x: self.hypers.clamp_range * jnp.tanh(x), latent_decomposition + ) + + def _loss_fn(self, latent_decomposition: tuple) -> jnp.ndarray: + constrained = self._get_constrained_decomposition(latent_decomposition) + reconstructed = jnp.einsum("ir,jr,kr->ijk", *constrained) + recon_loss = weighted_l2_loss(reconstructed, self.target_tensor) + l1_penalty = sum(jnp.mean(jnp.abs(arr)) for arr in constrained) + return recon_loss + self.hypers.l1_strength * l1_penalty + + +class DiscreteOptimizer: + """Refines a continuous solution into an exact discrete one using an STE.""" + + def __init__(self, 
target_tensor: jnp.ndarray, hypers: Hyperparameters): + self.target_tensor = target_tensor + self.hypers = hypers + self.opt = optax.adam(hypers.phase2_lr) + + def _loss_fn(self, continuous_decomposition: tuple) -> jnp.ndarray: + # Snap the continuous parameters to the discrete grid + discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_decomposition) + # Compute the loss using only these exact half-integer values + reconstructed = jnp.einsum("ir,jr,kr->ijk", *discrete_decomposition) + return l2_loss_real(reconstructed, self.target_tensor) + + +# --- JIT-compatible Train Step --- +def train_step(params, opt_state, optimizer, loss_fn): + loss, grads = jax.value_and_grad(loss_fn)(params) + updates, opt_state = optimizer.update(grads, opt_state, params) + params = optax.apply_updates(params, updates) + return params, opt_state, loss + + +def get_matrix_multiplication_tensor(n, m, p): + T = jnp.zeros((n * m, m * p, n * p)) + for i, j, k in np.ndindex(n, m, p): + T = T.at[i * m + j, j * p + k, k * n + i].set(1) + return T + + +def run(): + hypers = Hyperparameters() + target_tensor = get_matrix_multiplication_tensor(n, m, p) + main_key = jax.random.PRNGKey(42) + + # --- PHASE 1: CONTINUOUS EXPLORATION --- + print(f"\n{'='*20} PHASE 1: Continuous Exploration {'='*20}") + best_loss_phase1 = float("inf") + best_latent_decomp = None + + continuous_optimizer = ContinuousOptimizer(target_tensor, hypers) + + # JIT the train_step for the continuous phase + jit_train_step_continuous = jax.jit(train_step, static_argnums=(2, 3)) + + for i in range(hypers.num_restarts): + print(f"\n--- Restart {i+1}/{hypers.num_restarts} ---") + main_key, restart_key = jax.random.split(main_key) + init_fn = jax.nn.initializers.normal(stddev=hypers.init_scale) + latent_decomp = ( + init_fn(restart_key, (n * m, hypers.rank)), + init_fn(restart_key, (m * p, hypers.rank)), + init_fn(restart_key, (n * p, hypers.rank)), + ) + opt_state = continuous_optimizer.opt.init(latent_decomp) + + for _ in tqdm.tqdm(range(hypers.phase1_steps), desc="Continuous Search"): + latent_decomp, opt_state, loss = jit_train_step_continuous( + latent_decomp, + opt_state, + continuous_optimizer.opt, + continuous_optimizer._loss_fn, + ) + + final_loss = l2_loss_real( + target_tensor, + jnp.einsum( + "ir,jr,kr->ijk", + *continuous_optimizer._get_constrained_decomposition(latent_decomp), + ), + ) + print(f"End of Trial | Final continuous loss: {final_loss:.8f}") + + if final_loss < best_loss_phase1: + best_loss_phase1 = final_loss + best_latent_decomp = latent_decomp + + # --- PHASE 2: DISCRETE FINE-TUNING --- + print(f"\n{'='*20} PHASE 2: Discrete Fine-tuning (STE) {'='*20}") + print(f"Starting with best continuous solution (loss: {best_loss_phase1:.8f})") + + continuous_params = continuous_optimizer._get_constrained_decomposition(best_latent_decomp) + + discrete_optimizer = DiscreteOptimizer(target_tensor, hypers) + opt_state = discrete_optimizer.opt.init(continuous_params) + + # JIT the train_step for the discrete phase + jit_train_step_discrete = jax.jit(train_step, static_argnums=(2, 3)) + + for step in tqdm.tqdm(range(hypers.phase2_steps), desc="Discrete Fine-tuning"): + continuous_params, opt_state, loss = jit_train_step_discrete( + continuous_params, opt_state, discrete_optimizer.opt, discrete_optimizer._loss_fn + ) + if (step + 1) % 2000 == 0: + print(f"Step {step+1} | Discrete Loss: {loss:.8f}") + if loss < 1e-7: + print("\nFound a perfect solution!") + break + + final_discrete_decomposition = 
jax.tree_util.tree_map(round_to_half_ste, continuous_params) + final_loss = l2_loss_real( + target_tensor, jnp.einsum("ir,jr,kr->ijk", *final_discrete_decomposition) + ) + print(f"Search complete. Final discrete loss: {final_loss:.8f}") + + final_decomposition_np = jax.tree_util.tree_map(np.array, final_discrete_decomposition) + return final_decomposition_np, n, m, p, float(final_loss), hypers.rank + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/minimizing_max_min_dist/2/config.yaml b/benchmarks/math/minimizing_max_min_dist/2/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2cccde5812635c07b2e1390c45bdd90e087cab6a --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/config.yaml @@ -0,0 +1,29 @@ +# Math benchmark: minimizing_max_min_dist/2 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist focusing on point dispersion\ + \ problems.\nYour task is to evolve a constructor function that generates an optimal arrangement of exactly 16 points\ + \ in 2D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.\n\nPROBLEM CONTEXT:\n\ + - Target: Beat the AlphaEvolve benchmark of min/max ratio = 1/√12.889266112 ≈ 0.2786\n- Constraint: Points must be placed\ + \ in 2D Euclidean space (typically normalized to unit square [0,1] × [0,1])\n- Mathematical formulation: For points Pi\ + \ = (xi, yi), i = 1,...,16:\n * Distance matrix: dij = √[(xi-xj)² + (yi-yj)²] for all i≠j\n * Minimum distance: dmin\ + \ = min{dij : i≠j}\n * Maximum distance: dmax = max{dij : i≠j}\n * Objective: maximize dmin/dmax subject to spatial\ + \ constraints\n\nPERFORMANCE METRICS:\n1. **min_max_ratio**: dmin/dmax ratio (PRIMARY OBJECTIVE - maximize)\n2. **combined_score**:\ + \ min_max_ratio / 0.2786 (progress toward beating AlphaEvolve benchmark)\n3. **eval_time**: Execution time in seconds\ + \ (balance accuracy vs. efficiency)\n\nTECHNICAL REQUIREMENTS:\n- **Reproducibility**: Fixed random seeds for all stochastic\ + \ components\n" +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile b/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . 
+RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..094ac2fb911487ddefc2373309eaf501bb0a9687 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py @@ -0,0 +1,78 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for problem of minimizing the ratio of maximum +# to minimum distance on dimension 2 and with 16 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import scipy as sp +import time +import numpy as np + +NUM_POINTS = 16 +DIMENSION = 2 +BENCHMARK = 1 / 12.889266112 + +# Scoring: (dmin/dmax)^2. +# Key reformulation: maximize auxiliary variable t +# subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j). +# This is a constrained NLP with O(n^2) pairwise inequality constraints. + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + points = program.min_max_dist_dim2_16() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, DIMENSION): + raise ValueError( + f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}" + ) + + pairwise_distances = sp.spatial.distance.pdist(points) + min_distance = np.min(pairwise_distances) + max_distance = np.max(pairwise_distances) + + inv_ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0 + return { + "min_max_ratio": float(inv_ratio_squared), + "combined_score": float(inv_ratio_squared / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt b/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5576e19feaf684e56c8fd6f43f64cef3f800e53d --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy \ No newline at end of file diff --git a/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py b/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/minimizing_max_min_dist/2/initial_program.py b/benchmarks/math/minimizing_max_min_dist/2/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..9348ce431ecab8e13a0ed58c8f47d43d7e7db9de --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/2/initial_program.py @@ -0,0 +1,24 @@ +# EVOLVE-BLOCK-START +import numpy as np + + +def min_max_dist_dim2_16() -> np.ndarray: + """ + Creates 16 points in 2 dimensions in order to maximize the ratio of minimum to maximum distance. + + Returns + points: np.ndarray of shape (16,2) containing the (x,y) coordinates of the 16 points. + + """ + + n = 16 + d = 2 + + # places points randomly + np.random.seed(42) + points = np.random.randn(n, d) + + return points + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/minimizing_max_min_dist/3/config.yaml b/benchmarks/math/minimizing_max_min_dist/3/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06094abafc75bff2234417206743f2921a4635f2 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/config.yaml @@ -0,0 +1,29 @@ +# Math benchmark: minimizing_max_min_dist/3 +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: "SETTING:\nYou are an expert computational geometer and optimization specialist focusing on 3D point dispersion\ + \ problems.\nYour task is to evolve a constructor function that generates an optimal arrangement of exactly 14 points\ + \ in 3D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.\n\nPROBLEM CONTEXT:\n\ + - Target: Beat the current state-of-the-art benchmark of min/max ratio = 1/√4.165849767 ≈ 0.4898\n- Constraint: Points\ + \ must be placed in 3D Euclidean space (typically normalized to unit cube [0,1]³ or unit sphere)\n- Mathematical formulation:\ + \ For points Pi = (xi, yi, zi), i = 1,...,14:\n * Distance matrix: dij = √[(xi-xj)² + (yi-yj)² + (zi-zj)²] for all i≠j\n\ + \ * Minimum distance: dmin = min{dij : i≠j}\n * Maximum distance: dmax = max{dij : i≠j}\n * Objective: maximize dmin/dmax\ + \ subject to spatial constraints\n\nPERFORMANCE METRICS:\n1. **min_max_ratio**: dmin/dmax ratio (PRIMARY OBJECTIVE - maximize)\n\ + 2. **combined_score**: min_max_ratio / 0.4898 (progress toward beating AlphaEvolve benchmark)\n3. **eval_time**: Execution\ + \ time in seconds (balance accuracy vs. 
efficiency)\n\nTECHNICAL REQUIREMENTS:\n- **Reproducibility**: Fixed random seeds\ + \ for all stochastic components\n" +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile b/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..05b53639da88ca4a5be745b7435c30fab9703fe0 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/evaluator.py @@ -0,0 +1,78 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for problem of minimizing the ratio of maximum +# to minimum distance on dimension 3 and with 14 points. +# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import scipy as sp +import time +import numpy as np + +NUM_POINTS = 14 +DIMENSION = 3 +BENCHMARK = 1 / 4.165849767 + +# Scoring: (dmin/dmax)^2. +# Key reformulation: maximize auxiliary variable t +# subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j). +# This is a constrained NLP with O(n^2) pairwise inequality constraints. 
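+# Illustrative form of that reformulation (not solved by this evaluator):
+#     maximize   t
+#     subject to ||P_i - P_j||^2 >= t  and  ||P_i - P_j||^2 <= 1  for all i < j,
+# i.e. 2 * C(14, 2) = 182 pairwise constraints for this 14-point instance.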
+ + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + points = program.min_max_dist_dim3_14() + end_time = time.time() + eval_time = end_time - start_time + except Exception as err: + raise err + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + if not isinstance(points, np.ndarray): + points = np.array(points) + + if points.shape != (NUM_POINTS, DIMENSION): + raise ValueError( + f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}" + ) + + pairwise_distances = sp.spatial.distance.pdist(points) + min_distance = np.min(pairwise_distances) + max_distance = np.max(pairwise_distances) + + inv_ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0 + return { + "min_max_ratio": float(inv_ratio_squared), + "combined_score": float(inv_ratio_squared / BENCHMARK), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt b/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5576e19feaf684e56c8fd6f43f64cef3f800e53d --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy \ No newline at end of file diff --git a/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py b/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/minimizing_max_min_dist/3/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
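+    # The original stdout handle is saved so the final JSON can be written to it.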
+    real_stdout = sys.stdout
+    sys.stdout = sys.stderr
+    try:
+        result = evaluate_fn(program_path)
+    except Exception as e:
+        sys.stdout = real_stdout
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                }
+            )
+        )
+        return
+    sys.stdout = real_stdout
+
+    if not isinstance(result, dict):
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                    },
+                }
+            )
+        )
+        return
+
+    # Separate numeric metrics from non-numeric artifacts.
+    metrics = {}
+    artifacts = {}
+    for k, v in result.items():
+        if isinstance(v, bool):
+            metrics[k] = float(v)
+        elif isinstance(v, (int, float)):
+            metrics[k] = float(v)
+        elif isinstance(v, str):
+            artifacts[k] = v
+        elif isinstance(v, (list, dict)):
+            artifacts[k] = json.dumps(v)
+
+    if "combined_score" not in metrics:
+        metrics["combined_score"] = 0.0
+
+    status = "error" if "error" in artifacts else "success"
+    output = {
+        "status": status,
+        "combined_score": metrics["combined_score"],
+        "metrics": metrics,
+    }
+    if artifacts:
+        output["artifacts"] = artifacts
+
+    print(json.dumps(output))
diff --git a/benchmarks/math/minimizing_max_min_dist/3/initial_program.py b/benchmarks/math/minimizing_max_min_dist/3/initial_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..d58a3179efe75808f5031fc805016324fe2cad8b
--- /dev/null
+++ b/benchmarks/math/minimizing_max_min_dist/3/initial_program.py
@@ -0,0 +1,24 @@
+# EVOLVE-BLOCK-START
+import numpy as np
+
+
+def min_max_dist_dim3_14() -> np.ndarray:
+    """
+    Creates 14 points in 3 dimensions in order to maximize the ratio of minimum to maximum distance.
+
+    Returns
+        points: np.ndarray of shape (14,3) containing the (x, y, z) coordinates of the 14 points.
+
+    """
+
+    n = 14
+    d = 3
+
+    # places points randomly
+    np.random.seed(42)
+    points = np.random.randn(n, d)
+
+    return points
+
+
+# EVOLVE-BLOCK-END
diff --git a/benchmarks/math/second_autocorr_ineq/config.yaml b/benchmarks/math/second_autocorr_ineq/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f50af73bbdcc158504ec4c6f17fac2c1ef80717d
--- /dev/null
+++ b/benchmarks/math/second_autocorr_ineq/config.yaml
@@ -0,0 +1,61 @@
+# Math benchmark: second_autocorr_ineq
+# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s
+language: python
+diff_based_generation: true
+max_iterations: 100
+checkpoint_interval: 10
+max_solution_length: 60000
+llm:
+  api_base: https://api.openai.com/v1
+  models:
+    - name: "gpt-5"
+      weight: 1.0
+      max_tokens: 32000
+      timeout: 600
+prompt:
+  system_message: "SETTING:\nYou are a world-class expert in functional analysis, harmonic analysis, numerical optimization,\
+    \ and AI-driven mathematical discovery.
Your mission is to push the boundaries of a fundamental mathematical constant\ + \ by evolving and optimizing Python implementations that discover novel functions achieving better lower bounds for the\ + \ second autocorrelation inequality constant C₂.\n\nMATHEMATICAL PROBLEM CONTEXT:\n**Core Problem**: Find a non-negative\ + \ function f: ℝ → ℝ that maximizes the constant C₂ in the second autocorrelation inequality:\n||f ★ f||₂² ≤ C₂ ||f ★ f||₁\ + \ ||f ★ f||_{∞}\n\n**Mathematical Framework**:\n- Objective: Maximize C₂ = ||f ★ f||₂² / (||f ★ f||₁ ||f ★ f||_{∞})\n\ + - Key simplification: ||f ★ f||₁ = (∫f)², reducing to C₂ = ||f ★ f||₂² / ((∫f)² ||f ★ f||_{∞})\n- Convolution definition:\ + \ (f ★ f)(x) = ∫_{-∞}^{∞} f(t)f(x-t) dt\n- Norms: ||g||₁ = ∫|g|, ||g||₂ = (∫|g|²)^{1/2}, ||g||_{∞} = sup|g|\n- Constraints:\ + \ f(x) ≥ 0 for all x ∈ ℝ, ∫f > 0\n\n**Historical Context & Current State**:\n- Theoretical bounds: 0.88922 ≤ C₂ ≤ 1 (Young's\ + \ inequality provides upper bound)\n- Current best lower bound: **0.8962799441554086** (achieved by Google's AlphaEvolve\ + \ using step functions)\n- **Target**: Surpass 0.8962799441554086 to establish a new world record\n- Mathematical significance:\ + \ This constant appears in harmonic analysis and has connections to the uncertainty principle\n\n**Known Function Classes\ + \ & Their Performance**:\n- Gaussian functions: ~0.886\n- Exponential decay: ~0.885\n- Step functions: 0.8962799441554086\ + \ (current champion)\n- Polynomial decay: Various results < 0.89\n- Spline functions: Unexplored potential\n- Piecewise\ + \ functions: High promise based on step function success\n\nPERFORMANCE METRICS & SUCCESS CRITERIA:\n**Primary Objective**:\n\ + - c2: The C₂ constant achieved (MAXIMIZE THIS - any value > 0.8962799441554086 is groundbreaking)\n\n**Secondary Metrics**:\n\ + - combined_score: c2 / 0.8962799441554086 (>1.0 means new world record)\n- convergence_stability: Consistency across multiple\ + \ runs\n- function_complexity: Number of parameters/pieces in the discovered function\n- computational_efficiency: Time\ + \ to convergence\n\n**Diagnostic Metrics**:\n- loss: Final optimization loss value\n- n_points: Discretization resolution\ + \ used\n- eval_time: Total execution time\n- gradient_norm: Final gradient magnitude (for gradient-based methods)\n\n\ + COMPUTATIONAL RESOURCES & IMPLEMENTATION STACK:\n**Core Mathematical Libraries**: \n- numpy, scipy (optimization, integration,\ + \ FFT for convolutions)\n- sympy (symbolic computation, analytical derivatives)\n- jax (automatic differentiation, GPU\ + \ acceleration)\n- torch (deep learning optimization, autograd)\n\n**Optimization & ML Libraries**:\n- optax (advanced\ + \ optimizers), scikit-learn (preprocessing, clustering)\n- numba (JIT compilation for speed)\n\n**Data & Analysis**:\n\ + - pandas (results analysis), matplotlib/plotly (visualization)\n- networkx (if exploring graph-based function representations)\n\ + \n**Suggested Advanced Packages** (if available):\n- cvxpy (convex optimization), autograd, casadi (optimal control)\n\ + - tensorflow-probability (probabilistic methods)\n- pymoo (multi-objective optimization)\n\nTECHNICAL REQUIREMENTS & BEST\ + \ PRACTICES:\n**Reproducibility (CRITICAL)**:\n- Fixed random seeds for ALL stochastic components: `numpy.random.seed(42)`,\ + \ `torch.manual_seed(42)`\n- Version control: Document package versions used\n- Deterministic algorithms preferred; if\ + \ stochastic, average over multiple seeds\n\n**Function Constraints**:\n- f(x) ≥ 0 
everywhere (use softplus, exponential,\ + \ or squared transformations)\n- ∫f > 0 (non-trivial function requirement)\n- Numerical stability: Avoid functions causing\ + \ overflow in convolution computation\n\n**Computational Efficiency**:\n- Leverage FFT for convolution when possible:\ + \ O(n log n) vs O(n²)\n- Use JAX for GPU acceleration and automatic differentiation\n- Implement adaptive discretization:\ + \ start coarse, refine around promising regions\n- Memory management: Handle large convolution arrays efficiently\n\n\ + STRATEGIC APPROACHES & INNOVATION DIRECTIONS:\n**Optimization Strategies**:\n1. **Multi-scale approach**: Optimize on\ + \ coarse grid, then refine\n2. **Ensemble methods**: Combine multiple promising functions\n3. **Adaptive parametrization**:\ + \ Start simple, increase complexity gradually\n4. **Basin hopping**: Global optimization with local refinement\n\n**Function\ + \ Representation Ideas**:\n1. **Learned basis functions**: Neural networks with mathematical priors\n2. **Spline optimization**:\ + \ B-splines with optimized knot positions\n3. **Fourier space**: Optimize Fourier coefficients with positivity constraints\n\ + 4. **Mixture models**: Weighted combinations of simple functions\n5. **Fractal/self-similar**: Exploit scale invariance\ + \ properties\n\n**Advanced Mathematical Techniques**:\n- Variational calculus: Derive optimality conditions analytically\n\ + - Spectral methods: Leverage eigenfunction decompositions\n- Convex relaxations: Handle non-convex constraints systematically\n\ + - Symmetry exploitation: Use even functions (f(-x) = f(x)) to reduce complexity\n" +evaluator: + timeout: 360 + max_retries: 3 diff --git a/benchmarks/math/second_autocorr_ineq/evaluator/Dockerfile b/benchmarks/math/second_autocorr_ineq/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/second_autocorr_ineq/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/second_autocorr_ineq/initial_program.py b/benchmarks/math/second_autocorr_ineq/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..020fb8803ef559c762d73bbb41af2bfcb489c521 --- /dev/null +++ b/benchmarks/math/second_autocorr_ineq/initial_program.py @@ -0,0 +1,110 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass + + +@dataclass +class Hyperparameters: + """Hyperparameters for the optimization process.""" + + num_intervals: int = 50 + learning_rate: float = 0.01 + num_steps: int = 15000 + warmup_steps: int = 1000 + + +class C2Optimizer: + """ + Optimizes a discretized function to find a lower bound for the C2 constant + using the rigorous, unitless, piecewise-linear integral method. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + + def _objective_fn(self, f_values: jnp.ndarray) -> jnp.ndarray: + """ + Computes the objective function using the unitless norm calculation. 
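+        Returns the negative C2 ratio, so minimizing the training loss maximizes C2.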
+ """ + f_non_negative = jax.nn.relu(f_values) + + # Unscaled discrete autoconvolution + N = self.hypers.num_intervals + padded_f = jnp.pad(f_non_negative, (0, N)) + fft_f = jnp.fft.fft(padded_f) + convolution = jnp.fft.ifft(fft_f * fft_f).real + + # Calculate L2-norm squared of the convolution (rigorous method) + num_conv_points = len(convolution) + h = 1.0 / (num_conv_points + 1) + y_points = jnp.concatenate([jnp.array([0.0]), convolution, jnp.array([0.0])]) + y1, y2 = y_points[:-1], y_points[1:] + l2_norm_squared = jnp.sum((h / 3) * (y1**2 + y1 * y2 + y2**2)) + + # Calculate L1-norm of the convolution + norm_1 = jnp.sum(jnp.abs(convolution)) / (len(convolution) + 1) + + # Calculate infinity-norm of the convolution + norm_inf = jnp.max(jnp.abs(convolution)) + + # Calculate C2 ratio + denominator = norm_1 * norm_inf + c2_ratio = l2_norm_squared / denominator + + # We want to MAXIMIZE C2, so the optimizer must MINIMIZE its negative. + return -c2_ratio + + def train_step(self, f_values: jnp.ndarray, opt_state: optax.OptState) -> tuple: + """Performs a single training step.""" + loss, grads = jax.value_and_grad(self._objective_fn)(f_values) + updates, opt_state = self.optimizer.update(grads, opt_state, f_values) + f_values = optax.apply_updates(f_values, updates) + return f_values, opt_state, loss + + def run_optimization(self): + """Sets up and runs the full optimization process.""" + schedule = optax.warmup_cosine_decay_schedule( + init_value=0.0, + peak_value=self.hypers.learning_rate, + warmup_steps=self.hypers.warmup_steps, + decay_steps=self.hypers.num_steps - self.hypers.warmup_steps, + end_value=self.hypers.learning_rate * 1e-4, + ) + self.optimizer = optax.adam(learning_rate=schedule) + + key = jax.random.PRNGKey(42) + f_values = jax.random.uniform(key, (self.hypers.num_intervals,)) + + opt_state = self.optimizer.init(f_values) + print( + f"Number of intervals (N): {self.hypers.num_intervals}, Steps: {self.hypers.num_steps}" + ) + train_step_jit = jax.jit(self.train_step) + + loss = jnp.inf + for step in range(self.hypers.num_steps): + f_values, opt_state, loss = train_step_jit(f_values, opt_state) + if step % 1000 == 0 or step == self.hypers.num_steps - 1: + print(f"Step {step:5d} | C2 ≈ {-loss:.8f}") + + final_c2 = -self._objective_fn(f_values) + print(f"Final C2 lower bound found: {final_c2:.8f}") + return jax.nn.relu(f_values), final_c2 + + +def run(): + """Entry point for running the optimization.""" + hypers = Hyperparameters() + optimizer = C2Optimizer(hypers) + optimized_f, final_c2_val = optimizer.run_optimization() + + loss_val = -final_c2_val + f_values_np = np.array(optimized_f) + + return f_values_np, float(final_c2_val), float(loss_val), hypers.num_intervals + + +# EVOLVE-BLOCK-END diff --git a/benchmarks/math/signal_processing/README.md b/benchmarks/math/signal_processing/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7494ee3ca15bfc15c3e2eea507c46cbf32b65a51 --- /dev/null +++ b/benchmarks/math/signal_processing/README.md @@ -0,0 +1,46 @@ +# Real-Time Adaptive Signal Processing + +Evolve a real-time adaptive filtering algorithm for non-stationary time series data. The algorithm must filter noise while preserving signal dynamics and minimizing computational latency. + +## Problem + +**Input**: Univariate time series with non-linear dynamics, non-stationary statistics, and rapidly changing spectral characteristics. + +**Constraints**: Causal processing (finite sliding window), fixed latency, real-time capability. 
+ +**Multi-objective function**: +``` +J(theta) = 0.3*S + 0.2*L_recent + 0.2*L_avg + 0.3*R +``` +- **S**: Slope change penalty (directional reversals in filtered signal) +- **L_recent**: Instantaneous lag error +- **L_avg**: Average tracking error +- **R**: False reversal penalty (noise-induced trend changes) + +The evaluator tests on 5 synthetic signals: sinusoidal, multi-frequency, non-stationary, step changes, and random walk. + +## Run + +```bash +# From repo root +uv run skydiscover-run \ + benchmarks/math/signal_processing/initial_program.py \ + benchmarks/math/signal_processing/evaluator.py \ + -c benchmarks/math/signal_processing/config.yaml \ + -s [your_algorithm] \ + -i 100 +``` + +## Scoring + +- **combined_score**: Composite J(theta) metric (higher is better) +- Also reports: slope changes, correlation, lag error, noise reduction, processing time + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Seed: basic moving average / weighted exponential filters | +| `evaluator.py` | Multi-objective evaluation across 5 synthetic test signals | +| `config.yaml` | LLM and evaluator settings | +| `requirements.txt` | Python dependencies | diff --git a/benchmarks/math/signal_processing/config.yaml b/benchmarks/math/signal_processing/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e7f684648c48b70a19a99265ffab8e552f8bad9 --- /dev/null +++ b/benchmarks/math/signal_processing/config.yaml @@ -0,0 +1,30 @@ +# Math benchmark: signal_processing +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: 'You are an expert signal processing engineer specializing in real-time adaptive filtering algorithms. Your + task is to improve a signal processing algorithm that filters volatile, non-stationary time series data using a sliding + window approach. The algorithm must minimize noise while preserving signal dynamics with minimal computational latency + and phase delay. Focus on the multi-objective optimization of: (1) Slope change minimization - reducing spurious directional + reversals, (2) Lag error minimization - maintaining responsiveness, (3) Tracking accuracy - preserving genuine signal + trends, and (4) False reversal penalty - avoiding noise-induced trend changes. Consider advanced techniques like adaptive + filtering (Kalman filters, particle filters), multi-scale processing (wavelets, EMD), predictive enhancement (polynomial + fitting, neural networks), and trend detection methods.' +evaluator: + timeout: 360 + cascade_evaluation: true + cascade_thresholds: + - 0.3 + - 0.6 + diff --git a/benchmarks/math/signal_processing/evaluator/Dockerfile b/benchmarks/math/signal_processing/evaluator/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim +WORKDIR /benchmark + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# wrapper.py provides backwards compatibility for old Python-based evaluators +# that define evaluate(program_path) -> dict. Bridges them to the container +# JSON protocol. 
Source of truth: skydiscover/evaluation/wrapper.py +COPY . . +RUN chmod +x evaluate.sh + +ENTRYPOINT ["./evaluate.sh"] diff --git a/benchmarks/math/signal_processing/evaluator/evaluate.sh b/benchmarks/math/signal_processing/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/signal_processing/evaluator/evaluator.py b/benchmarks/math/signal_processing/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..e19d989064b39845e3c39d03bf6478d08e0d5d40 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/evaluator.py @@ -0,0 +1,536 @@ +""" +Evaluator for the Real-Time Adaptive Signal Processing Algorithm + +This evaluator implements the multi-objective optimization function defined in the specification: +J(θ) = α₁·S(θ) + α₂·L_recent(θ) + α₃·L_avg(θ) + α₄·R(θ) + +Where: +- S(θ): Slope change penalty - counts directional reversals +- L_recent(θ): Instantaneous lag error - |y[n] - x[n]| +- L_avg(θ): Average tracking error over window +- R(θ): False reversal penalty - mismatched trend changes +- α₁=0.3, α₂=α₃=0.2, α₄=0.3: Weighting coefficients +""" + +import importlib.util +import numpy as np +import time +import concurrent.futures +import traceback +from scipy import signal +from scipy.stats import pearsonr + + +def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30): + """ + Run a function with a timeout using concurrent.futures + """ + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: + future = executor.submit(func, *args, **kwargs) + try: + result = future.result(timeout=timeout_seconds) + return result + except concurrent.futures.TimeoutError: + raise TimeoutError(f"Function timed out after {timeout_seconds} seconds") + + +def safe_float(value): + """Convert a value to float safely""" + try: + if np.isnan(value) or np.isinf(value): + return 0.0 + return float(value) + except (TypeError, ValueError): + return 0.0 + + +def calculate_slope_changes(signal_data): + """ + Calculate slope change penalty S(θ) - counts directional reversals + + Args: + signal_data: 1D array of signal values + + Returns: + Number of slope changes (directional reversals) + """ + if len(signal_data) < 3: + return 0 + + # Calculate differences + diffs = np.diff(signal_data) + + # Count sign changes in consecutive differences + sign_changes = 0 + for i in range(1, len(diffs)): + if np.sign(diffs[i]) != np.sign(diffs[i - 1]) and diffs[i - 1] != 0: + sign_changes += 1 + + return sign_changes + + +def calculate_lag_error(filtered_signal, original_signal, window_size): + """ + Calculate instantaneous lag error L_recent(θ) = |y[n] - x[n]| + + Args: + filtered_signal: Output of the filter + original_signal: Original input signal + window_size: Size of the processing window + + Returns: + Instantaneous lag error at the most recent sample + """ + if len(filtered_signal) == 0: + return 1.0 # Maximum penalty + + # Account for processing delay + delay = window_size - 1 + if len(original_signal) <= delay: + return 1.0 + + # Compare the last filtered sample with the corresponding original sample + recent_filtered = filtered_signal[-1] + recent_original = original_signal[delay + 
len(filtered_signal) - 1] + + return abs(recent_filtered - recent_original) + + +def calculate_average_tracking_error(filtered_signal, original_signal, window_size): + """ + Calculate average tracking error L_avg(θ) over the window + + Args: + filtered_signal: Output of the filter + original_signal: Original input signal + window_size: Size of the processing window + + Returns: + Average absolute error over the processed samples + """ + if len(filtered_signal) == 0: + return 1.0 # Maximum penalty + + # Account for processing delay + delay = window_size - 1 + if len(original_signal) <= delay: + return 1.0 + + # Align signals + aligned_original = original_signal[delay : delay + len(filtered_signal)] + + # Ensure same length + min_length = min(len(filtered_signal), len(aligned_original)) + if min_length == 0: + return 1.0 + + filtered_aligned = filtered_signal[:min_length] + original_aligned = aligned_original[:min_length] + + # Calculate mean absolute error + return np.mean(np.abs(filtered_aligned - original_aligned)) + + +def calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size): + """ + Calculate false reversal penalty R(θ) - mismatched trend changes + + Args: + filtered_signal: Output of the filter + clean_signal: Ground truth clean signal + window_size: Size of the processing window + + Returns: + Penalty for trend changes that don't match the clean signal + """ + if len(filtered_signal) < 3 or len(clean_signal) < 3: + return 0 + + # Account for processing delay + delay = window_size - 1 + if len(clean_signal) <= delay: + return 1.0 + + # Align signals + aligned_clean = clean_signal[delay : delay + len(filtered_signal)] + min_length = min(len(filtered_signal), len(aligned_clean)) + + if min_length < 3: + return 0 + + filtered_aligned = filtered_signal[:min_length] + clean_aligned = aligned_clean[:min_length] + + # Calculate trend changes for both signals + filtered_diffs = np.diff(filtered_aligned) + clean_diffs = np.diff(clean_aligned) + + # Count mismatched trend changes + false_reversals = 0 + for i in range(1, len(filtered_diffs)): + # Check if there's a trend change in filtered signal + filtered_change = ( + np.sign(filtered_diffs[i]) != np.sign(filtered_diffs[i - 1]) + and filtered_diffs[i - 1] != 0 + ) + + # Check if there's a corresponding trend change in clean signal + clean_change = ( + np.sign(clean_diffs[i]) != np.sign(clean_diffs[i - 1]) and clean_diffs[i - 1] != 0 + ) + + # Count as false reversal if filtered has change but clean doesn't + if filtered_change and not clean_change: + false_reversals += 1 + + return false_reversals + + +def calculate_composite_score(S, L_recent, L_avg, R, alpha=[0.3, 0.2, 0.2, 0.3]): + """ + Calculate the composite metric J(θ) = α₁·S(θ) + α₂·L_recent(θ) + α₃·L_avg(θ) + α₄·R(θ) + + All metrics are normalized and converted to penalties (higher = worse) + The final score is converted to a maximization problem (higher = better) + """ + # Normalize slope changes (typical range 0-100) + S_norm = min(S / 50.0, 2.0) + + # Lag errors are already in reasonable range (0-10 typically) + L_recent_norm = min(L_recent, 2.0) + L_avg_norm = min(L_avg, 2.0) + + # Normalize false reversals (typical range 0-50) + R_norm = min(R / 25.0, 2.0) + + # Calculate weighted penalty + penalty = ( + alpha[0] * S_norm + alpha[1] * L_recent_norm + alpha[2] * L_avg_norm + alpha[3] * R_norm + ) + + # Convert to maximization score (higher is better) + score = 1.0 / (1.0 + penalty) + + return score + + +def generate_test_signals(num_signals=5): + """ + 
Generate multiple test signals with different characteristics + """ + test_signals = [] + + for i in range(num_signals): + np.random.seed(42 + i) # Different seed for each signal + length = 500 + i * 100 # Varying lengths + noise_level = 0.2 + i * 0.1 # Varying noise levels + + t = np.linspace(0, 10, length) + + # Different signal characteristics + if i == 0: + # Smooth sinusoidal with trend + clean = 2 * np.sin(2 * np.pi * 0.5 * t) + 0.1 * t + elif i == 1: + # Multiple frequency components + clean = ( + np.sin(2 * np.pi * 0.5 * t) + + 0.5 * np.sin(2 * np.pi * 2 * t) + + 0.2 * np.sin(2 * np.pi * 5 * t) + ) + elif i == 2: + # Non-stationary with changing frequency + clean = np.sin(2 * np.pi * (0.5 + 0.2 * t) * t) + elif i == 3: + # Step changes + clean = np.concatenate( + [ + np.ones(length // 3), + 2 * np.ones(length // 3), + 0.5 * np.ones(length - 2 * (length // 3)), + ] + ) + else: + # Random walk with trend + clean = np.cumsum(np.random.randn(length) * 0.1) + 0.05 * t + + # Add noise + noise = np.random.normal(0, noise_level, length) + noisy = clean + noise + + test_signals.append((noisy, clean)) + + return test_signals + + +# Input: run_signal_processing(noisy_signal, window_size) — full signal array and window size. +# Scoring: composite of smoothness, tracking accuracy, correlation, and noise reduction. + +def evaluate(program_path): + """ + Main evaluation function that tests the signal processing algorithm + on multiple test signals and calculates the composite performance metric. + """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "run_signal_processing"): + return {"combined_score": 0.0, "composite_score": 0.0, "error": "Missing run_signal_processing function"} + + # Generate test signals + test_signals = generate_test_signals(5) + + # Collect metrics across all test signals + all_scores = [] + all_metrics = [] + successful_runs = 0 + + for i, (noisy_signal, clean_signal) in enumerate(test_signals): + try: + # Run the algorithm with timeout + start_time = time.time() + + # Call the program's main function + result = run_with_timeout( + program.run_signal_processing, + kwargs={ + "noisy_signal": noisy_signal, + "window_size": 20, + }, + timeout_seconds=10, + ) + + execution_time = time.time() - start_time + + # Validate result format + if not isinstance(result, dict): + print(f"Signal {i}: Invalid result format") + continue + + if "filtered_signal" not in result: + print(f"Signal {i}: Missing filtered_signal in result") + continue + + filtered_signal = result["filtered_signal"] + + if len(filtered_signal) == 0: + print(f"Signal {i}: Empty filtered signal") + continue + + # Convert to numpy arrays + filtered_signal = np.array(filtered_signal) + + # Calculate metrics using the generated test signal + window_size = 20 + + # Calculate all penalty components + S = calculate_slope_changes(filtered_signal) + L_recent = calculate_lag_error(filtered_signal, noisy_signal, window_size) + L_avg = calculate_average_tracking_error(filtered_signal, noisy_signal, window_size) + R = calculate_false_reversal_penalty(filtered_signal, clean_signal, window_size) + + # Calculate composite score + composite_score = calculate_composite_score(S, L_recent, L_avg, R) + + # Additional quality metrics + correlation = 0.0 + noise_reduction = 0.0 + + try: + # Calculate correlation with clean signal + delay = 
window_size - 1 + aligned_clean = clean_signal[delay : delay + len(filtered_signal)] + min_length = min(len(filtered_signal), len(aligned_clean)) + + if min_length > 1: + corr_result = pearsonr( + filtered_signal[:min_length], aligned_clean[:min_length] + ) + correlation = corr_result[0] if not np.isnan(corr_result[0]) else 0.0 + + # Calculate noise reduction + aligned_noisy = noisy_signal[delay : delay + len(filtered_signal)] + aligned_noisy = aligned_noisy[:min_length] + aligned_clean = aligned_clean[:min_length] + + if min_length > 0: + noise_before = np.var(aligned_noisy - aligned_clean) + noise_after = np.var(filtered_signal[:min_length] - aligned_clean) + noise_reduction = ( + (noise_before - noise_after) / noise_before if noise_before > 0 else 0 + ) + noise_reduction = max(0, noise_reduction) # Ensure non-negative + + except Exception as e: + print(f"Signal {i}: Error calculating additional metrics: {e}") + + # Store metrics + metrics = { + "slope_changes": safe_float(S), + "lag_error": safe_float(L_recent), + "avg_error": safe_float(L_avg), + "false_reversals": safe_float(R), + "composite_score": safe_float(composite_score), + "correlation": safe_float(correlation), + "noise_reduction": safe_float(noise_reduction), + "execution_time": safe_float(execution_time), + "signal_length": len(filtered_signal), + } + + all_scores.append(composite_score) + all_metrics.append(metrics) + successful_runs += 1 + + except TimeoutError: + print(f"Signal {i}: Timeout") + continue + except Exception as e: + print(f"Signal {i}: Error - {str(e)}") + continue + + # If no successful runs, return minimal scores + if successful_runs == 0: + return { + "combined_score": 0.0, + "composite_score": 0.0, + "slope_changes": 100.0, + "lag_error": 1.0, + "avg_error": 1.0, + "false_reversals": 50.0, + "correlation": 0.0, + "noise_reduction": 0.0, + "success_rate": 0.0, + "error": "All test signals failed", + } + + # Calculate aggregate metrics + avg_composite_score = np.mean(all_scores) + avg_slope_changes = np.mean([m["slope_changes"] for m in all_metrics]) + avg_lag_error = np.mean([m["lag_error"] for m in all_metrics]) + avg_avg_error = np.mean([m["avg_error"] for m in all_metrics]) + avg_false_reversals = np.mean([m["false_reversals"] for m in all_metrics]) + avg_correlation = np.mean([m["correlation"] for m in all_metrics]) + avg_noise_reduction = np.mean([m["noise_reduction"] for m in all_metrics]) + avg_execution_time = np.mean([m["execution_time"] for m in all_metrics]) + success_rate = successful_runs / len(test_signals) + + # Calculate additional derived scores + smoothness_score = 1.0 / (1.0 + avg_slope_changes / 20.0) # Higher is better + responsiveness_score = 1.0 / (1.0 + avg_lag_error) # Higher is better + accuracy_score = max(0, avg_correlation) # 0-1, higher is better + efficiency_score = min(1.0, 1.0 / max(0.001, avg_execution_time)) # Speed bonus + + # Overall score combining multiple factors + overall_score = ( + 0.4 * avg_composite_score # Primary metric + + 0.2 * smoothness_score # Smoothness + + 0.2 * accuracy_score # Correlation with clean signal + + 0.1 * avg_noise_reduction # Noise reduction capability + + 0.1 * success_rate # Reliability + ) + + # Gate: zero out score if accuracy is too low + if accuracy_score < 0.1: + overall_score = 0.0 + + return { + "combined_score": safe_float(overall_score), # Primary selection metric for SkyDiscover + "composite_score": safe_float(avg_composite_score), + "overall_score": safe_float(overall_score), + "slope_changes": safe_float(avg_slope_changes), 
+ "lag_error": safe_float(avg_lag_error), + "avg_error": safe_float(avg_avg_error), + "false_reversals": safe_float(avg_false_reversals), + "correlation": safe_float(avg_correlation), + "noise_reduction": safe_float(avg_noise_reduction), + "smoothness_score": safe_float(smoothness_score), + "responsiveness_score": safe_float(responsiveness_score), + "accuracy_score": safe_float(accuracy_score), + "efficiency_score": safe_float(efficiency_score), + "execution_time": safe_float(avg_execution_time), + "success_rate": safe_float(success_rate), + } + + except Exception as e: + print(f"Evaluation failed: {str(e)}") + print(traceback.format_exc()) + return {"combined_score": 0.0, "composite_score": 0.0, "overall_score": 0.0, "error": str(e)} + + +def evaluate_stage1(program_path): + """ + Stage 1 evaluation: Quick validation that the program runs without errors + """ + try: + # Load the program + spec = importlib.util.spec_from_file_location("program", program_path) + program = importlib.util.module_from_spec(spec) + spec.loader.exec_module(program) + + # Check if required function exists + if not hasattr(program, "run_signal_processing"): + return {"runs_successfully": 0.0, "error": "Missing run_signal_processing function"} + + # Generate a small test signal (consistent with evaluate() API) + np.random.seed(42) + signal_length = 100 + window_size = 10 + t = np.linspace(0, 2, signal_length) + clean_signal = np.sin(2 * np.pi * 0.5 * t) + noisy_signal = clean_signal + np.random.normal(0, 0.3, signal_length) + + # Quick test with small signal + try: + result = run_with_timeout( + program.run_signal_processing, + kwargs={"noisy_signal": noisy_signal, "window_size": window_size}, + timeout_seconds=5, + ) + + if isinstance(result, dict) and "filtered_signal" in result: + filtered_signal = result["filtered_signal"] + if len(filtered_signal) > 0: + # Quick quality check + composite_score = 0.5 # Baseline score for working programs + + # Bonus for reasonable output length + expected_length = signal_length - window_size + 1 + if len(filtered_signal) == expected_length: + composite_score += 0.2 + + return { + "runs_successfully": 1.0, + "composite_score": composite_score, + "output_length": len(filtered_signal), + } + else: + return {"runs_successfully": 0.5, "error": "Empty filtered signal"} + else: + return {"runs_successfully": 0.3, "error": "Invalid result format"} + + except TimeoutError: + return {"runs_successfully": 0.0, "error": "Timeout in stage 1"} + except Exception as e: + return {"runs_successfully": 0.0, "error": f"Stage 1 error: {str(e)}"} + + except Exception as e: + return {"runs_successfully": 0.0, "error": f"Stage 1 failed: {str(e)}"} + + +def evaluate_stage2(program_path): + """ + Stage 2 evaluation: Full evaluation with all test signals + """ + return evaluate(program_path) + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+ from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/signal_processing/evaluator/requirements.txt b/benchmarks/math/signal_processing/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e8a3b7abecc7bf6cb941f155e91fb39a40afd6e --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/requirements.txt @@ -0,0 +1,4 @@ +# Requirements for Real-Time Signal Processing Example +numpy>=1.21.0 +scipy>=1.7.0 +PyWavelets diff --git a/benchmarks/math/signal_processing/evaluator/wrapper.py b/benchmarks/math/signal_processing/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/signal_processing/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. 
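+    # (bool is a subclass of int in Python, so booleans are matched explicitly
+    # first; True/False become 1.0/0.0 in metrics.)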
+ metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/signal_processing/initial_program.py b/benchmarks/math/signal_processing/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..b8fef3b2e848629915b1f76feb0d494c3309b60f --- /dev/null +++ b/benchmarks/math/signal_processing/initial_program.py @@ -0,0 +1,207 @@ +# EVOLVE-BLOCK-START +""" +Real-Time Adaptive Signal Processing Algorithm for Non-Stationary Time Series + +This algorithm implements a sliding window approach to filter volatile, non-stationary +time series data while minimizing noise and preserving signal dynamics. +""" +import numpy as np + + +def adaptive_filter(x, window_size=20): + """ + Adaptive signal processing algorithm using sliding window approach. + + Args: + x: Input signal (1D array of real-valued samples) + window_size: Size of the sliding window (W samples) + + Returns: + y: Filtered output signal with length = len(x) - window_size + 1 + """ + if len(x) < window_size: + raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") + + # Initialize output array + output_length = len(x) - window_size + 1 + y = np.zeros(output_length) + + # Simple moving average as baseline + for i in range(output_length): + window = x[i : i + window_size] + + # Basic moving average filter + y[i] = np.mean(window) + + return y + + +def enhanced_filter_with_trend_preservation(x, window_size=20): + """ + Enhanced version with trend preservation using weighted moving average. + + Args: + x: Input signal (1D array of real-valued samples) + window_size: Size of the sliding window + + Returns: + y: Filtered output signal + """ + if len(x) < window_size: + raise ValueError(f"Input signal length ({len(x)}) must be >= window_size ({window_size})") + + output_length = len(x) - window_size + 1 + y = np.zeros(output_length) + + # Create weights that emphasize recent samples + weights = np.exp(np.linspace(-2, 0, window_size)) + weights = weights / np.sum(weights) + + for i in range(output_length): + window = x[i : i + window_size] + + # Weighted moving average with exponential weights + y[i] = np.sum(window * weights) + + return y + + +def process_signal(input_signal, window_size=20, algorithm_type="enhanced"): + """ + Main signal processing function that applies the selected algorithm. + + Args: + input_signal: Input time series data + window_size: Window size for processing + algorithm_type: Type of algorithm to use ("basic" or "enhanced") + + Returns: + Filtered signal + """ + if algorithm_type == "enhanced": + return enhanced_filter_with_trend_preservation(input_signal, window_size) + else: + return adaptive_filter(input_signal, window_size) + + +# EVOLVE-BLOCK-END + + +def generate_test_signal(length=1000, noise_level=0.3, seed=42): + """ + Generate synthetic test signal with known characteristics. 
+ + Args: + length: Length of the signal + noise_level: Standard deviation of noise to add + seed: Random seed for reproducibility + + Returns: + Tuple of (noisy_signal, clean_signal) + """ + np.random.seed(seed) + t = np.linspace(0, 10, length) + + # Create a complex signal with multiple components + clean_signal = ( + 2 * np.sin(2 * np.pi * 0.5 * t) # Low frequency component + + 1.5 * np.sin(2 * np.pi * 2 * t) # Medium frequency component + + 0.5 * np.sin(2 * np.pi * 5 * t) # Higher frequency component + + 0.8 * np.exp(-t / 5) * np.sin(2 * np.pi * 1.5 * t) # Decaying oscillation + ) + + # Add non-stationary behavior + trend = 0.1 * t * np.sin(0.2 * t) # Slowly varying trend + clean_signal += trend + + # Add random walk component for non-stationarity + random_walk = np.cumsum(np.random.randn(length) * 0.05) + clean_signal += random_walk + + # Add noise + noise = np.random.normal(0, noise_level, length) + noisy_signal = clean_signal + noise + + return noisy_signal, clean_signal + + +def run_signal_processing(noisy_signal=None, signal_length=1000, noise_level=0.3, window_size=20): + """ + Run the signal processing algorithm on a test signal. + + Args: + noisy_signal: Input signal to filter (if provided, use this; otherwise generate) + signal_length: Length if generating signal (for backward compatibility) + noise_level: Noise level if generating signal (for backward compatibility) + window_size: Window size for processing + + Returns: + Dictionary containing results and metrics + """ + # Use provided signal or generate test signal (for backward compatibility) + if noisy_signal is not None: + # Filter the provided signal + filtered_signal = process_signal(noisy_signal, window_size, "enhanced") + clean_signal = None # Not available when using provided signal + else: + # Generate test signal (for __main__ and backward compatibility) + noisy_signal, clean_signal = generate_test_signal(signal_length, noise_level) + filtered_signal = process_signal(noisy_signal, window_size, "enhanced") + + # Calculate basic metrics (only if we have clean_signal from generation) + if len(filtered_signal) > 0 and clean_signal is not None: + # Align signals for comparison (account for processing delay) + delay = window_size - 1 + aligned_clean = clean_signal[delay:] + aligned_noisy = noisy_signal[delay:] + + # Ensure same length + min_length = min(len(filtered_signal), len(aligned_clean)) + filtered_signal = filtered_signal[:min_length] + aligned_clean = aligned_clean[:min_length] + aligned_noisy = aligned_noisy[:min_length] + + # Calculate correlation with clean signal + correlation = np.corrcoef(filtered_signal, aligned_clean)[0, 1] if min_length > 1 else 0 + + # Calculate noise reduction + noise_before = np.var(aligned_noisy - aligned_clean) + noise_after = np.var(filtered_signal - aligned_clean) + noise_reduction = (noise_before - noise_after) / noise_before if noise_before > 0 else 0 + + return { + "filtered_signal": filtered_signal, + "clean_signal": aligned_clean, + "noisy_signal": aligned_noisy, + "correlation": correlation, + "noise_reduction": noise_reduction, + "signal_length": min_length, + } + elif len(filtered_signal) > 0: + # When using provided signal (no clean_signal available), just return filtered signal + return { + "filtered_signal": filtered_signal, + "clean_signal": None, + "noisy_signal": None, + "correlation": 0, + "noise_reduction": 0, + "signal_length": len(filtered_signal), + } + else: + return { + "filtered_signal": [], + "clean_signal": [], + "noisy_signal": [], + "correlation": 0, + 
"noise_reduction": 0, + "signal_length": 0, + } + + +if __name__ == "__main__": + # Test the algorithm + results = run_signal_processing() + print("Signal processing completed!") + print(f"Correlation with clean signal: {results['correlation']:.3f}") + print(f"Noise reduction: {results['noise_reduction']:.3f}") + print(f"Processed signal length: {results['signal_length']}") diff --git a/benchmarks/math/sums_diffs_finite_sets/config.yaml b/benchmarks/math/sums_diffs_finite_sets/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72441284aae838b9cf735ccf640715eb05d62fce --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/config.yaml @@ -0,0 +1,41 @@ +# Math benchmark: sums_diffs_finite_sets +# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s +language: python +diff_based_generation: true +max_iterations: 100 +checkpoint_interval: 10 +max_solution_length: 60000 +llm: + api_base: https://api.openai.com/v1 + models: + - name: "gpt-5" + weight: 1.0 + max_tokens: 32000 + timeout: 600 +prompt: + system_message: | + SETTING: + You are an expert in number theory, combinatorial optimization, and AI-driven mathematical discovery. + Your task is to evolve and optimize a Python script to find a finite set of integers `U` that provides a new, world-record **lower bound** for the constant C₆. + + PROBLEM CONTEXT: + Target: Find a finite set `U` of non-negative integers (containing 0) that **maximizes** the objective function: + C6(U) = 1 + log(|U-U| / |U+U|) / log(2*max(U) + 1) + + This maximum value provides a tight lower bound for the constant C6. + + Current best known lower bound: C6 ≥ 1.158417281556896 + Goal: Find a set `U` that results in a C6 value greater than 1.158417281556896. + + PERFORMANCE METRICS: + - c6_bound: Bound found. + - combined_score: c6_bound/1.158417281556896 (The primary objective is to MAXIMIZE this value - a value > 1 means a new record). + - set_size: size of the set U. + - max_val: max value in the set U. + - eval_time: evaluation time of the main program. + + VALIDATION FRAMEWORK: + - The evaluation script re-computes the C6 value using standard NumPy set operations and verifies the constraints on `U`. +evaluator: + timeout: 600 + max_retries: 3 diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROGRAM="$1" +# MODE ($2) accepted but ignored — override this file to use train/test splits. + +python /benchmark/evaluator.py "$PROGRAM" diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..04deb415b541c38754118112e88f742e77598404 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/evaluator.py @@ -0,0 +1,97 @@ +# ===--------------------------------------------------------------------------------------===# +# +# This file implements the evaluator for the sums and differences of finite sets problem. 
+# +# ===--------------------------------------------------------------------------------------===# +# +# Some of the code in this file is adapted from: +# +# google-deepmind/alphaevolve_results: +# Licensed under the Apache License v2.0. +# +# ===--------------------------------------------------------------------------------------===# + +import sys +import os +from importlib import __import__ +import time +import numpy as np + +BENCHMARK = 1.158417281556896 + + +def verify_c6_solution(u_set: np.ndarray, c6_achieved: float): + """Verifies the C6 lower bound solution.""" + + if not isinstance(u_set, np.ndarray) or u_set.ndim != 1: + raise ValueError("Solution U must be a 1D numpy array of integers.") + + # Verify constraints + if 0 not in u_set: + raise ValueError("Set U must contain 0.") + if np.any(u_set < 0): + raise ValueError("Set U must contain non-negative integers.") + + # Re-calculate the C6 bound using NumPy + u_plus_u = np.unique(u_set[:, None] + u_set[None, :]) + u_minus_u = np.unique(u_set[:, None] - u_set[None, :]) + + size_U_plus_U = len(u_plus_u) + size_U_minus_U = len(u_minus_u) + max_U = np.max(u_set) + + ratio = size_U_minus_U / size_U_plus_U + log_ratio = np.log(ratio) + log_denom = np.log(2 * max_U + 1) + + computed_c6 = 1 + log_ratio / log_denom + + # Check for consistency + if not np.isclose(computed_c6, c6_achieved): + raise ValueError(f"C6 mismatch: reported {c6_achieved:.6f}, computed {computed_c6:.6f}") + + print(f"C6 lower bound achieved: {c6_achieved:.6f}") + print(f"Known best bound (AlphaEvolve): {BENCHMARK}") + + if c6_achieved > BENCHMARK: + print("Successfully found a new, better lower bound!") + else: + print("Result is not better than the known lower bounds.") + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + u_set, c6_bound = program.run() + end_time = time.time() + eval_time = end_time - start_time + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_c6_solution(u_set, c6_bound) + + return { + "c6_bound": float(c6_bound), + "combined_score": float(c6_bound) / BENCHMARK, + "set_size": len(u_set), + "max_val": int(np.max(u_set)), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt b/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py b/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. 
+ +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. + real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/sums_diffs_finite_sets/initial_program.py b/benchmarks/math/sums_diffs_finite_sets/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..d63997af4b28cd0838812006562c60d1cf710bed --- /dev/null +++ b/benchmarks/math/sums_diffs_finite_sets/initial_program.py @@ -0,0 +1,117 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +from dataclasses import dataclass +import numpy as np +import tqdm + + +@dataclass +class Hyperparameters: + max_integer: int = 250 + num_restarts: int = 5 + num_search_steps: int = 1000 + initial_temperature: float = 0.01 + + +class C6Searcher: + """ + Searches for a set U by running the search in pure Python for correctness. 
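+
+    The objective relies on jnp.unique, whose output shape is data-dependent,
+    so it is not jit-compiled and the annealing loop runs eagerly.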
+ """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.allowed_values = jnp.array((-1, 0, 1), dtype=jnp.int32) + + @staticmethod + def _objective_fn(u_mask: jnp.ndarray) -> jnp.ndarray: + """Calculates the C6 lower bound using jnp.unique""" + U = jnp.where(u_mask)[0] + + sums = U[:, None] + U[None, :] + diffs = U[:, None] - U[None, :] + + size_U_plus_U = jnp.unique(sums).shape[0] + size_U_minus_U = jnp.unique(diffs).shape[0] + max_U = jnp.max(U) + + # Handle the case where max_U is 0 to avoid log(1)=0 in denominator + if max_U == 0: + return -1.0 # Return a low value for trivial sets + + ratio = size_U_minus_U / size_U_plus_U + c6_bound = 1 + jnp.log(ratio) / jnp.log(2 * max_U + 1) + + return -c6_bound # Return negative for maximization + + def anneal_step(self, key, temp, current_mask, current_loss): + """Performs one step of Simulated Annealing (not JIT-compiled).""" + # Propose a random mutation + idx_to_flip = jax.random.randint(key, (), 1, len(current_mask)) + neighbor_mask = current_mask.at[idx_to_flip].set(1 - current_mask[idx_to_flip]) + + neighbor_loss = self._objective_fn(neighbor_mask) + delta_loss = neighbor_loss - current_loss + + # Metropolis acceptance criterion + should_accept = False + if delta_loss < 0: + should_accept = True + else: + accept_prob = jnp.exp(-delta_loss / temp) + if jax.random.uniform(key) < accept_prob: + should_accept = True + + if should_accept: + return neighbor_mask, neighbor_loss + else: + return current_mask, current_loss + + +def run(): + hypers = Hyperparameters() + main_key = jax.random.PRNGKey(42) + + best_loss = float("inf") + best_set_np = None + + for i in range(hypers.num_restarts): + print(f"\n{'='*20} Restart {i+1}/{hypers.num_restarts} {'='*20}") + restart_key, main_key = jax.random.split(main_key) + loss, u_set_np = run_single_trial(hypers, restart_key) + + if loss < best_loss: + print(f"New best C6 bound found: {-loss:.8f}") + best_loss = loss + best_set_np = u_set_np + + c6_bound = -best_loss + print(f"\nSearch complete. Best C6 lower bound found: {c6_bound:.8f}") + return best_set_np, c6_bound + + +def run_single_trial(hypers, key): + # Initialize a random sparse set, ensuring 0 is included + key, subkey = jax.random.split(key) + sparsity = 0.95 + u_mask = jax.random.bernoulli(subkey, p=(1 - sparsity), shape=(hypers.max_integer + 1,)) + u_mask = u_mask.at[0].set(True) + + searcher = C6Searcher(hypers) + current_loss = searcher._objective_fn(u_mask) + + print(f"Starting SA search. 
Initial C6 bound: {-current_loss:.6f}")
+
+    current_mask = u_mask
+    for step in tqdm.tqdm(range(hypers.num_search_steps), desc="Annealing Progress"):
+        key, subkey = jax.random.split(key)
+        current_temp = hypers.initial_temperature * (1 - step / hypers.num_search_steps)
+        current_mask, current_loss = searcher.anneal_step(
+            subkey, jnp.maximum(current_temp, 1e-6), current_mask, current_loss
+        )
+
+    final_set = np.where(current_mask)[0]
+    return current_loss, final_set
+
+
+# EVOLVE-BLOCK-END
diff --git a/benchmarks/math/third_autocorr_ineq/config.yaml b/benchmarks/math/third_autocorr_ineq/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6abd6d4eb665b10f22973c78496b7d64c5d90ce8
--- /dev/null
+++ b/benchmarks/math/third_autocorr_ineq/config.yaml
@@ -0,0 +1,66 @@
+# Math benchmark: third_autocorr_ineq
+# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s
+language: python
+diff_based_generation: true
+max_iterations: 100
+checkpoint_interval: 10
+max_solution_length: 60000
+llm:
+  api_base: https://api.openai.com/v1
+  models:
+    - name: "gpt-5"
+      weight: 1.0
+  max_tokens: 32000
+  timeout: 600
+prompt:
+  system_message: 'SETTING:
+
+    You are an expert in functional analysis, harmonic analysis, numerical optimization, and AI-driven mathematical discovery.
+
+    Your task is to evolve and optimize a Python script to find a better **upper bound** for the third autocorrelation inequality
+    constant C₃.
+
+
+    PROBLEM CONTEXT:
+
+    Target: Find a function f: R → R (which can take positive and negative values) that **minimizes** the constant C3 in the
+    inequality:
+
+    max_{-1/2≤t≤1/2} |f ★ f(t)| ≥ C3 (∫_{-1/4}^{1/4} f(x) dx)²
+
+
+    This is equivalent to minimizing the ratio: C3 = max |f ★ f| / (∫f)²
+
+
+    Previous best known bound: C3 ≤ 1.45810
+
+    Goal: Beat the AlphaEvolve upper bound of 1.4556427953745406.
+
+
+    Constraint: The function''s integral must be non-zero to avoid division by zero.
+
+
+    PERFORMANCE METRICS:
+
+    - c3: The C3 constant achieved by the discovered function.
+
+    - combined_score: 1.4556427953745406 / c3_achieved (a value > 1 means we beat the record) (PRIMARY OBJECTIVE - maximize
+    this by minimizing c3).
+
+    - loss: loss value returned by the loss function.
+
+    - n_points: number of points used in the discretization of the interval.
+
+    - eval_time: time taken to run the solution script.
+
+
+    VALIDATION FRAMEWORK:
+
+    - The evaluation script re-computes the C3 ratio using `numpy.convolve` and `numpy.abs` to verify the value from the optimizer.
+
+    - It checks that the function''s integral is not close to zero.
+
+    '
+evaluator:
+  timeout: 600
+  max_retries: 3
diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile b/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c
--- /dev/null
+++ b/benchmarks/math/third_autocorr_ineq/evaluator/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.12-slim
+WORKDIR /benchmark
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# wrapper.py provides backwards compatibility for old Python-based evaluators
+# that define evaluate(program_path) -> dict. Bridges them to the container
+# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+COPY . .
+RUN chmod +x evaluate.sh
+
+ENTRYPOINT ["./evaluate.sh"]
diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh b/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634
--- /dev/null
+++ b/benchmarks/math/third_autocorr_ineq/evaluator/evaluate.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROGRAM="$1"
+# MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+python /benchmark/evaluator.py "$PROGRAM"
diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py b/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6481d1e4b849d28e248dc3bf8615696568903abe
--- /dev/null
+++ b/benchmarks/math/third_autocorr_ineq/evaluator/evaluator.py
@@ -0,0 +1,91 @@
+# ===--------------------------------------------------------------------------------------===#
+#
+# This file implements the evaluator for the third autocorrelation inequality problem.
+#
+# ===--------------------------------------------------------------------------------------===#
+#
+# Some of the code in this file is adapted from:
+#
+# google-deepmind/alphaevolve_results:
+# Licensed under the Apache License v2.0.
+#
+# ===--------------------------------------------------------------------------------------===#
+
+import sys
+import os
+from importlib import __import__
+import time
+import numpy as np
+
+# Known bounds
+BENCHMARK = 1.4556427953745406
+# Note: this is a non-convex optimization problem with many local optima; the
+# bound a run reaches depends on initialization and on the search budget
+# (e.g. num_intervals and num_steps in the seed program).
+
+
+def verify_c3_solution(f_values: np.ndarray, c3_achieved: float, n_points: int):
+    """Verify the solution for the C3 UPPER BOUND optimization."""
+
+    if f_values.shape != (n_points,):
+        raise ValueError(f"Expected function values shape {(n_points,)}. 
Got {f_values.shape}.") + + # Recompute C3 using NumPy to verify + dx = 0.5 / n_points + + # Squared integral of f + integral_f_sq = (np.sum(f_values) * dx) ** 2 + + if integral_f_sq < 1e-9: + raise ValueError("Function integral is close to zero, ratio is unstable.") + + # Max absolute value of the scaled autoconvolution + conv = np.convolve(f_values, f_values, mode="full") + scaled_conv = conv * dx + max_abs_conv = np.max(np.abs(scaled_conv)) + + computed_c3 = max_abs_conv / integral_f_sq + + delta = abs(computed_c3 - c3_achieved) + if delta > 1e-3: + raise ValueError( + f"C3 mismatch: reported {c3_achieved:.6f}, computed {computed_c3:.6f}, delta: {delta:.6f}" + ) + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + start_time = time.time() + f_values, c3_achieved, loss, n_points = program.run() + end_time = time.time() + eval_time = end_time - start_time + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + verify_c3_solution(f_values, c3_achieved, n_points) + + return { + "c3": float(c3_achieved), + "combined_score": BENCHMARK / float(c3_achieved), + "loss": float(loss), + "n_points": int(n_points), + "eval_time": float(eval_time), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. + from wrapper import run + + run(evaluate) diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt b/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dee69521695f692e340afea5918ed74f057d6aa --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/requirements.txt @@ -0,0 +1,3 @@ +numpy +jax +optax \ No newline at end of file diff --git a/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py b/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/evaluator/wrapper.py @@ -0,0 +1,98 @@ +"""Backwards-compat wrapper for old Python-based evaluators. + +Old-style evaluators define ``evaluate(program_path) -> dict``. This module +bridges that interface to the container JSON protocol expected by +ContainerizedEvaluator. + +Usage — add this to the bottom of your evaluator.py:: + + if __name__ == "__main__": + from wrapper import run + run(evaluate) +""" + +import json +import sys +import traceback + + +def run(evaluate_fn): + """Call *evaluate_fn*, format the result as container-protocol JSON on stdout. + + * Reads ``sys.argv[1]`` as the program path. + * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints + don't contaminate the JSON output. + * Separates numeric metrics from non-numeric artifacts. + * Guarantees ``combined_score`` is always present in metrics. + """ + if len(sys.argv) < 2: + print("Usage: evaluator.py ", file=sys.stderr) + sys.exit(1) + + program_path = sys.argv[1] + + # Redirect stdout → stderr during evaluation so debug prints from + # the evaluator don't contaminate the JSON output on stdout. 
+ real_stdout = sys.stdout + sys.stdout = sys.stderr + try: + result = evaluate_fn(program_path) + except Exception as e: + sys.stdout = real_stdout + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": str(e), + "traceback": traceback.format_exc(), + }, + } + ) + ) + return + sys.stdout = real_stdout + + if not isinstance(result, dict): + print( + json.dumps( + { + "status": "error", + "combined_score": 0.0, + "metrics": {"combined_score": 0.0}, + "artifacts": { + "error": f"evaluate() returned {type(result).__name__}, expected dict" + }, + } + ) + ) + return + + # Separate numeric metrics from non-numeric artifacts. + metrics = {} + artifacts = {} + for k, v in result.items(): + if isinstance(v, bool): + metrics[k] = float(v) + elif isinstance(v, (int, float)): + metrics[k] = float(v) + elif isinstance(v, str): + artifacts[k] = v + elif isinstance(v, (list, dict)): + artifacts[k] = json.dumps(v) + + if "combined_score" not in metrics: + metrics["combined_score"] = 0.0 + + status = "error" if "error" in artifacts else "success" + output = { + "status": status, + "combined_score": metrics["combined_score"], + "metrics": metrics, + } + if artifacts: + output["artifacts"] = artifacts + + print(json.dumps(output)) diff --git a/benchmarks/math/third_autocorr_ineq/initial_program.py b/benchmarks/math/third_autocorr_ineq/initial_program.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8bbfb40b60285d3d9b6ea841fbfe1045735235 --- /dev/null +++ b/benchmarks/math/third_autocorr_ineq/initial_program.py @@ -0,0 +1,107 @@ +# EVOLVE-BLOCK-START +import jax +import jax.numpy as jnp +import optax +import numpy as np +from dataclasses import dataclass + + +@dataclass +class Hyperparameters: + """Hyperparameters for the optimization process.""" + + num_intervals: int = 400 + learning_rate: float = 0.005 + num_steps: int = 20000 + warmup_steps: int = 2000 + + +class C3Optimizer: + """ + Optimizes a function f (with positive and negative values) to find an + upper bound for the C3 constant. + """ + + def __init__(self, hypers: Hyperparameters): + self.hypers = hypers + self.domain_width = 0.5 + self.dx = self.domain_width / self.hypers.num_intervals + + def _objective_fn(self, f_values: jnp.ndarray) -> jnp.ndarray: + """ + Computes the C3 ratio. The goal is to minimize this value. + """ + # The squared integral of f. + integral_f = jnp.sum(f_values) * self.dx + eps = 1e-9 + integral_f_sq_safe = jnp.maximum(integral_f**2, eps) + + # The max of the absolute value of the autoconvolution. + N = self.hypers.num_intervals + padded_f = jnp.pad(f_values, (0, N)) + + fft_f = jnp.fft.fft(padded_f) + conv_f_f = jnp.fft.ifft(fft_f * fft_f).real + + # Scale the unscaled convolution sum by dx to approximate the integral. + scaled_conv_f_f = conv_f_f * self.dx + + # Take the maximum of the absolute value. + max_abs_conv = jnp.max(jnp.abs(scaled_conv_f_f)) + + c3_ratio = max_abs_conv / integral_f_sq_safe + + # We want to MINIMIZE the ratio. 
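+        # A note on why the FFT step above is valid (sketch): zero-padding f
+        # from length N to 2N makes the circular convolution computed by
+        # ifft(fft(f) * fft(f)) coincide with the linear autoconvolution
+        # np.convolve(f, f, mode="full"), up to one trailing zero entry, so
+        # this objective agrees with the evaluator's direct NumPy recomputation.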
+        return c3_ratio
+
+    def train_step(self, f_values: jnp.ndarray, opt_state: optax.OptState) -> tuple:
+        """Performs a single training step."""
+        loss, grads = jax.value_and_grad(self._objective_fn)(f_values)
+        updates, opt_state = self.optimizer.update(grads, opt_state, f_values)
+        f_values = optax.apply_updates(f_values, updates)
+        return f_values, opt_state, loss
+
+    def run_optimization(self):
+        """Sets up and runs the full optimization process."""
+        schedule = optax.warmup_cosine_decay_schedule(
+            init_value=0.0,
+            peak_value=self.hypers.learning_rate,
+            warmup_steps=self.hypers.warmup_steps,
+            decay_steps=self.hypers.num_steps - self.hypers.warmup_steps,
+            end_value=self.hypers.learning_rate * 1e-4,
+        )
+        self.optimizer = optax.adam(learning_rate=schedule)
+
+        key = jax.random.PRNGKey(42)
+        f_values = jax.random.normal(key, (self.hypers.num_intervals,))
+
+        opt_state = self.optimizer.init(f_values)
+        print(
+            f"Number of intervals (N): {self.hypers.num_intervals}, Steps: {self.hypers.num_steps}"
+        )
+        train_step_jit = jax.jit(self.train_step)
+
+        loss = jnp.inf
+        for step in range(self.hypers.num_steps):
+            f_values, opt_state, loss = train_step_jit(f_values, opt_state)
+            if step % 1000 == 0 or step == self.hypers.num_steps - 1:
+                print(f"Step {step:5d} | C3 ≈ {loss:.8f}")
+
+        final_c3 = loss
+        print(f"Final C3 upper bound found: {final_c3:.8f}")
+        return f_values, final_c3
+
+
+def run():
+    """Entry point for running the optimization."""
+    hypers = Hyperparameters()
+    optimizer = C3Optimizer(hypers)
+    optimized_f, final_c3_val = optimizer.run_optimization()
+
+    loss_val = final_c3_val
+    f_values_np = np.array(optimized_f)
+
+    return f_values_np, float(final_c3_val), float(loss_val), hypers.num_intervals
+
+
+# EVOLVE-BLOCK-END
diff --git a/benchmarks/math/uncertainty_ineq/config.yaml b/benchmarks/math/uncertainty_ineq/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..91a826b02896ef79c4d2e7d60e3b28ab8732a28b
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/config.yaml
@@ -0,0 +1,45 @@
+# Math benchmark: uncertainty_ineq
+# Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s
+language: python
+diff_based_generation: true
+max_iterations: 100
+checkpoint_interval: 10
+max_solution_length: 60000
+llm:
+  api_base: https://api.openai.com/v1
+  models:
+    - name: "gpt-5"
+      weight: 1.0
+  max_tokens: 32000
+  timeout: 600
+prompt:
+  system_message: |
+    SETTING:
+    You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery.
+    Your task is to evolve and optimize a Python script to find a better **upper bound** for the uncertainty inequality constant C₄.
+
+    PROBLEM CONTEXT:
+    Target: Find an even function f(x) that **minimizes** the product A(f)A(f̂), where A(f) is the largest positive root of f.
+    This minimal product provides a tight upper bound for the constant C₄.
+
+    Current best known upper bound: C₄ ≤ 0.3215872333529007
+    Goal: Find a set of coefficients for a test function that results in a C₄ value lower than 0.3215872333529007.
+
+    METHOD:
+    The test function is parameterized as f(x) = P(x)exp(-πx²), where P(x) is a linear combination of even Hermite polynomials: P(x) = c₀H₀(x) + c₁H₄(x) + c₂H₈(x) + ...
+    The problem simplifies to finding coefficients [c₀, c₁, c₂, ...] that minimize the largest positive root of P(x), subject to the constraint P(0) = 0.
+    The final C₄ bound is (r_max)²/(2π), where r_max is this minimal root; this normalization matches the evaluator's computation.
+
+    PERFORMANCE METRICS:
+    - c4_bound: the C₄ upper bound found by the algorithm.
+    - combined_score: 0.3215872333529007 / c4_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record).
+    - r_max: largest positive root found.
+    - coeffs: coefficients found in the optimization.
+    - eval_time: evaluation time of the script.
+
+    VALIDATION FRAMEWORK:
+    - The evaluation script reconstructs the polynomial from the discovered coefficients and uses exact SymPy root isolation (`sympy.real_roots`) to independently verify the largest positive root and the C4 bound.
+    - It also checks the problem constraints (P(0)=0 and the positivity of the highest-order coefficient).
+evaluator:
+  timeout: 600
+  max_retries: 3
diff --git a/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile b/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..dc521d98b2727a60647a195e4115f6fabc99d75c
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/evaluator/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.12-slim
+WORKDIR /benchmark
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# wrapper.py provides backwards compatibility for old Python-based evaluators
+# that define evaluate(program_path) -> dict. Bridges them to the container
+# JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+COPY . .
+RUN chmod +x evaluate.sh
+
+ENTRYPOINT ["./evaluate.sh"]
diff --git a/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh b/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..38de730d16fd916c64743f6d2d60af20ab1a8634
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/evaluator/evaluate.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROGRAM="$1"
+# MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+python /benchmark/evaluator.py "$PROGRAM"
diff --git a/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py b/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e7ea5dda4e60ee3184679d9e1fe726d73a2c9c3
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/evaluator/evaluator.py
@@ -0,0 +1,137 @@
+# ===--------------------------------------------------------------------------------------===#
+#
+# This file implements the evaluator for the uncertainty inequality problem.
+#
+# ===--------------------------------------------------------------------------------------===#
+#
+# Some of the code in this file is adapted from:
+#
+# google-deepmind/alphaevolve_results:
+#   Licensed under the Apache License v2.0.
+#
+# ===--------------------------------------------------------------------------------------===#
+
+import sys, os, time, numpy as np
+import sympy as sp
+
+BENCHMARK = 0.3215872333529007
+
+x = sp.symbols("x")
+
+
+def _hermite_4k_polys(m: int):
+    degrees = [4 * k for k in range(m)]
+    Hs = [sp.polys.orthopolys.hermite_poly(n=d, x=x, polys=False) for d in degrees]
+    return Hs, degrees
+
+
+def _construct_P_with_forced_zero(coeffs: np.ndarray) -> sp.Expr:
+    """
+    Given m input coeffs (c0..c_{m-1}), build the Hermite combo
+        c0*H0 + ... + c_{m-1}*H_{4(m-1)} + c_last*H_{4m}
+    where c_last is chosen so that P(0) = 0.
+    Also flip sign if limit at +inf is negative.
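+
+    Worked example (hypothetical smallest case, m = 1): coeffs = [1] gives
+    partial = H0 = 1, so c_last = -H0(0)/H4(0) = -1/12 and P = H0 - H4/12;
+    the limit at +inf is then negative, so the sign flip returns
+    P = H4/12 - 1, which satisfies P(0) = 0.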
+ """ + m = len(coeffs) + Hs, _ = _hermite_4k_polys(m + 1) # include the (m)-th term for solving P(0)=0 + rc = [sp.Rational(c) for c in coeffs] + + partial = sum(rc[i] * Hs[i] for i in range(m)) + a = Hs[m].subs(x, 0) + b = -partial.subs(x, 0) + c_last = sp.Rational(b) / sp.Rational(a) + + P = partial + c_last * Hs[m] + + # Ensure positivity at +inf (all degrees are multiples of 4, so sign is well-defined) + if sp.limit(P, x, sp.oo) < 0: + P = -P + + return sp.simplify(P) + + +def _largest_positive_root_of_P_over_x2(P: sp.Expr) -> float: + # P is even and has P(0)=0; divide by x^2 and find the largest positive real root. + gq = sp.exquo(P, x**2) # exact division (should divide cleanly if multiplicity >= 2) + roots = sp.real_roots(gq, x) + if not roots: + raise ValueError("No real roots for P(x)/x^2.") + + # Validate sign change around each candidate + best = None + for r in roots: + r_approx = r.eval_rational(n=200) + eps = sp.Rational(1, 10**198) + left = gq.subs(x, r_approx - eps) + right = gq.subs(x, r_approx + eps) + if (left > 0 and right < 0) or (left < 0 and right > 0): + if best is None or r_approx > best: + best = r_approx + + if best is None: + raise ValueError("No root with a verified sign change for P(x)/x^2.") + return float(best) + + +def compute_c4_and_rmax(input_coeffs: np.ndarray): + P = _construct_P_with_forced_zero(input_coeffs) + # Quick sanity checks + assert P.subs(x, 0) == 0, "P(0) != 0 after forcing." + assert sp.limit(P, x, sp.oo) > 0, "Limit at +inf is not positive." + + rmax = _largest_positive_root_of_P_over_x2(P) + c4 = (rmax**2) / (2.0 * np.pi) + return c4, rmax + + +def verify_c4_solution_strict( + user_coeffs: np.ndarray, reported_c4: float, reported_rmax: float, atol=1e-9, rtol=1e-9 +): + c4, rmax = compute_c4_and_rmax(np.asarray(user_coeffs, dtype=float)) + + if not np.isclose(c4, reported_c4, rtol=rtol, atol=atol): + raise ValueError(f"C4 mismatch: reported {reported_c4:.12f}, recomputed {c4:.12f}") + + if not np.isclose(rmax, reported_rmax, rtol=rtol, atol=atol): + raise ValueError(f"r_max mismatch: reported {reported_rmax:.12f}, recomputed {rmax:.12f}") + + return c4, rmax + + +def evaluate(program_path: str): + try: + abs_program_path = os.path.abspath(program_path) + program_dir = os.path.dirname(abs_program_path) + module_name = os.path.splitext(os.path.basename(program_path))[0] + + try: + sys.path.insert(0, program_dir) + program = __import__(module_name) + t0 = time.time() + coeffs, c4_bound, r_max = program.run() + t1 = time.time() + finally: + if program_dir in sys.path: + sys.path.remove(program_dir) + + coeffs = np.asarray(coeffs, dtype=float) + + c4, rmax = verify_c4_solution_strict(coeffs, float(c4_bound), float(r_max)) + + return { + "c4_bound": float(c4), + "combined_score": float(BENCHMARK / c4), + "r_max": float(rmax), + "coeffs": coeffs.tolist(), + "eval_time": float(t1 - t0), + } + except Exception as e: + return {"combined_score": 0.0, "error": str(e)} + + +if __name__ == "__main__": + # Backwards-compat: bridges old evaluate() -> dict to the container JSON + # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py. 
+    from wrapper import run
+
+    run(evaluate)
diff --git a/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt b/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..48d8952e69a26a1327dd3fecc28f7b45d92bb41f
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/evaluator/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+sympy
+scipy
+tqdm
\ No newline at end of file
diff --git a/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py b/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..1813ec713bae8b5a79288748e93c4fdc02a2d303
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/evaluator/wrapper.py
@@ -0,0 +1,98 @@
+"""Backwards-compat wrapper for old Python-based evaluators.
+
+Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+bridges that interface to the container JSON protocol expected by
+ContainerizedEvaluator.
+
+Usage — add this to the bottom of your evaluator.py::
+
+    if __name__ == "__main__":
+        from wrapper import run
+        run(evaluate)
+"""
+
+import json
+import sys
+import traceback
+
+
+def run(evaluate_fn):
+    """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+    * Reads ``sys.argv[1]`` as the program path.
+    * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+      don't contaminate the JSON output.
+    * Separates numeric metrics from non-numeric artifacts.
+    * Guarantees ``combined_score`` is always present in metrics.
+    """
+    if len(sys.argv) < 2:
+        print("Usage: evaluator.py <program_path>", file=sys.stderr)
+        sys.exit(1)
+
+    program_path = sys.argv[1]
+
+    # Redirect stdout → stderr during evaluation so debug prints from
+    # the evaluator don't contaminate the JSON output on stdout.
+    real_stdout = sys.stdout
+    sys.stdout = sys.stderr
+    try:
+        result = evaluate_fn(program_path)
+    except Exception as e:
+        sys.stdout = real_stdout
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": str(e),
+                        "traceback": traceback.format_exc(),
+                    },
+                }
+            )
+        )
+        return
+    sys.stdout = real_stdout
+
+    if not isinstance(result, dict):
+        print(
+            json.dumps(
+                {
+                    "status": "error",
+                    "combined_score": 0.0,
+                    "metrics": {"combined_score": 0.0},
+                    "artifacts": {
+                        "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                    },
+                }
+            )
+        )
+        return
+
+    # Separate numeric metrics from non-numeric artifacts.
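+    # For example, {"c4_bound": 0.32, "coeffs": [1, 2], "error": "bad"} splits into
+    # metrics={"c4_bound": 0.32, "combined_score": 0.0} (the score is back-filled
+    # below) and artifacts={"coeffs": "[1, 2]", "error": "bad"}, giving status "error".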
+    metrics = {}
+    artifacts = {}
+    for k, v in result.items():
+        if isinstance(v, bool):
+            metrics[k] = float(v)
+        elif isinstance(v, (int, float)):
+            metrics[k] = float(v)
+        elif isinstance(v, str):
+            artifacts[k] = v
+        elif isinstance(v, (list, dict)):
+            artifacts[k] = json.dumps(v)
+
+    if "combined_score" not in metrics:
+        metrics["combined_score"] = 0.0
+
+    status = "error" if "error" in artifacts else "success"
+    output = {
+        "status": status,
+        "combined_score": metrics["combined_score"],
+        "metrics": metrics,
+    }
+    if artifacts:
+        output["artifacts"] = artifacts
+
+    print(json.dumps(output))
diff --git a/benchmarks/math/uncertainty_ineq/initial_program.py b/benchmarks/math/uncertainty_ineq/initial_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da75dd3ed271fdd6c55306081cd3ad83dd7fc83
--- /dev/null
+++ b/benchmarks/math/uncertainty_ineq/initial_program.py
@@ -0,0 +1,211 @@
+# Disable progress bar for cleaner output logs
+import os
+
+os.environ["TQDM_DISABLE"] = "1"
+
+# EVOLVE-BLOCK-START
+import jax
+import jax.numpy as jnp
+import optax
+import numpy as np
+from dataclasses import dataclass
+from scipy.special import hermite
+import tqdm
+
+
+@dataclass
+class Hyperparameters:
+    learning_rate: float = 0.001
+    num_steps: int = 100000
+    num_restarts: int = 20
+    num_hermite_coeffs: int = 4  # uses H0, H4, H8, H12
+
+
+class UncertaintyOptimizer:
+    """
+    Finds coefficients for a generalized Hermite polynomial P(x) that minimize
+    the largest positive root, providing an upper bound for C4.
+    """
+
+    def __init__(self, hypers: Hyperparameters):
+        self.hypers = hypers
+        self.degrees = [4 * k for k in range(hypers.num_hermite_coeffs)]
+        max_degree = self.degrees[-1]
+        hermite_polys = [hermite(d) for d in self.degrees]
+
+        basis = []
+        for poly in hermite_polys:
+            pad_amount = max_degree - poly.order
+            basis.append(jnp.array(np.pad(poly.coef, (pad_amount, 0))))
+        self.hermite_basis = jnp.stack(basis)
+
+        self.H_vals_at_zero = jnp.array([p(0) for p in hermite_polys])
+        self.x_grid = jnp.linspace(0.0, 10.0, 3000)
+        self.optimizer = optax.adam(self.hypers.learning_rate)
+
+    @staticmethod
+    def _objective_fn(params: jnp.ndarray, hermite_basis, H_vals_at_zero, x_grid):
+        """Penalize negative values of P(x) on [0, Xmax]; mild weighting to emphasize larger x."""
+        c_others, log_c_last = params[:-1], params[-1]
+        c_last = jnp.exp(log_c_last)
+
+        # Enforce P(0) = 0
+        c0 = (
+            -(jnp.sum(c_others * H_vals_at_zero[1:-1]) + c_last * H_vals_at_zero[-1])
+            / H_vals_at_zero[0]
+        )
+        hermite_coeffs = jnp.concatenate([jnp.array([c0]), c_others, jnp.array([c_last])])
+
+        poly_coeffs_std = jnp.sum(hermite_coeffs[:, None] * hermite_basis, axis=0)
+        p_values = jnp.polyval(poly_coeffs_std, x_grid)
+
+        # Slightly increasing weight toward the right end of the interval
+        weights = 1.0 + (x_grid / (x_grid[-1] + 1e-12))
+        loss = jnp.sum(weights * jax.nn.relu(-p_values))
+        return loss
+
+    @staticmethod
+    def train_step(
+        params: jnp.ndarray,
+        opt_state: optax.OptState,
+        optimizer,
+        hermite_basis,
+        H_vals_at_zero,
+        x_grid,
+    ):
+        loss, grads = jax.value_and_grad(UncertaintyOptimizer._objective_fn)(
+            params, hermite_basis, H_vals_at_zero, x_grid
+        )
+        updates, opt_state = optimizer.update(grads, opt_state, params)
+        params = optax.apply_updates(params, updates)
+        return params, opt_state, loss
+
+
+def run_single_trial(optimizer: UncertaintyOptimizer, key: jax.random.PRNGKey):
+    """Runs one full optimization from a near-good starting point with small noise."""
+    num_params_to_opt = optimizer.hypers.num_hermite_coeffs - 1  # = 3 when using H0,H4,H8,H12
+    assert num_params_to_opt == 3, "This initialization assumes num_hermite_coeffs == 4."
+
+    base_c1 = -0.01158510802599293
+    base_c2 = -8.921606035407065e-05
+    base_log_c_last = np.log(1e-6)
+
+    base = jnp.array([base_c1, base_c2, base_log_c_last], dtype=jnp.float32)
+    noise = jax.random.normal(key, (num_params_to_opt,)) * 1e-3
+    params = base + noise
+
+    opt_state = optimizer.optimizer.init(params)
+    jit_train_step = jax.jit(UncertaintyOptimizer.train_step, static_argnums=(2,))
+
+    for _ in range(optimizer.hypers.num_steps):
+        params, opt_state, _ = jit_train_step(
+            params,
+            opt_state,
+            optimizer.optimizer,
+            optimizer.hermite_basis,
+            optimizer.H_vals_at_zero,
+            optimizer.x_grid,
+        )
+    return params
+
+
+def _build_P_from_hermite_coeffs(hermite_coeffs: np.ndarray, degrees: list[int]) -> np.poly1d:
+    """Build monomial-basis polynomial P from Hermite-basis coefficients."""
+    max_degree = degrees[-1]
+    hermite_polys = [hermite(d) for d in degrees]
+
+    P_poly_coeffs = np.zeros(max_degree + 1)
+    for i, c in enumerate(hermite_coeffs):
+        poly = hermite_polys[i]
+        pad_amount = max_degree - poly.order
+        P_poly_coeffs[pad_amount:] += c * poly.coef
+
+    if P_poly_coeffs[0] < 0:
+        P_poly_coeffs = -P_poly_coeffs
+        hermite_coeffs[:] = -hermite_coeffs
+    return np.poly1d(P_poly_coeffs)
+
+
+def _c4_from_hermite_coeffs(hermite_coeffs: np.ndarray, num_hermite_coeffs: int):
+    """Compute r_max and C4 from full Hermite coefficient vector."""
+    degrees = [4 * k for k in range(num_hermite_coeffs)]
+    P = _build_P_from_hermite_coeffs(hermite_coeffs.copy(), degrees)
+
+    # Divide by x^2
+    Q, R = np.polydiv(P, np.poly1d([1.0, 0.0, 0.0]))
+    if np.max(np.abs(R.c)) > 1e-10:
+        return None, None
+
+    roots = Q.r
+    real_pos = roots[(np.isreal(roots)) & (roots.real > 0)].real
+    if real_pos.size == 0:
+        return None, None
+
+    # Tiny sign-change check around candidates
+    r_candidates = np.sort(real_pos)
+    r_max = None
+    for r in r_candidates:
+        eps = 1e-10 * max(1.0, abs(r))
+        left = np.polyval(Q, r - eps)
+        right = np.polyval(Q, r + eps)
+        if left * right < 0:
+            r_max = float(r)
+    if r_max is None:
+        r_max = float(r_candidates[-1])
+
+    c4 = (r_max**2) / (2 * np.pi)
+    return c4, r_max
+
+
+def get_c4_from_params(params: np.ndarray, hypers: Hyperparameters):
+    """Calculates the precise C4 bound from a final set of parameters."""
+    c_others, log_c_last = params[:-1], params[-1]
+    c_last = np.exp(log_c_last)
+
+    degrees = [4 * k for k in range(hypers.num_hermite_coeffs)]
+    hermite_polys = [hermite(d) for d in degrees]
+    H_vals_at_zero = np.array([p(0) for p in hermite_polys])
+
+    # Enforce P(0) = 0
+    c0 = (
+        -(np.sum(c_others * H_vals_at_zero[1:-1]) + c_last * H_vals_at_zero[-1]) / H_vals_at_zero[0]
+    )
+    hermite_coeffs = np.concatenate([[c0], np.array(c_others), [c_last]])
+
+    c4, rmax = _c4_from_hermite_coeffs(hermite_coeffs, hypers.num_hermite_coeffs)
+    if c4 is None:
+        return None, None, None
+    return hermite_coeffs, c4, rmax
+
+
+def run():
+    hypers = Hyperparameters()
+    optimizer = UncertaintyOptimizer(hypers)
+    main_key = jax.random.PRNGKey(42)
+    best_c4_bound = float("inf")
+    best_coeffs, best_r_max = None, None
+
+    print(f"Running {hypers.num_restarts} trials to find the best C4 upper bound...")
+    for _ in tqdm.tqdm(range(hypers.num_restarts), desc="Searching", disable=True):
+        main_key, restart_key = jax.random.split(main_key)
+        final_params = run_single_trial(optimizer, restart_key)
+
+        coeffs, c4_bound, r_max = get_c4_from_params(np.array(final_params), hypers)
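+        # Keep the smallest (best) C4 upper bound seen across restarts.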
+        if c4_bound is not None and c4_bound < best_c4_bound:
+            best_c4_bound = c4_bound
+            best_coeffs = coeffs
+            best_r_max = r_max
+
+    if best_coeffs is None:
+        raise RuntimeError("Failed to find a valid solution in any restart.")
+
+    print("\nSearch complete.")
+    print(f"Best Hermite coeffs: {best_coeffs}")
+    print(f"Best largest positive root r_max: {best_r_max:.8f}")
+    print(f"Resulting best C4 upper bound: {best_c4_bound:.8f}")
+
+    return best_coeffs, best_c4_bound, best_r_max
+
+
+# EVOLVE-BLOCK-END
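Both math benchmarks above share one scoring convention: `combined_score = BENCHMARK / value_found`, so any score above 1.0 beats the current record. A minimal sanity-check sketch of that convention and of the C₄/r_max relation (constants copied from the evaluators above; the helper name is illustrative):

```python
import numpy as np

# Best known bounds, copied from the two evaluators above.
C3_BENCHMARK = 1.4556427953745406  # third autocorrelation inequality
C4_BENCHMARK = 0.3215872333529007  # uncertainty inequality

def combined_score(benchmark: float, achieved: float) -> float:
    """Scoring convention shared by both evaluators: > 1.0 beats the record."""
    return benchmark / achieved

# Matching the record scores exactly 1.0; a slightly smaller (better)
# bound scores above 1.0.
assert np.isclose(combined_score(C3_BENCHMARK, C3_BENCHMARK), 1.0)
assert combined_score(C4_BENCHMARK, 0.999 * C4_BENCHMARK) > 1.0

# The uncertainty evaluator computes C4 = r_max**2 / (2*pi), so the current
# record corresponds to a largest positive root of roughly 1.4215:
print(np.sqrt(2 * np.pi * C4_BENCHMARK))
```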