JustinTX committed on
Commit b0e88cf · verified · 1 Parent(s): af83196

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +5 -0
  2. assets/architecture.png +3 -0
  3. assets/benchmarks.png +3 -0
  4. assets/comparison.png +3 -0
  5. assets/logo_vector.png +3 -0
  6. assets/scaling_comparison.png +3 -0
  7. benchmarks/ADRS/README.md +63 -0
  8. benchmarks/ADRS/eplb/config.yaml +37 -0
  9. benchmarks/ADRS/llm_sql/README.md +56 -0
  10. benchmarks/ADRS/llm_sql/initial_program.py +365 -0
  11. benchmarks/ADRS/prism/config.yaml +24 -0
  12. benchmarks/ADRS/prism/evaluator/evaluate.sh +7 -0
  13. benchmarks/ADRS/prism/evaluator/evaluator.py +259 -0
  14. benchmarks/ADRS/prism/initial_program.py +75 -0
  15. benchmarks/ADRS/prism/initial_program_naive.py +30 -0
  16. benchmarks/arc_benchmark/README.md +108 -0
  17. benchmarks/arc_benchmark/config.yaml +51 -0
  18. benchmarks/arc_benchmark/convert_arc_agi2_data.py +63 -0
  19. benchmarks/arc_benchmark/evaluator/Dockerfile +13 -0
  20. benchmarks/arc_benchmark/evaluator/evaluate.sh +7 -0
  21. benchmarks/arc_benchmark/evaluator/evaluator.py +407 -0
  22. benchmarks/arc_benchmark/evaluator/requirements.txt +1 -0
  23. benchmarks/arc_benchmark/evaluator/wrapper.py +98 -0
  24. benchmarks/arc_benchmark/generate_config.py +101 -0
  25. benchmarks/arc_benchmark/initial_program.py +42 -0
  26. benchmarks/arc_benchmark/post_discovery_eval.py +157 -0
  27. benchmarks/frontier-cs-eval/README.md +72 -0
  28. benchmarks/frontier-cs-eval/analyze_results.py +105 -0
  29. benchmarks/frontier-cs-eval/combine_results.py +66 -0
  30. benchmarks/frontier-cs-eval/config.yaml +57 -0
  31. benchmarks/frontier-cs-eval/evaluator.py +174 -0
  32. benchmarks/frontier-cs-eval/initial_program.cpp +6 -0
  33. benchmarks/frontier-cs-eval/run_all_frontiercs.py +70 -0
  34. benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py +404 -0
  35. benchmarks/gpu_mode/mla_decode/config.yaml +355 -0
  36. benchmarks/gpu_mode/mla_decode/initial_program.py +245 -0
  37. benchmarks/gpu_mode/mla_decode/reference.py +520 -0
  38. benchmarks/gpu_mode/mla_decode/requirements.txt +2 -0
  39. benchmarks/gpu_mode/trimul/initial_program.py +84 -0
  40. benchmarks/image_gen/README.md +40 -0
  41. benchmarks/image_gen/sky_festival/evaluator.py +220 -0
  42. benchmarks/math/README.md +43 -0
  43. benchmarks/math/circle_packing/README.md +38 -0
  44. benchmarks/math/circle_packing/codebase/reference/hex_grid.py +43 -0
  45. benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py +94 -0
  46. benchmarks/math/circle_packing/codebase/reference/packing_strategies.md +45 -0
  47. benchmarks/math/circle_packing/config.yaml +54 -0
  48. benchmarks/math/circle_packing/evaluator.py +338 -0
  49. benchmarks/math/circle_packing/evaluator/Dockerfile +11 -0
  50. benchmarks/math/circle_packing/evaluator/evaluate.sh +8 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/logo_vector.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/benchmarks.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/scaling_comparison.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/architecture.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/comparison.png filter=lfs diff=lfs merge=lfs -text
assets/architecture.png ADDED

Git LFS Details

  • SHA256: 3b10c6bfb1734211abab7fa2e53b36931428d842ade3c96cbef255543b3889d8
  • Pointer size: 131 Bytes
  • Size of remote file: 278 kB
assets/benchmarks.png ADDED

Git LFS Details

  • SHA256: 42a69cb4c8119b79901ecfcdf93088e932643d6e0890d3c984dead40c407dc5b
  • Pointer size: 131 Bytes
  • Size of remote file: 758 kB
assets/comparison.png ADDED

Git LFS Details

  • SHA256: 8d68074ff5106764b1328b23ef5e949332aab3541172f8d91e2580d6f168e184
  • Pointer size: 131 Bytes
  • Size of remote file: 399 kB
assets/logo_vector.png ADDED

Git LFS Details

  • SHA256: d74ce6a1024e519a5afc85706133e31bafeb06b48b603a11284845b549cb586e
  • Pointer size: 131 Bytes
  • Size of remote file: 891 kB
assets/scaling_comparison.png ADDED

Git LFS Details

  • SHA256: d2aa00d9f59b5e14fc10d2569b872632fb992ab61fcfbba2ae946bef9deb22d8
  • Pointer size: 131 Bytes
  • Size of remote file: 297 kB
benchmarks/ADRS/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # ADRS: AI-Driven Research for Systems
2
+
3
+ This directory contains the systems optimization benchmarks from the **AI-Driven Research for Systems (ADRS)** initiative at UC Berkeley.
4
+
5
+ ADRS investigates how AI — large language models, evolutionary algorithms, and multi-agent architectures — can autonomously design, optimize, and evaluate computer systems. Instead of treating systems research as a purely manual process, ADRS frames it as a closed-loop optimization problem: propose candidate algorithms, evaluate them against system-level objectives, analyze failure modes, adapt the search strategy, and iterate.
6
+
7
+ Each benchmark below defines a concrete systems task with a provided evaluator, initial program, and configuration. Solutions are evolved using SkyDiscover's evolutionary search loop.
8
+
9
+ ## Benchmarks
10
+
11
+ ### Cloudcast — Multi-Cloud Data Transfer
12
+
13
+ **Directory:** `cloudcast/`
14
+
15
+ Given a network of cloud regions with heterogeneous egress pricing and bandwidth, broadcast a dataset from a source region to multiple destinations at minimum total cost. The evolved algorithm must construct routing topologies (e.g., relay trees, Steiner-like structures) that exploit shared intermediate hops across transfers.
16
+
17
+ ### Expert Parallelism Load Balancer (EPLB)
18
+
19
+ **Directory:** `eplb/`
20
+
21
+ In Mixture-of-Experts (MoE) model inference, a small subset of experts handles each token, leading to GPU load imbalance when certain experts become disproportionately popular. This task evolves an algorithm that decides how many replicas each expert should have and how to assign them across GPUs, optimizing both load-balance quality and rebalancing runtime.
22
+
23
+ ### Model Placement (Prism)
24
+
25
+ **Directory:** `prism/`
26
+
27
+ Assign multiple LLM models to a fixed GPU cluster (80 GB per GPU) such that the worst-case KV-cache pressure ratio across GPUs is minimized. Lower pressure means more memory headroom for serving, improving throughput and stability under varying request loads.
28
+
29
+ ### LLM-SQL — Column Reordering for Prefix Caching
30
+
31
+ **Directory:** `llm_sql/`
32
+
33
+ When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data.
34
+
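+ As a toy illustration (a hypothetical serialization format, for intuition only), putting the shared column first gives consecutive rows a common prompt prefix that a cache can reuse:
+
+ ```python
+ rows = [
+     {"country": "US", "city": "SF"},
+     {"country": "US", "city": "LA"},
+ ]
+
+ def serialize(row, column_order):
+     # One prompt per row; shared leading columns become a shared, cacheable prefix
+     return "|".join(f"{col}={row[col]}" for col in column_order)
+
+ print(serialize(rows[0], ["country", "city"]))  # country=US|city=SF
+ print(serialize(rows[1], ["country", "city"]))  # country=US|city=LA
+ ```
+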
35
+ ### Transaction Scheduling (TXN)
36
+
37
+ **Directory:** `txn_scheduling/`
38
+
39
+ Given a set of database transactions with read/write dependencies on shared keys, find an execution ordering that minimizes the total makespan. The evolved scheduler must respect conflict constraints (read-write and write-write on the same key) while compressing the overall completion time.
40
+
41
+ ### Telemetry Repair
42
+
43
+ **Coming soon.** The Telemetry Repair benchmark is under active development and will be released in a future update.
44
+
45
+ ## Quick Start
46
+
47
+ Each benchmark directory contains:
48
+ - `initial_program.py` — the seed solution for evolution
49
+ - `evaluator.py` — the scoring function
50
+ - `config.yaml` — run configuration
51
+
52
+ Run any benchmark from the repo root:
53
+
54
+ ```bash
55
+ uv run skydiscover-run \
56
+ benchmarks/ADRS/cloudcast/initial_program.py \
57
+ benchmarks/ADRS/cloudcast/evaluator.py \
58
+ -c benchmarks/ADRS/cloudcast/config.yaml \
59
+ -s [your_algorithm] \
60
+ -i 100
61
+ ```
62
+
63
+ See the individual benchmark directories for task-specific setup instructions (e.g., dataset downloads, GPU dependencies).
benchmarks/ADRS/eplb/config.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # Expert Parallelism Load Balancer (EPLB) — MoE Expert Rearrangement
2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
3
+ # NOTE: Requires expert-load.json — see README.md for download instructions.
4
+ language: python
5
+ diff_based_generation: true
6
+ max_iterations: 100
7
+ checkpoint_interval: 5
8
+ max_solution_length: 60000
9
+
10
+ llm:
11
+ api_base: https://api.openai.com/v1
12
+ models:
13
+ - name: "gpt-5"
14
+ weight: 1.0
15
+ max_tokens: 32000
16
+ timeout: 600
17
+
18
+ prompt:
19
+ system_message: |-
20
+ You are an expert programmer specializing in optimization algorithms. Your task
21
+ is to improve the Mixture-of-Experts (MoE) Expert Parallelism Load Balancer
22
+ (EPLB) expert rearrangement algorithm.
23
+
24
+ The algorithm takes the load metrics recorded by the vLLM server and
25
+ rearranges the experts to balance the load. It can replicate some experts
26
+ to achieve better load balancing.
27
+
28
+ Your goal is two-fold:
29
+ 1. Improve the algorithm to achieve better load balancing, and
30
+ 2. Improve the algorithm to be more efficient, i.e. reduce the execution time
31
+ of the algorithm itself, since perfect load balancing is NP-hard.
32
+
33
+ The current algorithm is implemented in the `rebalance_experts` function.
34
+
35
+ evaluator:
36
+ timeout: 360
37
+
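For orientation, here is a minimal sketch of the kind of greedy heuristic such a rebalancer might start from. The name `rebalance_experts` comes from the system message above; the signature, the input shapes, and the omission of expert replication are assumptions for illustration only:

```python
def rebalance_experts(expert_loads: list[float], num_gpus: int) -> list[list[int]]:
    """Greedy sketch: assign experts, heaviest first, to the lightest GPU.

    expert_loads[i] is the observed load of expert i. Returns, for each GPU,
    the list of expert indices placed on it (replication omitted for brevity).
    """
    assignment: list[list[int]] = [[] for _ in range(num_gpus)]
    gpu_load = [0.0] * num_gpus
    for expert in sorted(range(len(expert_loads)), key=lambda i: -expert_loads[i]):
        target = min(range(num_gpus), key=gpu_load.__getitem__)  # lightest GPU
        assignment[target].append(expert)
        gpu_load[target] += expert_loads[expert]
    return assignment
```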
benchmarks/ADRS/llm_sql/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # LLM-SQL — Column Reordering for Prefix Caching
2
+
3
+ When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data.
4
+
5
+ ## Setup
6
+
7
+ 1. **Download the datasets** (~69 MB total):
8
+
9
+ ```bash
10
+ cd benchmarks/ADRS/llm_sql
11
+ bash download_dataset.sh
12
+ ```
13
+
14
+ This downloads 5 CSV datasets into `datasets/`:
15
+ - `movies.csv` — Rotten Tomatoes movie reviews (~9 MB)
16
+ - `beer.csv` — Beer review dataset (~2.5 MB)
17
+ - `BIRD.csv` — BIRD text-to-SQL dataset (~34 MB)
18
+ - `PDMX.csv` — PDMX metadata dataset (~7.4 MB)
19
+ - `products.csv` — Amazon product catalog (~16 MB)
20
+
21
+ 2. **Set your API key:**
22
+
23
+ ```bash
24
+ export OPENAI_API_KEY=...
25
+ ```
26
+
27
+ ## Run
28
+
29
+ From the repo root:
30
+
31
+ ```bash
32
+ uv run skydiscover-run \
33
+ benchmarks/ADRS/llm_sql/initial_program.py \
34
+ benchmarks/ADRS/llm_sql/evaluator.py \
35
+ -c benchmarks/ADRS/llm_sql/config.yaml \
36
+ -s [your_algorithm] \
37
+ -i 100
38
+ ```
39
+
40
+ ## Scoring
41
+
42
+ Combined score: `0.95 * average_hit_rate + 0.05 * (12 - min(12, avg_runtime)) / 12`
43
+
44
+ - **Hit rate** (95% weight): prefix-cache hit count normalized across 5 datasets
45
+ - **Runtime** (5% weight): wall-clock seconds for the reordering algorithm
46
+
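+ Written out as code, the formula is simply (a sketch; the names are illustrative, not the evaluator's actual identifiers):
+
+ ```python
+ def combined_score(average_hit_rate: float, avg_runtime: float) -> float:
+     # Runtime term is 1.0 at 0 s and decays linearly to 0.0 at >= 12 s
+     runtime_term = (12 - min(12.0, avg_runtime)) / 12
+     return 0.95 * average_hit_rate + 0.05 * runtime_term
+ ```
+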
47
+ ## Files
48
+
49
+ | File | Description |
50
+ |------|-------------|
51
+ | `initial_program.py` | Baseline `Evolved` class with `reorder()` method to evolve |
52
+ | `evaluator.py` | Scores programs on prefix hit rate and runtime across 5 datasets |
53
+ | `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) |
54
+ | `solver.py` | Base `Algorithm` class and greedy baseline |
55
+ | `utils.py` | Prefix hit count evaluation utilities |
56
+ | `download_dataset.sh` | Script to download required CSV datasets |
benchmarks/ADRS/llm_sql/initial_program.py ADDED
@@ -0,0 +1,365 @@
1
+ # EVOLVE-BLOCK-START
2
+ import pandas as pd
3
+ from solver import Algorithm
4
+ from typing import Tuple, List, Dict
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from functools import lru_cache
7
+ from collections import Counter
8
+ import networkx as nx
9
+
10
+
11
+ class Evolved(Algorithm):
12
+ """
13
+ GGR algorithm
14
+ """
15
+
16
+ def __init__(self, df: pd.DataFrame = None):
17
+ self.df = df
18
+
19
+ self.dep_graph = None # NOTE: not used, for one way dependency
20
+
21
+ self.num_rows = 0
22
+ self.num_cols = 0
23
+ self.column_stats = None
24
+ self.val_len = None
25
+ self.row_stop = None
26
+ self.col_stop = None
27
+ self.base = 2000
28
+
29
+ def find_max_group_value(self, df: pd.DataFrame, value_counts: Dict, early_stop: int = 0) -> str:
30
+ # NOTE: recalculate value counts and length for each value
31
+ value_counts = Counter(df.stack())
32
+ weighted_counts = {val: self.val_len[val] * (count - 1) for val, count in value_counts.items()} # if count > 1
33
+ if not weighted_counts:
34
+ return None
35
+ max_group_val, max_weighted_count = max(weighted_counts.items(), key=lambda x: x[1])
36
+ if max_weighted_count < early_stop:
37
+ return None
38
+ return max_group_val
39
+
40
+ def reorder_columns_for_value(self, row, value, column_names, grouped_rows_len: int = 1):
41
+ # cols_with_value will now use attribute access instead of indexing with row[]
42
+ cols_with_value = []
43
+ for idx, col in enumerate(column_names):
44
+ if hasattr(row, col) and getattr(row, col) == value:
45
+ cols_with_value.append(col)
46
+ elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) == value:
47
+ cols_with_value.append(col)
48
+ else:
49
+ attr_name = f"_{idx}"
50
+ if hasattr(row, attr_name) and getattr(row, attr_name) == value:
51
+ cols_with_value.append(attr_name)
52
+
53
+ if self.dep_graph is not None and grouped_rows_len > 1:
54
+ # NOTE: experimental
55
+ reordered_cols = []
56
+ for col in cols_with_value:
57
+ dependent_cols = self.get_dependent_columns(col)
58
+
59
+ # check if dependent columns are in row, and if column exists in row attributes
60
+ valid_dependent_cols = []
61
+ for idx, dep_col in enumerate(dependent_cols):
62
+ if hasattr(row, dep_col):
63
+ valid_dependent_cols.append(dep_col)
64
+ elif hasattr(row, dep_col.replace(" ", "_")):
65
+ valid_dependent_cols.append(dep_col)
66
+ else:
67
+ attr_name = f"_{idx}"
68
+ if hasattr(row, attr_name):
69
+ valid_dependent_cols.append(dep_col)
70
+
71
+ reordered_cols.extend([col] + valid_dependent_cols)
72
+ cols_without_value = [col for col in column_names if col not in reordered_cols]
73
+ reordered_cols.extend(cols_without_value)
74
+ assert len(reordered_cols) == len(
75
+ column_names
76
+ ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}"
77
+ return [getattr(row, col) for col in reordered_cols], cols_with_value
78
+ else:
79
+ cols_without_value = []
80
+ for idx, col in enumerate(column_names):
81
+ if hasattr(row, col) and getattr(row, col) != value:
82
+ cols_without_value.append(col)
83
+ elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) != value:
84
+ cols_without_value.append(col)
85
+ else:
86
+ # Handle some edge cases
87
+ attr_name = f"_{idx}"
88
+ if hasattr(row, attr_name) and getattr(row, attr_name) != value:
89
+ cols_without_value.append(attr_name)
90
+
91
+ reordered_cols = cols_with_value + cols_without_value
92
+ assert len(reordered_cols) == len(
93
+ column_names
94
+ ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}"
95
+ return [getattr(row, col) for col in reordered_cols], cols_with_value
96
+
97
+ def get_dependent_columns(self, col: str) -> List[str]:
98
+ if self.dep_graph is None or not self.dep_graph.has_node(col):
99
+ return []
100
+ return list(nx.descendants(self.dep_graph, col))
101
+
102
+ @lru_cache(maxsize=None)
103
+ def get_cached_dependent_columns(self, col: str) -> List[str]:
104
+ return self.get_dependent_columns(col)
105
+
106
+ def fixed_reorder(self, df: pd.DataFrame, row_sort: bool = True) -> Tuple[pd.DataFrame, List[List[str]]]:
107
+ num_rows, column_stats = self.calculate_col_stats(df, enable_index=True)
108
+ reordered_columns = [col for col, _, _, _ in column_stats]
109
+ reordered_df = df[reordered_columns]
110
+
111
+ assert reordered_df.shape == df.shape
112
+ column_orderings = [reordered_columns] * num_rows
113
+
114
+ if row_sort:
115
+ reordered_df = reordered_df.sort_values(by=reordered_columns, axis=0)
116
+
117
+ return reordered_df, column_orderings
118
+
119
+ def column_recursion(self, result_df, max_value, grouped_rows, row_stop, col_stop, early_stop):
120
+ cols_settled = []
121
+ with ThreadPoolExecutor() as executor:
122
+ futures = [
123
+ executor.submit(self.reorder_columns_for_value, row, max_value, grouped_rows.columns.tolist(), len(grouped_rows))
124
+ for row in grouped_rows.itertuples(index=False)
125
+ ]
126
+ for i, future in enumerate(as_completed(futures)):
127
+ reordered_row, cols_settled = future.result()
128
+ result_df.loc[i] = reordered_row
129
+
130
+ grouped_value_counts = Counter()
131
+
132
+ if not result_df.empty:
133
+ # Group by the first column
134
+ grouped_result_df = result_df.groupby(result_df.columns[0])
135
+ grouped_value_counts = Counter(grouped_rows.stack()) # this is still faster than updating from cached value counts
136
+
137
+ for _, group in grouped_result_df:
138
+ if group[group.columns[0]].iloc[0] != max_value:
139
+ continue
140
+
141
+ dependent_cols = self.get_cached_dependent_columns(group.columns[0])
142
+ length_of_settle_cols = len(cols_settled)
143
+
144
+ if dependent_cols:
145
+ assert length_of_settle_cols >= 1, f"Dependent columns should be no less than 1, but got {length_of_settle_cols}"
146
+
147
+ # test the first length_of_settle_cols columns, each column has nunique == 1
148
+ for col in group.columns[:length_of_settle_cols]:
149
+ assert group[col].nunique() == 1, f"Column {col} should have nunique == 1, but got {group[col].nunique()}"
150
+
151
+ # drop all the settled columns and reorder the rest
152
+ group_remainder = group.iloc[:, length_of_settle_cols:]
153
+ else:
154
+ group_remainder = group.iloc[:, 1:]
155
+
156
+ grouped_remainder_value_counts = Counter(group_remainder.stack())
157
+
158
+ reordered_group_remainder, _ = self.recursive_reorder(
159
+ group_remainder, grouped_remainder_value_counts, early_stop=early_stop, row_stop=row_stop, col_stop=col_stop + 1
160
+ )
161
+ # Update the group with the reordered columns
162
+ if dependent_cols:
163
+ group.iloc[:, length_of_settle_cols:] = reordered_group_remainder.values
164
+ else:
165
+ group.iloc[:, 1:] = reordered_group_remainder.values
166
+
167
+ result_df.update(group)
168
+ break
169
+
170
+ return result_df, grouped_value_counts
171
+
172
+ def recursive_reorder(
173
+ self,
174
+ df: pd.DataFrame,
175
+ value_counts: Dict,
176
+ early_stop: int = 0,
177
+ original_columns: List[str] = None,
178
+ row_stop: int = 0,
179
+ col_stop: int = 0,
180
+ ) -> Tuple[pd.DataFrame, List[List[str]]]:
181
+ if df.empty or len(df.columns) == 0 or len(df) == 0:
182
+ return df, []
183
+
184
+ if self.row_stop is not None and row_stop >= self.row_stop:
185
+ return self.fixed_reorder(df)
186
+
187
+ if self.col_stop is not None and col_stop >= self.col_stop:
188
+ return self.fixed_reorder(df)
189
+
190
+ if original_columns is None:
191
+ original_columns = df.columns.tolist()
192
+
193
+ # Find the max group value using updated counts
194
+ max_value = self.find_max_group_value(df, value_counts, early_stop=early_stop)
195
+ if max_value is None:
196
+ # If there is no max value, then fall back to fixed reorder
197
+ return self.fixed_reorder(df)
198
+
199
+ grouped_rows = df[df.isin([max_value]).any(axis=1)]
200
+ remaining_rows = df[~df.isin([max_value]).any(axis=1)]
201
+
202
+ # If there is no grouped rows, return the original DataFrame
203
+ if grouped_rows.empty:
204
+ return self.fixed_reorder(df)
205
+
206
+ result_df = pd.DataFrame(columns=df.columns)
207
+
208
+ reordered_remaining_rows = pd.DataFrame(columns=df.columns) # Initialize empty dataframe first
209
+
210
+ # Column Recursion
211
+ result_df, grouped_value_counts = self.column_recursion(result_df, max_value, grouped_rows, row_stop, col_stop, early_stop)
212
+
213
+ remaining_value_counts = value_counts - grouped_value_counts # Approach 1 - update remaining value counts with subtraction
214
+
215
+ # Row Recursion
216
+ reordered_remaining_rows, _ = self.recursive_reorder(
217
+ remaining_rows, remaining_value_counts, early_stop=early_stop, row_stop=row_stop + 1, col_stop=col_stop
218
+ )
219
+ old_column_names = result_df.columns.tolist()
220
+ result_cols_reset = result_df.reset_index(drop=True)
221
+ result_rows_reset = reordered_remaining_rows.reset_index(drop=True)
222
+ final_result_df = pd.DataFrame(result_cols_reset.values.tolist() + result_rows_reset.values.tolist())
223
+
224
+ if row_stop == 0 and col_stop == 0:
225
+ final_result_df.columns = old_column_names
226
+ final_result_df.columns = final_result_df.columns.tolist()[:-1] + ["original_index"]
227
+
228
+ return final_result_df, []
229
+
230
+ def recursive_split_and_reorder(self, df: pd.DataFrame, original_columns: List[str] = None, early_stop: int = 0):
231
+ """
232
+ Recursively split the DataFrame into halves until the size is <= self.base (2000 by default), then apply the recursive reorder function.
233
+ """
234
+ if len(df) <= self.base:
235
+ initial_value_counts = Counter(df.stack())
236
+ return self.recursive_reorder(df, initial_value_counts, early_stop, original_columns, row_stop=0, col_stop=0)[0]
237
+
238
+ mid_index = len(df) // 2
239
+ df_top_half = df.iloc[:mid_index]
240
+ df_bottom_half = df.iloc[mid_index:]
241
+
242
+ with ThreadPoolExecutor() as executor:
243
+ future_top = executor.submit(self.recursive_split_and_reorder, df_top_half, original_columns, early_stop)
244
+ future_bottom = executor.submit(self.recursive_split_and_reorder, df_bottom_half, original_columns, early_stop)
245
+
246
+ reordered_top_half = future_top.result()
247
+ reordered_bottom_half = future_bottom.result()
248
+
249
+ assert reordered_bottom_half.shape == df_bottom_half.shape
250
+ reordered_df = pd.concat([reordered_top_half, reordered_bottom_half], axis=0, ignore_index=True)
251
+
252
+ assert reordered_df.shape == df.shape
253
+
254
+ return reordered_df
255
+
256
+ @lru_cache(maxsize=None)
257
+ def calculate_length(self, value):
258
+ if isinstance(value, bool):
259
+ return 4**2
260
+ if isinstance(value, (int, float)):
261
+ return len(str(value)) ** 2
262
+ if isinstance(value, str):
263
+ return len(value) ** 2
264
+ return 0
265
+
266
+ def reorder(
267
+ self,
268
+ df: pd.DataFrame,
269
+ early_stop: int = 0,
270
+ row_stop: int = None,
271
+ col_stop: int = None,
272
+ col_merge: List[List[str]] = [],
273
+ one_way_dep: List[Tuple[str, str]] = [],
274
+ distinct_value_threshold: float = 0.8,
275
+ parallel: bool = True,
276
+ ) -> Tuple[pd.DataFrame, List[List[str]]]:
277
+ # Prepare
278
+ initial_df = df.copy()
279
+ if col_merge:
280
+ self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True)
281
+ reordered_columns = [col for col, _, _, _ in self.column_stats]
282
+ for col_to_merge in col_merge:
283
+ final_col_order = [col for col in reordered_columns if col in col_to_merge]
284
+ df = self.merging_columns(df, final_col_order, prepended=False)
285
+ self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True)
286
+ self.column_stats = {col: (num_groups, avg_len, score) for col, num_groups, avg_len, score in self.column_stats}
287
+
288
+ # One way dependency statistics [not used]
289
+ if one_way_dep is not None and len(one_way_dep) > 0:
290
+ self.dep_graph = nx.DiGraph()
291
+ for dep in one_way_dep:
292
+ col1 = [col for col in df.columns if dep[0] in col]
293
+ col2 = [col for col in df.columns if dep[1] in col]
294
+ assert len(col1) == 1, f"Expected one column to match {dep[0]}, but got {len(col1)}"
295
+ assert len(col2) == 1, f"Expected one column to match {dep[1]}, but got {len(col2)}"
296
+ col1 = col1[0]
297
+ col2 = col2[0]
298
+ self.dep_graph.add_edge(col1, col2)
299
+
300
+ # Discard too distinct columns by threshold [optional]
301
+ nunique_threshold = len(df) * distinct_value_threshold
302
+ columns_to_discard = [col for col in df.columns if df[col].nunique() > nunique_threshold]
303
+ columns_to_discard = sorted(columns_to_discard, key=lambda x: self.column_stats[x][2], reverse=True)
304
+ columns_to_recurse = [col for col in df.columns if col not in columns_to_discard]
305
+ df["original_index"] = range(len(df))
306
+ discarded_columns_df = df[columns_to_discard + ["original_index"]]
307
+ df_to_recurse = df[columns_to_recurse + ["original_index"]]
308
+ recurse_df = df_to_recurse
309
+
310
+ self.column_stats = {col: stats for col, stats in self.column_stats.items() if col not in columns_to_discard}
311
+ initial_value_counts = Counter(recurse_df.stack())
312
+ self.val_len = {val: self.calculate_length(val) for val in initial_value_counts.keys()}
313
+
314
+ self.row_stop = row_stop if row_stop else len(recurse_df)
315
+ self.col_stop = col_stop if col_stop else len(recurse_df.columns.tolist())
316
+ print("*" * 80)
317
+ print(f"DF columns = {df.columns}")
318
+ # print(f"Early stop = {early_stop}")
319
+ # print(f"Row recursion stop depth = {self.row_stop}, Column recursion stop depth = {self.col_stop}")
320
+ print("*" * 80)
321
+
322
+ # Early stop and fall back
323
+ recurse_df, _ = self.fixed_reorder(recurse_df)
324
+
325
+ # Recursive reordering
326
+ self.num_cols = len(recurse_df.columns)
327
+ if parallel:
328
+ reordered_df = self.recursive_split_and_reorder(recurse_df, original_columns=columns_to_recurse, early_stop=early_stop)
329
+ else:
330
+ reordered_df, _ = self.recursive_reorder(
331
+ recurse_df,
332
+ initial_value_counts,
333
+ early_stop=early_stop,
334
+ )
335
+
336
+ assert (
337
+ reordered_df.shape == recurse_df.shape
338
+ ), f"Reordered DataFrame shape {reordered_df.shape} does not match original DataFrame shape {recurse_df.shape}"
339
+ assert recurse_df["original_index"].is_unique, "Passed in recurse index contains duplicates!"
340
+ assert reordered_df["original_index"].is_unique, "Reordered index contains duplicates!"
341
+
342
+ if len(columns_to_discard) > 0:
343
+ final_df = pd.merge(reordered_df, discarded_columns_df, on="original_index", how="left")
344
+ else:
345
+ final_df = reordered_df
346
+
347
+ final_df = final_df.drop(columns=["original_index"])
348
+
349
+ if not col_merge:
350
+ assert (
351
+ final_df.shape == initial_df.shape
352
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
353
+ else:
354
+ assert (
355
+ final_df.shape[0] == initial_df.shape[0]
356
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
357
+ assert (
358
+ final_df.shape[1] == recurse_df.shape[1] + len(columns_to_discard) - 1
359
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {recurse_df.shape}"
360
+
361
+ # sort rows lexicographically by all columns to get the final order
362
+ final_df = final_df.sort_values(by=final_df.columns.to_list(), axis=0)
363
+ return final_df, []
364
+
365
+ # EVOLVE-BLOCK-END
benchmarks/ADRS/prism/config.yaml ADDED
@@ -0,0 +1,24 @@
1
+ # Prism (GPU Model Placement) - Minimize the Maximum KV Cache Pressure (KVPR) Across GPUs
2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
3
+ language: python
4
+ diff_based_generation: true
5
+ max_iterations: 100
6
+ checkpoint_interval: 5
7
+ max_solution_length: 60000
8
+
9
+ llm:
10
+ api_base: https://api.openai.com/v1
11
+ models:
12
+ - name: "gpt-5"
13
+ weight: 1.0
14
+ max_tokens: 32000
15
+ timeout: 600
16
+
17
+ prompt:
18
+ system_message: |-
19
+ You are an expert in model placement on GPUs. Your task is to improve a model placement algorithm by improving the function named compute_model_placement in the initial program, which places models onto the available GPUs.
20
+ The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring the models fit into the GPUs' memory. KVPR is the KV cache pressure of a GPU; it indicates how crowded the GPU is. For a specific GPU, KVPR is computed as sum(model.req_rate/model.slo for model in models) / (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are the models placed on that GPU. The generated program should be as simple as possible, and the code should execute correctly without errors.
21
+
22
+ evaluator:
23
+ timeout: 360
24
+
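For reference, the KVPR formula from the system message, written out as a minimal Python sketch (GPU_MEM_SIZE and the Model fields follow `evaluator/evaluator.py`):

```python
GPU_MEM_SIZE = 80  # GB, as in evaluator/evaluator.py

def gpu_kvpr(models) -> float:
    """KV cache pressure of one GPU, per the formula in the system message."""
    weighted_req_rate = sum(m.req_rate / m.slo for m in models)
    free_mem = GPU_MEM_SIZE - sum(m.model_size for m in models)
    # No memory headroom left: pressure is effectively unbounded
    return float("inf") if free_mem <= 0 else weighted_req_rate / free_mem
```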
benchmarks/ADRS/prism/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ PROGRAM="$1"
5
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
6
+
7
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/ADRS/prism/evaluator/evaluator.py ADDED
@@ -0,0 +1,259 @@
1
+ import importlib.util
2
+ import numpy as np
3
+ import time
4
+ import concurrent.futures
5
+ import traceback
6
+ from dataclasses import dataclass
7
+
8
+ GPU_MEM_SIZE = 80 # GB
9
+ MIN_INT = float('-inf')  # sentinel lower than any real KVPR value
10
+
11
+ @dataclass
12
+ class Model:
13
+ model_name: str
14
+ model_size: int
15
+ req_rate: int
16
+ slo: int
17
+ cur_gpu_id: int
18
+
19
+
20
+ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30):
21
+ """
22
+ Run a function with a timeout using concurrent.futures
23
+ """
24
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
25
+ future = executor.submit(func, *args, **kwargs)
26
+ try:
27
+ result = future.result(timeout=timeout_seconds)
28
+ return result
29
+ except concurrent.futures.TimeoutError:
30
+ raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
31
+
32
+
33
+ def safe_float(value):
34
+ """Convert a value to float safely"""
35
+ try:
36
+ if np.isnan(value) or np.isinf(value):
37
+ return 0.0
38
+ return float(value)
39
+ except (TypeError, ValueError):
40
+ return 0.0
41
+
42
+ def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool:
43
+ """
44
+ Verify whether the models fit into GPU memory
45
+ """
46
+ # Check if the placement data is valid
47
+ if placement_data is None:
48
+ return False
49
+
50
+ # Check that the models placed on each GPU fit within its memory
51
+ for gpu_id, models in placement_data.items():
52
+ if sum(model.model_size for model in models) > GPU_MEM_SIZE:
53
+ return False
54
+
55
+ return True
56
+
57
+
58
+ def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float:
59
+ """
60
+ Calculate the KVCache pressure
61
+ """
62
+ max_kvpr = MIN_INT
63
+ for gpu_id, models in placement_data.items():
64
+ total_model_size = sum(model.model_size for model in models)
65
+ total_weighted_req_rate = sum(model.req_rate / model.slo for model in models)
66
+ if GPU_MEM_SIZE - total_model_size > 0:
67
+ kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size)
68
+ else:
69
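+ # No free memory headroom: treat this GPU's pressure as effectively infinite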
+ kvpr = 1000000
70
+ max_kvpr = max(max_kvpr, kvpr)
71
+
72
+ return max_kvpr
73
+
74
+
75
+ def generate_test_gpu_models(num_tests=50):
76
+ """
77
+ Generate multiple test cases (GPU counts and model lists) with different characteristics
78
+ """
79
+ test_cases = []
80
+ np.random.seed(42)
81
+
82
+ for i in range(num_tests):
83
+ gpu_num = np.random.randint(5, 10)
84
+ gpu_models = []
85
+ for j in range(gpu_num*2):
86
+ model_size = np.random.randint(10, 30)
87
+ req_rate = np.random.randint(1, 10)
88
+ slo = np.random.randint(5, 10)
89
+ gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j))
90
+
91
+ test_cases.append((gpu_num, gpu_models))
92
+
93
+ return test_cases
94
+
95
+ def evaluate(program_path):
96
+ """
97
+ Main evaluation function that tests the model placement algorithm
98
+ on multiple generated test cases and calculates the composite performance metric.
99
+ """
100
+ try:
101
+ # Load the program
102
+ spec = importlib.util.spec_from_file_location("program", program_path)
103
+ program = importlib.util.module_from_spec(spec)
104
+ spec.loader.exec_module(program)
105
+
106
+ # Check if required function exists
107
+ if not hasattr(program, "compute_model_placement"):
108
+ return {
109
+ "max_kvpr": 0.0,
110
+ "success_rate": 0.0,
111
+ "combined_score": 0.0,
112
+ "error": "Missing compute_model_placement function",
113
+ }
114
+
115
+ # Generate test gpu and models
116
+ test_gpu_models = generate_test_gpu_models()
117
+
118
+ # Collect metrics across all tests
119
+ all_kvpr = []
120
+ all_metrics = []
121
+ successful_runs = 0
122
+
123
+ for i, (gpu_num, gpu_models) in enumerate(test_gpu_models):
124
+ try:
125
+ # Run the algorithm with timeout
126
+ start_time = time.time()
127
+
128
+ # Call the program's main function
129
+ result = run_with_timeout(
130
+ program.compute_model_placement,
131
+ kwargs={
132
+ 'gpu_num': gpu_num,
133
+ 'models': gpu_models
134
+ },
135
+ timeout_seconds=10
136
+ )
137
+
138
+ execution_time = time.time() - start_time
139
+
140
+ # Validate result format
141
+ if not isinstance(result, dict):
142
+ return {
143
+ "max_kvpr": 0.0,
144
+ "success_rate": 0.0,
145
+ "combined_score": 0.0,
146
+ "error": f"Placement {i}: Expected dict, got {type(result).__name__}",
147
+ }
148
+
149
+ # Validate all models are placed
150
+ placed_models = []
151
+ for gpu_id, assigned_models in result.items():
152
+ if not isinstance(assigned_models, list):
153
+ return {
154
+ "max_kvpr": 0.0,
155
+ "success_rate": 0.0,
156
+ "combined_score": 0.0,
157
+ "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}",
158
+ }
159
+ placed_models.extend(assigned_models)
160
+
161
+ if len(placed_models) != len(gpu_models):
162
+ return {
163
+ "max_kvpr": 0.0,
164
+ "success_rate": 0.0,
165
+ "combined_score": 0.0,
166
+ "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}",
167
+ }
168
+
169
+ # Check for duplicate placements (by object identity)
170
+ placed_ids = [id(m) for m in placed_models]
171
+ if len(set(placed_ids)) != len(placed_ids):
172
+ return {
173
+ "max_kvpr": 0.0,
174
+ "success_rate": 0.0,
175
+ "combined_score": 0.0,
176
+ "error": f"Duplicate models detected",
177
+ }
178
+
179
+ # Check placed models are the exact input objects
180
+ original_ids = {id(m) for m in gpu_models}
181
+ if set(placed_ids) != original_ids:
182
+ return {
183
+ "max_kvpr": 0.0,
184
+ "success_rate": 0.0,
185
+ "combined_score": 0.0,
186
+ "error": "Placed models don't match input models (missing or foreign models)",
187
+ }
188
+
189
+ # Verify GPU memory constraints
190
+ if not verify_gpu_mem_constraint(result):
191
+ return {
192
+ "max_kvpr": 0.0,
193
+ "success_rate": 0.0,
194
+ "combined_score": 0.0,
195
+ "error": f"GPU memory constraint violated",
196
+ }
197
+
198
+ # Calculate metrics using the generated test signal
199
+ max_kvpr = calculate_kvcache_pressure(result)
200
+
201
+ # Store metrics
202
+ metrics = {
203
+ 'max_kvpr': safe_float(max_kvpr),
204
+ 'execution_time': safe_float(execution_time),
205
+ }
206
+
207
+ all_kvpr.append(safe_float(max_kvpr))
208
+ all_metrics.append(metrics)
209
+ successful_runs += 1
210
+
211
+ except TimeoutError:
212
+ print(f"Placement {i}: Timeout")
213
+ continue
214
+ except Exception as e:
215
+ print(f"Placement {i}: Error - {str(e)}")
216
+ continue
217
+
218
+ # If no successful runs, return minimal scores
219
+ if successful_runs == 0:
220
+ return {
221
+ "max_kvpr": 0.0,
222
+ "success_rate": 0.0,
223
+ "combined_score": 0.0,
224
+ "error": "All test signals failed"
225
+ }
226
+
227
+ print(all_metrics)
228
+ # Calculate aggregate metrics
229
+ avg_kvpr = np.mean(all_kvpr)
230
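+ # Report the inverse of the average max KVPR so that lower pressure yields a higher score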
+ if avg_kvpr != 0:
231
+ avg_kvpr = 1.0 / avg_kvpr
232
+ avg_execution_time = np.mean([m['execution_time'] for m in all_metrics])
233
+ success_rate = successful_runs / len(test_gpu_models)
234
+
235
+ return {
236
+ "max_kvpr": safe_float(avg_kvpr),
237
+ "execution_time": safe_float(avg_execution_time),
238
+ "success_rate": safe_float(success_rate),
239
+ "combined_score": safe_float(avg_kvpr) + safe_float(success_rate),
240
+ }
241
+
242
+ except Exception as e:
243
+ print(f"Evaluation failed: {str(e)}")
244
+ print(traceback.format_exc())
245
+ return {
246
+ "max_kvpr": 0.0,
247
+ "success_rate": 0.0,
248
+ "combined_score": 0.0,
249
+ "error": str(e)
250
+ }
251
+
252
+
253
+ if __name__ == "__main__":
254
+ # Backwards-compat: bridges old evaluate() -> dict to the container JSON
255
+ # protocol. wrapper.py is auto-injected at build time from
256
+ # skydiscover/evaluation/wrapper.py.
257
+ from wrapper import run
258
+
259
+ run(evaluate)
benchmarks/ADRS/prism/initial_program.py ADDED
@@ -0,0 +1,75 @@
1
+ GPU_MEM_SIZE = 80 # GB
2
+
3
+ # EVOLVE-BLOCK-START
4
+
5
+ def compute_model_placement(gpu_num, models):
6
+ """
7
+ Compute a model placement that minimizes the maximum KVPR across all GPUs.
8
+
9
+ Args:
10
+ gpu_num: Number of GPUs
11
+ models: List of models to place
12
+
13
+ Returns:
14
+ A placement of models to GPUs
15
+ """
16
+
17
+ # Greedy KVPR-minimizing placement based on Algorithm 1 (without τ check)
18
+ # 1) Sort models by r_j / s_j in descending order
19
+ sorted_models = sorted(models, key=lambda m: (m.req_rate / m.slo), reverse=True)
20
+
21
+ # 2) Initialize per-GPU states
22
+ placement = {gpu_id: [] for gpu_id in range(gpu_num)}
23
+ shared_kv = [GPU_MEM_SIZE for _ in range(gpu_num)] # remaining memory per GPU
24
+ weighted_req_rate = [0.0 for _ in range(gpu_num)] # sum of r_j / s_j per GPU
25
+
26
+ # 3) Assign each model to the GPU that minimizes current KVPR while fitting in memory
27
+ for model in sorted_models:
28
+ best_idx = None
29
+ best_ratio = float('inf')
30
+
31
+ for gpu_id in range(gpu_num):
32
+ if model.model_size <= shared_kv[gpu_id] and shared_kv[gpu_id] > 0:
33
+ current_ratio = weighted_req_rate[gpu_id] / shared_kv[gpu_id]
34
+ if current_ratio < best_ratio:
35
+ best_ratio = current_ratio
36
+ best_idx = gpu_id
37
+
38
+ # Failure: if no GPU can fit, raise an error instead of overcommitting
39
+ if best_idx is None:
40
+ raise ValueError(
41
+ f"Unable to place model of size {model.model_size} GB on any GPU. "
42
+ f"Remaining per-GPU memory: {shared_kv}"
43
+ )
44
+
45
+ placement[best_idx].append(model)
46
+ weighted_req_rate[best_idx] += model.req_rate / model.slo
47
+ shared_kv[best_idx] -= model.model_size
48
+
49
+ return placement
50
+
51
+ # EVOLVE-BLOCK-END
52
+
53
+
54
+ if __name__ == "__main__":
55
+ # Test the algorithm
56
+
57
+ from evaluator import generate_test_gpu_models
58
+ from evaluator import calculate_kvcache_pressure
59
+ from evaluator import safe_float
60
+ import numpy as np
61
+
62
+ test_cases = generate_test_gpu_models()
63
+ all_kvpr = []
64
+ for i, (gpu_num, gpu_models) in enumerate(test_cases):
65
+
66
+ results = compute_model_placement(gpu_num, gpu_models)
67
+ max_kvpr = calculate_kvcache_pressure(results)
68
+ all_kvpr.append(safe_float(max_kvpr))
69
+
70
+ avg_kvpr = np.mean(all_kvpr)
71
+ if avg_kvpr != 0:
72
+ avg_kvpr = 1.0 / avg_kvpr
73
+
74
+
75
+ print(f"Max KVPR: {avg_kvpr:.3f}")
benchmarks/ADRS/prism/initial_program_naive.py ADDED
@@ -0,0 +1,30 @@
1
+ # EVOLVE-BLOCK-START
2
+
3
+ GPU_MEM_SIZE = 80 # GB
4
+
5
+ def compute_model_placement(gpu_num, models):
6
+ """
7
+ Compute a model placement that minimizes the maximum KVPR across all GPUs.
8
+
9
+ Args:
10
+ gpu_num: Number of GPUs
11
+ models: List of models to place
12
+
13
+ Returns:
14
+ A placement of models to GPUs
15
+ """
16
+
17
+ # greedy first-fit: place each model on the lowest-numbered GPU with enough free memory
18
+
19
+ placement = dict()
20
+ for gpu_id in range(gpu_num):
21
+ placement[gpu_id] = []
22
+
23
+ for model in models:
24
+ for gpu_id in range(gpu_num):
25
+ if model.model_size <= GPU_MEM_SIZE - sum(m.model_size for m in placement[gpu_id]):
26
+ placement[gpu_id].append(model)
27
+ break
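+ # NOTE: a model that fits on no GPU is silently dropped; the evaluator rejects such placements ("Not all models placed")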
28
+ return placement
29
+
30
+ # EVOLVE-BLOCK-END
benchmarks/arc_benchmark/README.md ADDED
@@ -0,0 +1,108 @@
1
+ # ARC Benchmark
2
+
3
+ Evolves ARC-AGI visual reasoning task solutions using SkyDiscover.
4
+
5
+ ## Setup
6
+
7
+ ### 1. Download ARC data
8
+
9
+ Clone the ARC-AGI-2 repo and convert the data:
10
+
11
+ ```bash
12
+ cd benchmarks/arc_benchmark
13
+ git clone https://github.com/arcprize/ARC-AGI-2.git /tmp/ARC-AGI-2
14
+ OUT_DIR=./data uv run python convert_arc_agi2_data.py /tmp/ARC-AGI-2
15
+ rm -rf /tmp/ARC-AGI-2
16
+ ```
17
+
18
+ This creates 4 files in `data/`:
19
+ - `arc-agi_training_challenges.json` (1000 tasks)
20
+ - `arc-agi_training_solutions.json`
21
+ - `arc-agi_evaluation_challenges.json` (120 tasks)
22
+ - `arc-agi_evaluation_solutions.json`
23
+
24
+ ### 2. Set your API key
25
+
26
+ ```bash
27
+ export OPENAI_API_KEY=...
28
+ ```
29
+
30
+ ## Run a single task
31
+
32
+ ARC requires a per-task config (each task has unique training examples as the prompt). Use `generate_config.py` to create one, then run with any search backend:
33
+
34
+ ```bash
35
+ cd benchmarks/arc_benchmark
36
+
37
+ # Generate task-specific config
38
+ TASK_NUM=0 ARC_TASK_FILE=training CONFIG_OUT=./config_task_0.yaml \
39
+ uv run python generate_config.py
40
+
41
+ # Run with any backend
42
+ uv run skydiscover-run initial_program.py evaluator.py \
43
+ -c config_task_0.yaml -s [your_algorithm] -i 30
44
+
45
+ # Or with another search backend, e.g. evox, openevolve, or gepa:
46
+ uv run skydiscover-run initial_program.py evaluator.py \
47
+ -c config_task_0.yaml -s [your_algorithm] -i 30
48
+ ```
49
+
50
+ ## Run all evaluation tasks
51
+
52
+ ```bash
53
+ cd benchmarks/arc_benchmark
54
+ export ARC_TASK_FILE=evaluation
55
+
56
+ NUM_TASKS=$(uv run python -c "import json; print(len(json.load(open('data/arc-agi_evaluation_challenges.json'))))")
57
+
58
+ for i in $(seq 0 $((NUM_TASKS - 1))); do
59
+ TASK_NUM=$i CONFIG_OUT=./config_task_${i}.yaml uv run python generate_config.py
60
+ TASK_NUM=$i uv run skydiscover-run initial_program.py evaluator.py \
61
+ -c config_task_${i}.yaml -s [your_algorithm] -i 30 \
62
+ -o outputs/eval_task_${i}
63
+ done
64
+ ```
65
+
66
+ ## Post-discovery test evaluation
67
+
68
+ After the discovery process, evaluate the best program on held-out test inputs:
69
+
70
+ ```bash
71
+ TASK_NUM=0 ARC_TASK_FILE=evaluation \
72
+ OUTS_DIR=./outputs/eval_task_0/adaevolve \
73
+ uv run python post_discovery_eval.py
74
+ ```
75
+
76
+ ## Config: GPT vs Gemini
77
+
78
+ Edit `config.yaml` — comment the GPT block and uncomment the Gemini block, or override with `--model`:
79
+
80
+ ```bash
81
+ uv run skydiscover-run ... -m gemini/gemini-3-pro-preview
82
+ ```
83
+
84
+ ## Files
85
+
86
+ | File | Description |
87
+ |------|-------------|
88
+ | `initial_program.py` | Seed program with two transform functions to evolve |
89
+ | `evaluator.py` | Scores programs on pass@2 + cell accuracy |
90
+ | `config.yaml` | Base config template (prompt injected by generate_config.py) |
91
+ | `generate_config.py` | Injects task-specific training examples into config as system prompt |
92
+ | `post_discovery_eval.py` | Evaluates best program on held-out test inputs |
93
+ | `convert_arc_agi2_data.py` | Converts raw ARC-AGI-2 data to benchmark format |
94
+ | `requirements.txt` | Dependencies (numpy) |
95
+
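+ The `combined_score` reported by `evaluator.py` blends exact match with partial credit; per its docstring the weights are 0.6 for pass@2 and 0.4 for cell accuracy. As a sketch (names illustrative):
+
+ ```python
+ def combined_score(pass_at_2: float, cell_accuracy: float) -> float:
+     # Partial credit keeps a gradient signal even before any perfect solve
+     return 0.6 * pass_at_2 + 0.4 * cell_accuracy
+ ```
+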
96
+ ## Environment variables
97
+
98
+ | Variable | Default | Description |
99
+ |----------|---------|-------------|
100
+ | `OPENAI_API_KEY` | (required) | API key |
101
+ | `ARC_TASK_FILE` | `training` | `training` or `evaluation` |
102
+ | `TASK_NUM` | `0` | Task index within the dataset |
103
+ | `BASE_CONFIG` | `./config.yaml` | Base config template path |
104
+ | `CONFIG_OUT` | `./config_task_{N}.yaml` | Output path for generated config |
105
+ | `DATA_ROOT` | `./data` | Path to ARC data directory |
106
+ | `MAX_ITERATIONS` | (from config) | Override `max_iterations` at runtime |
107
+ | `ARC_EVAL_INCLUDE_TEST` | `0` | Set to `1` to also run the held-out test inputs during evolution |
108
+ | `ARC_EVAL_USE_TEST_FOR_SCORE` | `0` | Set to `1` to average train and test scores into `combined_score` (only used when `ARC_EVAL_INCLUDE_TEST=1`) |
benchmarks/arc_benchmark/config.yaml ADDED
@@ -0,0 +1,51 @@
1
+ # ARC Benchmark base config
2
+ # This file is used by generate_config.py to inject a task-specific prompt.
3
+ # Switch models by editing the 'llm' section below.
4
+
5
+ # General settings
6
+ max_iterations: 30
7
+ checkpoint_interval: 10
8
+ log_level: "INFO"
9
+ random_seed: 42
10
+ diff_based_generation: true
11
+ max_solution_length: 50000
12
+
13
+ # LLM configuration (Option A: GPT-5)
14
+ llm:
15
+ models:
16
+ - name: "gpt-5"
17
+ weight: 1.0
18
+ api_base: "https://api.openai.com/v1"
19
+ temperature: 0.7
20
+ # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject requests that set both temperature and top_p
21
+ max_tokens: 32768
22
+ timeout: 3000
23
+
24
+ # Option B: Gemini 3 Pro (comment Option A and uncomment below)
25
+ # llm:
26
+ # models:
27
+ # - name: "gemini-3-pro-preview"
28
+ # weight: 1.0
29
+ # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
30
+ # temperature: 0.7
31
+ # top_p: 0.95
32
+ # max_tokens: 32768
33
+ # timeout: 3000
34
+
35
+ # Search configuration (default: top-k)
36
+ search:
37
+ type: "topk"
38
+ database:
39
+ random_seed: 42
40
+ num_context_programs: 4
41
+
42
+ # Prompt configuration
43
+ # NOTE: generate_config.py overwrites prompt.system_message per task.
44
+ prompt:
45
+ system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG"
46
+
47
+ # Evaluator configuration
48
+ evaluator:
49
+ timeout: 360
50
+ max_retries: 3
51
+ cascade_evaluation: false
benchmarks/arc_benchmark/convert_arc_agi2_data.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json)
4
+ into the format expected by this benchmark:
5
+ - arc-agi_{split}_challenges.json (task_id -> { train, test with inputs only })
6
+ - arc-agi_{split}_solutions.json (task_id -> list of test output grids)
7
+
8
+ Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation):
9
+ OUT_DIR=./data python3 convert_arc_agi2_data.py .
10
+
11
+ Or with an external ARC-AGI-2 clone:
12
+ python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2
13
+ # Writes into that path by default; set OUT_DIR to write elsewhere.
14
+ """
15
+ import json
16
+ import os
17
+ import sys
18
+
19
+
20
+ def convert_split(repo_root: str, split: str, out_dir: str) -> None:
21
+ """Convert data/{split}/*.json into challenges + solutions JSON."""
22
+ split_dir = os.path.join(repo_root, "data", split)
23
+ if not os.path.isdir(split_dir):
24
+ print(f"Skip {split}: no directory {split_dir}")
25
+ return
26
+
27
+ challenges = {}
28
+ solutions = {}
29
+
30
+ for name in sorted(os.listdir(split_dir)):
31
+ if not name.endswith(".json"):
32
+ continue
33
+ task_id = name[:-5] # strip .json
34
+ path = os.path.join(split_dir, name)
35
+ with open(path, "r") as f:
36
+ task = json.load(f)
37
+ # Challenge: train as-is; test with only "input" (no output)
38
+ challenges[task_id] = {
39
+ "train": task["train"],
40
+ "test": [{"input": p["input"]} for p in task["test"]],
41
+ }
42
+ # Solutions: list of test output grids
43
+ solutions[task_id] = [p["output"] for p in task["test"]]
44
+
45
+ challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json")
46
+ solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json")
47
+ with open(challenges_path, "w") as f:
48
+ json.dump(challenges, f)
49
+ with open(solutions_path, "w") as f:
50
+ json.dump(solutions, f)
51
+ print(f"Wrote {challenges_path} ({len(challenges)} tasks)")
52
+ print(f"Wrote {solutions_path} ({len(solutions)} tasks)")
53
+
54
+
55
+ def main():
56
+ repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")
57
+ out_dir = os.getenv("OUT_DIR", repo_root)
58
+ for split in ("training", "evaluation"):
59
+ convert_split(repo_root, split, out_dir)
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
benchmarks/arc_benchmark/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
1
+ FROM python:3.12-slim
2
+ WORKDIR /benchmark
3
+
4
+ COPY requirements.txt .
5
+ RUN pip install --no-cache-dir -r requirements.txt
6
+
7
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
8
+ # that define evaluate(program_path) -> dict. Bridges them to the container
9
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
10
+ COPY . .
11
+ RUN chmod +x evaluate.sh
12
+
13
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/arc_benchmark/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ PROGRAM="$1"
5
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
6
+
7
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/arc_benchmark/evaluator/evaluator.py ADDED
@@ -0,0 +1,407 @@
1
+ import numpy as np
2
+ from typing import List, Tuple, Dict, Any
3
+ import json
4
+ import os
5
+
6
+ try:
7
+ from skydiscover.evaluation.evaluation_result import EvaluationResult
8
+ except ImportError:
9
+ from dataclasses import dataclass, field
10
+ from typing import Union
11
+
12
+ @dataclass
13
+ class EvaluationResult:
14
+ metrics: Dict[str, float]
15
+ artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)
16
+ import importlib.util
17
+
18
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
19
+ TASK_NUM = os.getenv("TASK_NUM", "0")
20
+ DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
21
+ INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes")
22
+ USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes")
23
+
24
+
25
+ def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float:
26
+ """
27
+ Compute continuous cell-level accuracy between prediction and ground truth.
28
+ Returns a float in [0, 1]. Handles shape mismatches gracefully.
29
+ """
30
+ if pred.shape != gt.shape:
31
+ # Partial credit for getting shape partially right
32
+ shape_score = 0.0
33
+ if len(pred.shape) == len(gt.shape) == 2:
34
+ row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0
35
+ col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0
36
+ shape_score = (row_match + col_match) * 0.1 # up to 0.2 for correct dimensions
37
+ return shape_score
38
+ # Cell-level accuracy
39
+ total_cells = gt.size
40
+ if total_cells == 0:
41
+ return 1.0
42
+ correct_cells = int(np.sum(pred == gt))
43
+ return correct_cells / total_cells
44
+
45
+
46
+ def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float:
47
+ """Return the best cell accuracy across all attempts for one example."""
48
+ return max(cell_accuracy_single(a, gt) for a in attempts)
49
+
50
+
51
+ def pass_at_2_accuracy_single(
52
+ attempts: List[np.ndarray],
53
+ gt: np.ndarray
54
+ ) -> Tuple[int, Dict[int, Any]]:
55
+ """
56
+ Compute pass@2 accuracy for a single ARC test case.
57
+
58
+ Args:
59
+ attempts: List of 2 numpy arrays representing model attempts.
60
+ gt: Ground-truth output as a 2D numpy array.
61
+
62
+ Returns:
63
+ pass_at_2: int (1 if any attempt is perfectly correct, else 0)
64
+ diagnostics: dict mapping attempt index -> diagnostic info.
65
+ If sizes match, includes indices of incorrect cells.
66
+ """
67
+ assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation."
68
+
69
+ diagnostics = {}
70
+ passed = False
71
+
72
+ for i, pred in enumerate(attempts):
73
+ attempt_info = {}
74
+
75
+ # Size check
76
+ if pred.shape != gt.shape:
77
+ attempt_info["size_match"] = False
78
+ attempt_info["pred_shape"] = list(pred.shape)
79
+ attempt_info["gt_shape"] = list(gt.shape)
80
+ attempt_info["incorrect_indices"] = None
81
+ attempt_info["cell_accuracy"] = 0.0
82
+ attempt_passed = False
83
+ else:
84
+ attempt_info["size_match"] = True
85
+
86
+ # Find incorrect cells
87
+ incorrect_mask = pred != gt
88
+ incorrect_indices = np.argwhere(incorrect_mask)
89
+
90
+ attempt_info["incorrect_indices"] = incorrect_indices.tolist()
91
+ attempt_info["num_incorrect"] = int(incorrect_mask.sum())
92
+ attempt_info["num_total"] = int(gt.size)
93
+ attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size
94
+
95
+ # Perfect match
96
+ if incorrect_mask.sum() == 0:
97
+ attempt_passed = True
98
+ else:
99
+ attempt_passed = False
100
+
101
+ attempt_info["perfect_match"] = attempt_passed
102
+ passed = attempt_passed or passed
103
+
104
+ diagnostics[i] = attempt_info
105
+
106
+ pass_at_2 = 1 if passed else 0
107
+
108
+ return pass_at_2, diagnostics
109
+
110
+ def pass_at_2_accuracy_multi_test(
111
+ all_attempts: List[List[np.ndarray]],
112
+ all_gt: List[np.ndarray]
113
+ ) -> Tuple[List[int], List[Dict[int, Any]]]:
114
+ """
115
+ Compute pass@2 accuracy across multiple ARC test cases.
116
+
117
+ Args:
118
+ all_attempts: List of lists of 2 numpy arrays for each test case.
119
+ all_gt: List of ground-truth outputs as 2D numpy arrays.
120
+ """
121
+ assert len(all_attempts) == len(all_gt), "Mismatched number of test cases."
122
+
123
+ all_diagnostics = []
124
+ all_pass = []
125
+
126
+ for attempts, gt in zip(all_attempts, all_gt):
127
+ pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt)
128
+ all_pass.append(pass_at_2)
129
+ all_diagnostics.append(diagnostics)
130
+
131
+ return all_pass, all_diagnostics
132
+
133
+ def extract_failure_artifacts(diagnostics, pred=None, gt=None):
134
+ """
135
+ Extract failure artifacts from diagnostics for a given example.
136
+ Includes actual vs expected output snippets for better LLM feedback.
137
+ """
138
+ artifacts = {}
139
+ if not diagnostics["size_match"]:
140
+ artifacts["error_type"] = "SizeMismatch"
141
+ artifacts["error_message"] = (
142
+ f"Output shape {diagnostics['pred_shape']} does not match "
143
+ f"expected shape {diagnostics['gt_shape']}."
144
+ )
145
+ artifacts["suggestion"] = (
146
+ f"Your output has shape {diagnostics['pred_shape']} but the correct output "
147
+ f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions."
148
+ )
149
+ else:
150
+ num_incorrect = diagnostics['num_incorrect']
151
+ num_total = diagnostics['num_total']
152
+ accuracy = diagnostics['cell_accuracy']
153
+ artifacts["error_type"] = "IncorrectCells"
154
+ artifacts["error_message"] = (
155
+ f"{num_incorrect}/{num_total} cells incorrect "
156
+ f"(cell accuracy: {accuracy:.1%})."
157
+ )
158
+ # Show a compact diff of expected vs actual for first few wrong cells
159
+ if diagnostics['incorrect_indices'] and pred is not None and gt is not None:
160
+ wrong = diagnostics['incorrect_indices'][:8] # first 8 wrong cells
161
+ diff_lines = []
162
+ for r, c in wrong:
163
+ diff_lines.append(f" [{r},{c}]: got {int(pred[r,c])}, expected {int(gt[r,c])}")
164
+ artifacts["cell_diffs"] = "\n".join(diff_lines)
165
+ if len(diagnostics['incorrect_indices']) > 8:
166
+ artifacts["cell_diffs"] += f"\n ... and {len(diagnostics['incorrect_indices'])-8} more"
167
+ artifacts["suggestion"] = (
168
+ f"Your solution gets {accuracy:.1%} of cells correct. "
169
+ f"Review the transformation logic for the failing cells."
170
+ )
171
+
172
+ return artifacts
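+ # Example artifact for a near-miss (illustrative values):
+ # {"error_type": "IncorrectCells",
+ #  "error_message": "2/25 cells incorrect (cell accuracy: 92.0%).",
+ #  "cell_diffs": " [0,3]: got 4, expected 7\n [2,1]: got 0, expected 7",
+ #  "suggestion": "Your solution gets 92.0% of cells correct. ..."}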
173
+
174
+ def evaluate(program_path):
175
+ """
176
+ Evaluate the program on ARC task training (and optionally test) examples.
177
+
178
+ Returns a combined_score that blends:
179
+ - pass@2 (binary perfect-match, weighted 0.6)
180
+ - cell accuracy (continuous partial credit, weighted 0.4)
181
+ This gives evolution gradient signal even when no example is solved perfectly.
182
+ """
183
+ spec = importlib.util.spec_from_file_location("program_module", program_path)
184
+ program_module = importlib.util.module_from_spec(spec)
185
+ spec.loader.exec_module(program_module)
186
+
187
+ if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
188
+ print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
189
+
190
+ error_artifacts = {
191
+ "error_type": "MissingFunction",
192
+ "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
193
+ "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array."
194
+ }
195
+
196
+ return EvaluationResult(
197
+ metrics={
198
+ "runs_successfully": 0.0,
199
+ "combined_score": 0.0,
200
+ "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
201
+ },
202
+ artifacts=error_artifacts
203
+ )
204
+
205
+ # Load ARC tasks
206
+ challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json")
207
+
208
+ with open(challenge_path, 'r') as f:
209
+ tasks = json.load(f)
210
+
211
+ task_id = list(tasks.keys())[int(TASK_NUM)]
212
+ task = tasks[task_id]
213
+
214
+ train_inputs = [np.array(inp["input"]) for inp in task['train']]
215
+ train_gts = [np.array(gt["output"]) for gt in task['train']]
216
+
217
+ train_attempts = []
218
+
219
+ # Generate attempts for training data
220
+ for inp in train_inputs:
221
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
222
+ if not isinstance(attempt_1, np.ndarray):
223
+ print(f"transform_grid_attempt_1 did not return a numpy array")
224
+
225
+ error_artifacts = {
226
+ "error_type": "InvalidReturnType",
227
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
228
+ "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
229
+ }
230
+
231
+ return EvaluationResult(
232
+ metrics={
233
+ "runs_successfully": 0.0,
234
+ "combined_score": 0.0,
235
+ "error": "transform_grid_attempt_1 did not return a numpy array"
236
+ },
237
+ artifacts=error_artifacts
238
+ )
239
+
240
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
241
+ if not isinstance(attempt_2, np.ndarray):
242
+ print(f"transform_grid_attempt_2 did not return a numpy array")
243
+
244
+ error_artifacts = {
245
+ "error_type": "InvalidReturnType",
246
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
247
+ "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
248
+ }
249
+
250
+ return EvaluationResult(
251
+ metrics={
252
+ "runs_successfully": 0.0,
253
+ "combined_score": 0.0,
254
+ "error": "transform_grid_attempt_2 did not return a numpy array"
255
+ },
256
+ artifacts=error_artifacts
257
+ )
258
+ train_attempts.append([attempt_1, attempt_2])
259
+
260
+ pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts)
261
+
262
+ # Compute both binary pass@2 and continuous cell accuracy
263
+ train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train)
264
+ train_cell_acc = sum(
265
+ best_attempt_cell_accuracy(attempts, gt)
266
+ for attempts, gt in zip(train_attempts, train_gts)
267
+ ) / len(train_gts)
268
+
269
+ # Blended score: pass@2 (60%) + cell accuracy (40%) gives gradient signal
270
+ train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc
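+ # e.g. 1 of 3 examples solved perfectly with a mean best-attempt cell accuracy
+ # of 0.9: 0.6 * (1/3) + 0.4 * 0.9 = 0.56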
271
+
272
+ metrics = {
273
+ "runs_successfully": 1.0,
274
+ "combined_score": train_score,
275
+ "train_combined_score": train_score,
276
+ "train_pass_at_2_score": train_pass_score,
277
+ "train_cell_accuracy": round(train_cell_acc, 4),
278
+ }
279
+ error_artifacts = {}
280
+ for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)):
281
+ example_name = f"train_example_{i}"
282
+ metrics[f"{example_name}_pass_at_2"] = train_pass
283
+ best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i])
284
+ metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
285
+ for attempt in train_diagnostics:
286
+ attempt_pass = train_diagnostics[attempt]["perfect_match"]
287
+ metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass
288
+ if not attempt_pass:
289
+ pred = train_attempts[i][attempt]
290
+ gt = train_gts[i]
291
+ error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts(
292
+ train_diagnostics[attempt], pred=pred, gt=gt
293
+ )
294
+
295
+ # Optional: include test feedback (uses solutions if available)
296
+ if INCLUDE_TEST:
297
+ solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json")
298
+ if os.path.isfile(solution_path):
299
+ with open(solution_path, 'r') as f:
300
+ solutions = json.load(f)
301
+ task_id = list(tasks.keys())[int(TASK_NUM)]
302
+ solution = solutions.get(task_id)
303
+ if solution is not None and "test" in task:
304
+ if len(task["test"]) != len(solution):
305
+ raise ValueError(
306
+ f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
307
+ f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
308
+ f"and arc-agi_{TASK_FILE}_solutions.json were generated together."
309
+ )
310
+ test_inputs = [np.array(inp["input"]) for inp in task['test']]
311
+ test_gts = [np.array(gt) for gt in solution]
312
+
313
+ test_attempts = []
314
+ for inp in test_inputs:
315
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
316
+ if not isinstance(attempt_1, np.ndarray):
317
+ print(f"transform_grid_attempt_1 did not return a numpy array (test)")
318
+ return EvaluationResult(
319
+ metrics={
320
+ "runs_successfully": 0.0,
321
+ "combined_score": 0.0,
322
+ "error": "transform_grid_attempt_1 did not return a numpy array (test)"
323
+ },
324
+ artifacts={
325
+ "error_type": "InvalidReturnType",
326
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).",
327
+ "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array."
328
+ }
329
+ )
330
+
331
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
332
+ if not isinstance(attempt_2, np.ndarray):
333
+ print(f"transform_grid_attempt_2 did not return a numpy array (test)")
334
+ return EvaluationResult(
335
+ metrics={
336
+ "runs_successfully": 0.0,
337
+ "combined_score": 0.0,
338
+ "error": "transform_grid_attempt_2 did not return a numpy array (test)"
339
+ },
340
+ artifacts={
341
+ "error_type": "InvalidReturnType",
342
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).",
343
+ "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array."
344
+ }
345
+ )
346
+ test_attempts.append([attempt_1, attempt_2])
347
+
348
+ pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
349
+ test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test)
350
+ test_cell_acc = sum(
351
+ best_attempt_cell_accuracy(attempts, gt)
352
+ for attempts, gt in zip(test_attempts, test_gts)
353
+ ) / len(test_gts)
354
+ test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc
355
+
356
+ metrics["test_combined_score"] = test_score
357
+ metrics["test_pass_at_2_score"] = test_pass_score
358
+ metrics["test_cell_accuracy"] = round(test_cell_acc, 4)
359
+ metrics["test_included"] = 1
360
+
361
+ for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
362
+ example_name = f"test_example_{i}"
363
+ metrics[f"{example_name}_pass_at_2"] = test_pass
364
+ best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i])
365
+ metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
366
+ for attempt in test_diagnostics:
367
+ metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
368
+ if test_pass == 0:
369
+ first_failing_idx = next(
370
+ (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
371
+ 0,
372
+ )
373
+ pred = test_attempts[i][first_failing_idx]
374
+ gt = test_gts[i]
375
+ error_artifacts[f"{example_name}"] = extract_failure_artifacts(
376
+ test_diagnostics[first_failing_idx], pred=pred, gt=gt
377
+ )
378
+
379
+ if USE_TEST_IN_SCORE:
380
+ metrics["combined_score"] = (train_score + test_score) / 2.0
381
+ else:
382
+ metrics["test_included"] = 0
383
+ else:
384
+ metrics["test_included"] = 0
385
+
386
+ return EvaluationResult(
387
+ metrics=metrics,
388
+ artifacts=error_artifacts
389
+ )
390
+
391
+
392
+ def _evaluate_as_dict(program_path):
393
+ """Adapter: calls evaluate() and converts EvaluationResult to a plain dict."""
394
+ result = evaluate(program_path)
395
+ d = dict(result.metrics)
396
+ for k, v in result.artifacts.items():
397
+ d[k] = v
398
+ return d
399
+
400
+
401
+ if __name__ == "__main__":
402
+ # Backwards-compat: bridges old evaluate() -> EvaluationResult to the
403
+ # container JSON protocol. wrapper.py is copied from
404
+ # skydiscover/evaluation/wrapper.py.
405
+ from wrapper import run
406
+
407
+ run(_evaluate_as_dict)
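+ # Example invocation (illustrative): `python evaluator.py candidate_program.py`
+ # prints a single JSON line on stdout per the container protocol in wrapper.py.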
benchmarks/arc_benchmark/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
1
+ numpy
benchmarks/arc_benchmark/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
1
+ """Backwards-compat wrapper for old Python-based evaluators.
2
+
3
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
4
+ bridges that interface to the container JSON protocol expected by
5
+ ContainerizedEvaluator.
6
+
7
+ Usage — add this to the bottom of your evaluator.py::
8
+
9
+ if __name__ == "__main__":
10
+ from wrapper import run
11
+ run(evaluate)
12
+ """
13
+
14
+ import json
15
+ import sys
16
+ import traceback
17
+
18
+
19
+ def run(evaluate_fn):
20
+ """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
21
+
22
+ * Reads ``sys.argv[1]`` as the program path.
23
+ * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
24
+ don't contaminate the JSON output.
25
+ * Separates numeric metrics from non-numeric artifacts.
26
+ * Guarantees ``combined_score`` is always present in metrics.
27
+ """
28
+ if len(sys.argv) < 2:
29
+ print("Usage: evaluator.py <program_path>", file=sys.stderr)
30
+ sys.exit(1)
31
+
32
+ program_path = sys.argv[1]
33
+
34
+ # Redirect stdout → stderr during evaluation so debug prints from
35
+ # the evaluator don't contaminate the JSON output on stdout.
36
+ real_stdout = sys.stdout
37
+ sys.stdout = sys.stderr
38
+ try:
39
+ result = evaluate_fn(program_path)
40
+ except Exception as e:
41
+ sys.stdout = real_stdout
42
+ print(
43
+ json.dumps(
44
+ {
45
+ "status": "error",
46
+ "combined_score": 0.0,
47
+ "metrics": {"combined_score": 0.0},
48
+ "artifacts": {
49
+ "error": str(e),
50
+ "traceback": traceback.format_exc(),
51
+ },
52
+ }
53
+ )
54
+ )
55
+ return
56
+ sys.stdout = real_stdout
57
+
58
+ if not isinstance(result, dict):
59
+ print(
60
+ json.dumps(
61
+ {
62
+ "status": "error",
63
+ "combined_score": 0.0,
64
+ "metrics": {"combined_score": 0.0},
65
+ "artifacts": {
66
+ "error": f"evaluate() returned {type(result).__name__}, expected dict"
67
+ },
68
+ }
69
+ )
70
+ )
71
+ return
72
+
73
+ # Separate numeric metrics from non-numeric artifacts.
74
+ metrics = {}
75
+ artifacts = {}
76
+ for k, v in result.items():
77
+ if isinstance(v, bool):
78
+ metrics[k] = float(v)
79
+ elif isinstance(v, (int, float)):
80
+ metrics[k] = float(v)
81
+ elif isinstance(v, str):
82
+ artifacts[k] = v
83
+ elif isinstance(v, (list, dict)):
84
+ artifacts[k] = json.dumps(v)
85
+
86
+ if "combined_score" not in metrics:
87
+ metrics["combined_score"] = 0.0
88
+
89
+ status = "error" if "error" in artifacts else "success"
90
+ output = {
91
+ "status": status,
92
+ "combined_score": metrics["combined_score"],
93
+ "metrics": metrics,
94
+ }
95
+ if artifacts:
96
+ output["artifacts"] = artifacts
97
+
98
+ print(json.dumps(output))
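+
+ # Example stdout for a successful run (illustrative values):
+ # {"status": "success", "combined_score": 0.56,
+ #  "metrics": {"combined_score": 0.56, "runs_successfully": 1.0},
+ #  "artifacts": {"train_example_0_attempt_1_diagnostics": "{...}"}}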
benchmarks/arc_benchmark/generate_config.py ADDED
@@ -0,0 +1,101 @@
1
+ import os
2
+ import yaml
3
+ import json
4
+
5
+
6
+ def load_task_as_prompt(task_json, task_num):
7
+ with open(task_json, 'r') as f:
8
+ tasks = json.load(f)
9
+
10
+ task_id = list(tasks.keys())[int(task_num)]
11
+ task = tasks[task_id]
12
+ train_inputs = [inp["input"] for inp in task['train']]
13
+ train_outputs = [gt["output"] for gt in task['train']]
14
+
15
+ train_pairs = ""
16
+ for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)):
17
+ train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n"
18
+
19
+ prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles.
20
+ Find the common pattern that transforms each input grid into its corresponding output grid.
21
+
22
+ Your task is to write python functions that implement the MOST GENERAL transformation rule. The rule must:
23
+ - Apply consistently to ALL training examples
24
+ - Generalize to unseen inputs (critical for success)
25
+ - Be based on structural patterns, not memorized examples
26
+ - Use relative/spatial rules rather than absolute coordinates
27
+
28
+ Generalization rules (THIS IS CRITICAL):
29
+ - Infer the transformation ONLY from the training input-output pairs
30
+ - If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one
31
+ - Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules
32
+ - Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples
33
+ - Think: "What is the underlying principle?" not "What fits these specific examples?"
34
+ - Use numpy only (no external libraries)
35
+
36
+ Common failure modes to avoid:
37
+ - Overfitting to specific grid sizes or positions in training examples
38
+ - Hardcoding colors, coordinates, or counts from training data
39
+ - Assuming global properties (like separator colors) without verifying across ALL examples
40
+ - Using absolute positions when relative/structural rules would generalize better
41
+
42
+ Solution approach:
43
+ - Analyze the training examples to identify the CORE transformation principle
44
+ - Prefer block-wise, object-wise, or pattern-based rules that work locally
45
+ - If the grid has distinct regions, solve each region independently
46
+ - Build flexible rules that adapt to different input sizes and structures
47
+
48
+ Training examples:
49
+ {train_pairs}
50
+
51
+ Your task: Write 2 different Python functions that implement the general transformation rule.
52
+ - Each function takes a 2D numpy array as input and returns the transformed 2D numpy array
53
+ - The two attempts should use genuinely different strategies (e.g., different algorithmic approaches)
54
+ - Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases
55
+
56
+ CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples.
57
+
58
+ Remember to only output the modified python functions as your solution."""
59
+
60
+ return prompt
61
+
62
+ def generate_config(task_num, task_file, dataset_root=None, base_config=None):
63
+ if dataset_root is None:
64
+ dataset_root = os.getenv("DATA_ROOT")
65
+ if not dataset_root:
66
+ dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
67
+ task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json")
68
+ prompt = load_task_as_prompt(task_json, task_num)
69
+
70
+ if base_config is None:
71
+ default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")
72
+ base_config = os.getenv("BASE_CONFIG", default_base)
73
+ with open(base_config, 'r') as file:
74
+ config = yaml.safe_load(file)
75
+
76
+ config['prompt']['system_message'] = prompt
77
+ # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config)
78
+ api_key_env = os.getenv("OPENAI_API_KEY")
79
+ if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key":
80
+ config["llm"]["api_key"] = api_key_env.strip()
81
+ # Override max_iterations from env if set (e.g. by run_discovery.sh)
82
+ max_iter_env = os.getenv("MAX_ITERATIONS")
83
+ if max_iter_env is not None and str(max_iter_env).strip() != "":
84
+ try:
85
+ config["max_iterations"] = int(max_iter_env)
86
+ except ValueError:
87
+ pass
88
+
89
+ # Write to a per-task config file so parallel runs don't conflict
90
+ out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml")
91
+ with open(out_path, 'w') as file:
92
+ yaml.dump(config, file)
93
+ return out_path
94
+
95
+ if __name__ == "__main__":
96
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
97
+ TASK_NUM = os.getenv("TASK_NUM", "0")
98
+
99
+ path = generate_config(TASK_NUM, TASK_FILE)
100
+ print(path)
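+ # Example (illustrative): `TASK_NUM=7 MAX_ITERATIONS=30 python generate_config.py`
+ # embeds task 7's training pairs in the prompt and writes ./config_task_7.yaml.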
101
+
benchmarks/arc_benchmark/initial_program.py ADDED
@@ -0,0 +1,42 @@
1
+ # EVOLVE-BLOCK-START
2
+
3
+ import numpy as np
4
+
5
+ def transform_grid_attempt_1(grid):
6
+ """
7
+ Example transformation:
8
+ - Validate input (2D, integer values 0-9).
9
+ - Rotate the grid 90 degrees clockwise.
10
+ - Increment every cell by 1 modulo 10 (keeps values 0-9).
11
+ Returns a new numpy int array.
12
+ """
13
+ arr = _validate_grid(grid)
14
+ out = np.rot90(arr, k=-1) # 90 degrees clockwise
15
+ out = (out + 1) % 10
16
+ return out.astype(np.int32)
17
+
18
+ def transform_grid_attempt_2(grid):
19
+ """
20
+ Example transformation:
21
+ - Validate input (2D, integer values 0-9).
22
+ - Upsample each cell to a 2x2 block (doubling both dimensions).
23
+ - Invert colors by mapping v -> 9 - v (keeps values 0-9).
24
+ Returns a new numpy int array.
25
+ """
26
+ arr = _validate_grid(grid)
27
+ out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1)
28
+ out = 9 - out
29
+ return out.astype(np.int32)
30
+
31
+ # EVOLVE-BLOCK-END
32
+
33
+ def _validate_grid(grid):
34
+ arr = np.asarray(grid)
35
+ if arr.ndim != 2:
36
+ raise ValueError("Input must be a 2D array.")
37
+ # cast to integer type for value checks
38
+ if not np.issubdtype(arr.dtype, np.integer):
39
+ arr = arr.astype(int)
40
+ if arr.size and (arr.min() < 0 or arr.max() > 9):
41
+ raise ValueError("Array values must be integers in the range 0-9.")
42
+ return arr
benchmarks/arc_benchmark/post_discovery_eval.py ADDED
@@ -0,0 +1,157 @@
1
+ import importlib.util
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
6
+
7
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
8
+ TASK_NUM = os.getenv("TASK_NUM", "0")
9
+ OUTS_DIR = os.getenv("OUTS_DIR", "")
10
+ # Optional: path to a checkpoint dir (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that best_program.py on test set
11
+ PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")
12
+
13
+
14
+ def _program_path():
15
+ """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
16
+ if PROGRAM_DIR:
17
+ return os.path.join(PROGRAM_DIR, "best_program.py")
18
+ return os.path.join(OUTS_DIR, "best", "best_program.py")
19
+
20
+
21
+ def _result_path():
22
+ """Where to write post_evolution_evaluation_result.json."""
23
+ if PROGRAM_DIR:
24
+ return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
25
+ return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")
26
+
27
+
28
+ def load_program_module():
29
+ """Dynamically load the best_program.py module from the specified directory."""
30
+ path = _program_path()
31
+ if not os.path.isfile(path):
32
+ raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
33
+ spec = importlib.util.spec_from_file_location("program_module", path)
34
+ program_module = importlib.util.module_from_spec(spec)
35
+ spec.loader.exec_module(program_module)
36
+
37
+ return program_module
38
+
39
+ def evaluate():
40
+ """Evaluate the program module located in the specified directory."""
41
+ program_module = load_program_module()
42
+ if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
43
+ print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
44
+
45
+ error_artifacts = {
46
+ "error_type": "MissingFunction",
47
+ "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
48
+ "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array."
49
+ }
50
+
51
+ return dict(
52
+ metrics={
53
+ "runs_successfully": 0.0,
54
+ "combined_score": 0.0,
55
+ "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
56
+ },
57
+ artifacts=error_artifacts
58
+ )
59
+ # Load ARC tasks
60
+ data_root = os.getenv("DATA_ROOT")
61
+ if not data_root:
62
+ data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
63
+ challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
64
+ solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")
65
+
66
+ with open(challenge_path, 'r') as f:
67
+ tasks = json.load(f)
68
+ with open(solution_path, 'r') as f:
69
+ solutions = json.load(f)
70
+
71
+ task_id = list(tasks.keys())[int(TASK_NUM)]
72
+ solution = solutions[task_id]
73
+ task = tasks[task_id]
74
+
75
+ # Sanity check: test inputs and solutions must align (same task, same order)
76
+ if len(task["test"]) != len(solution):
77
+ raise ValueError(
78
+ f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
79
+ f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
80
+ f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
81
+ )
82
+
83
+ test_inputs = [np.array(inp["input"]) for inp in task['test']]
84
+ test_gts = [np.array(gt) for gt in solution]
85
+
86
+ test_attempts = []
87
+ for inp in test_inputs:
88
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
89
+ if not isinstance(attempt_1, np.ndarray):
90
+ print(f"transform_grid_attempt_1 did not return a numpy array")
91
+
92
+ error_artifacts = {
93
+ "error_type": "InvalidReturnType",
94
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
95
+ "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
96
+ }
97
+
98
+ return dict(
99
+ metrics={
100
+ "runs_successfully": 0.0,
101
+ "combined_score": 0.0,
102
+ "error": "transform_grid_attempt_1 did not return a numpy array"
103
+ },
104
+ artifacts=error_artifacts
105
+ )
106
+
107
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
108
+ if not isinstance(attempt_2, np.ndarray):
109
+ print(f"transform_grid_attempt_2 did not return a numpy array")
110
+
111
+ error_artifacts = {
112
+ "error_type": "InvalidReturnType",
113
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
114
+ "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
115
+ }
116
+
117
+ return dict(
118
+ metrics={
119
+ "runs_successfully": 0.0,
120
+ "combined_score": 0.0,
121
+ "error": "transform_grid_attempt_2 did not return a numpy array"
122
+ },
123
+ artifacts=error_artifacts
124
+ )
125
+ test_attempts.append([attempt_1, attempt_2])
126
+
127
+ pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
128
+ metrics = {
129
+ "runs_successfully": 1.0,
130
+ "combined_score": sum(pass_at_2_test) / len(pass_at_2_test),
131
+ }
132
+ error_artifacts = {}
133
+ for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
134
+ example_name = f"test_example_{i}"
135
+ metrics[f"{example_name}_pass_at_2"] = test_pass
136
+ for attempt in test_diagnostics:
137
+ metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
138
+ if test_pass == 0:
139
+ # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
140
+ first_failing = next(
141
+ (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
142
+ test_diagnostics[0],
143
+ )
144
+ error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)
145
+
146
+ return dict(
147
+ metrics=metrics,
148
+ artifacts=error_artifacts
149
+ )
150
+
151
+ if __name__ == "__main__":
152
+ evaluation_result = evaluate()
153
+ result_path = _result_path()
154
+ os.makedirs(os.path.dirname(result_path), exist_ok=True)
155
+ with open(result_path, 'w') as f:
156
+ json.dump(evaluation_result, f, indent=4)
157
+ print(f"Test-set evaluation written to {result_path}")
benchmarks/frontier-cs-eval/README.md ADDED
@@ -0,0 +1,72 @@
1
+ # Frontier-CS Benchmark
2
+
3
+ Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover.
4
+
5
+ ## Setup
6
+
7
+ ```bash
8
+ # 1. Clone Frontier-CS
9
+ cd benchmarks/frontier-cs-eval
10
+ git clone https://github.com/FrontierCS/Frontier-CS.git
11
+
12
+ # 2. Start the judge server (requires Docker)
13
+ cd Frontier-CS/algorithmic
14
+ docker compose up -d
15
+
16
+ # 3. Install dependencies (from project root)
17
+ cd ../../..
18
+ uv sync --extra frontier-cs
19
+
20
+ # 4. Set your API key
21
+ export OPENAI_API_KEY=...
22
+ ```
23
+
24
+ ## Run
25
+
26
+ Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve`
27
+
28
+
29
+ Single problem:
30
+ ```bash
31
+ cd benchmarks/frontier-cs-eval
32
+ FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \
33
+ -c config.yaml -s [search_algorithm] -i 50
34
+ ```
35
+
36
+ All problems in parallel:
37
+ ```bash
38
+ uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6
39
+ ```
40
+
41
+ ## Evaluate best programs (post-discovery)
42
+
43
+ ```bash
44
+ uv run python run_best_programs_frontiercs.py
45
+ ```
46
+
47
+ ## Analyze results
48
+
49
+ ```bash
50
+ uv run python combine_results.py # merge training/testing scores into CSV
51
+ uv run python analyze_results.py # generate plots and statistics
52
+ ```
53
+
54
+ ## Files
55
+
56
+ | File | Description |
57
+ |------|-------------|
58
+ | `initial_program.cpp` | Seed C++ program |
59
+ | `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge |
60
+ | `config.yaml` | Config with system prompt template |
61
+ | `run_all_frontiercs.py` | Parallelizes evolution across all problems |
62
+ | `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution |
63
+ | `combine_results.py` | Combines training/testing scores into CSV |
64
+ | `analyze_results.py` | Generates score analysis plots and statistics |
65
+
66
+ ## Environment variables
67
+
68
+ | Variable | Default | Description |
69
+ |----------|---------|-------------|
70
+ | `OPENAI_API_KEY` | (required) | API key |
71
+ | `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve |
72
+ | `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs |
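+
+ ## Load-balancing example
+
+ `JUDGE_URLS` accepts a comma-separated list; `evaluator.py` picks one entry at random per evaluation. A minimal sketch (the second port is illustrative):
+
+ ```bash
+ JUDGE_URLS="http://localhost:8081,http://localhost:8082" \
+ FRONTIER_CS_PROBLEM=3 uv run skydiscover-run initial_program.cpp evaluator.py \
+   -c config.yaml -s adaevolve -i 50
+ ```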
benchmarks/frontier-cs-eval/analyze_results.py ADDED
@@ -0,0 +1,105 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ from pathlib import Path
5
+
6
+ # Define paths
7
+ _script_dir = str(Path(__file__).resolve().parent)
8
+ input_csv = str(Path(_script_dir) / "combined_results.csv")
9
+ output_dir = _script_dir
10
+
11
+ # Read the CSV file
12
+ df = pd.read_csv(input_csv)
13
+
14
+ # Calculate average of training and testing scores
15
+ df['average_score'] = (df['training_score'] + df['testing_score']) / 2
16
+
17
+ # Remove rows where either score is None (NaN)
18
+ df_complete = df.dropna(subset=['training_score', 'testing_score'])
19
+
20
+ print(f"\n=== Analysis Results ===")
21
+ print(f"Total problems: {len(df)}")
22
+ print(f"Problems with complete data: {len(df_complete)}")
23
+ print(f"\nTraining Scores:")
24
+ print(f" Mean: {df_complete['training_score'].mean():.4f}")
25
+ print(f" Median: {df_complete['training_score'].median():.4f}")
26
+ print(f" Std Dev: {df_complete['training_score'].std():.4f}")
27
+ print(f" Min: {df_complete['training_score'].min():.4f}")
28
+ print(f" Max: {df_complete['training_score'].max():.4f}")
29
+
30
+ print(f"\nTesting Scores:")
31
+ print(f" Mean: {df_complete['testing_score'].mean():.4f}")
32
+ print(f" Median: {df_complete['testing_score'].median():.4f}")
33
+ print(f" Std Dev: {df_complete['testing_score'].std():.4f}")
34
+ print(f" Min: {df_complete['testing_score'].min():.4f}")
35
+ print(f" Max: {df_complete['testing_score'].max():.4f}")
36
+
37
+ print(f"\nAverage Scores:")
38
+ print(f" Mean: {df_complete['average_score'].mean():.4f}")
39
+ print(f" Median: {df_complete['average_score'].median():.4f}")
40
+ print(f" Std Dev: {df_complete['average_score'].std():.4f}")
41
+
42
+ # Save the updated CSV with averages
43
+ output_csv = Path(output_dir) / "combined_results_with_averages.csv"
44
+ df.to_csv(output_csv, index=False)
45
+ print(f"\nUpdated CSV with averages saved to {output_csv}")
46
+
47
+ # Create visualizations
48
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
49
+
50
+ # 1. Scatter plot: Training vs Testing scores
51
+ ax = axes[0, 0]
52
+ ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50)
53
+ # Add diagonal line for reference (where training == testing)
54
+ lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()),
55
+ max(df_complete['training_score'].max(), df_complete['testing_score'].max())]
56
+ ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing')
57
+ ax.set_xlabel('Training Score')
58
+ ax.set_ylabel('Testing Score')
59
+ ax.set_title('Training vs Testing Scores')
60
+ ax.legend()
61
+ ax.grid(True, alpha=0.3)
62
+
63
+ # 2. Distribution comparison - histograms
64
+ ax = axes[0, 1]
65
+ ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black')
66
+ ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black')
67
+ ax.set_xlabel('Score')
68
+ ax.set_ylabel('Frequency')
69
+ ax.set_title('Distribution of Training vs Testing Scores')
70
+ ax.legend()
71
+ ax.grid(True, alpha=0.3, axis='y')
72
+
73
+ # 3. Box plot comparison
74
+ ax = axes[1, 0]
75
+ box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']]
76
+ bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average'])
77
+ ax.set_ylabel('Score')
78
+ ax.set_title('Score Comparison (Box Plot)')
79
+ ax.grid(True, alpha=0.3, axis='y')
80
+
81
+ # 4. Difference plot: Training - Testing
82
+ ax = axes[1, 1]
83
+ difference = df_complete['training_score'] - df_complete['testing_score']
84
+ ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50)
85
+ ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference')
86
+ ax.set_xlabel('Problem ID')
87
+ ax.set_ylabel('Training Score - Testing Score')
88
+ ax.set_title('Score Difference (Training - Testing)')
89
+ ax.legend()
90
+ ax.grid(True, alpha=0.3)
91
+
92
+ plt.tight_layout()
93
+ plot_path = Path(output_dir) / "results_analysis.png"
94
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
95
+ print(f"Plot saved to {plot_path}")
96
+
97
+ # Additional statistics about differences
98
+ print(f"\nScore Differences (Training - Testing):")
99
+ print(f" Mean Difference: {difference.mean():.4f}")
100
+ print(f" Median Difference: {difference.median():.4f}")
101
+ print(f" Std Dev: {difference.std():.4f}")
102
+ print(f" Problems where training > testing: {(difference > 0).sum()}")
103
+ print(f" Problems where testing > training: {(difference < 0).sum()}")
104
+
105
+ plt.show()
benchmarks/frontier-cs-eval/combine_results.py ADDED
@@ -0,0 +1,66 @@
1
+ import json
2
+ import csv
3
+ import os
4
+ from pathlib import Path
5
+
6
+ # Define paths
7
+ _script_dir = Path(__file__).resolve().parent
8
+ _repo_root = _script_dir.parent.parent
9
+ training_dir = str(_repo_root / "outputs" / "frontier_cs")
10
+ testing_dir = str(_script_dir / "evaluation_results")
11
+ output_csv = str(_script_dir / "combined_results.csv")
12
+
13
+ # Collect all problems
14
+ results = []
15
+
16
+ # Get all problem directories from training data
17
+ training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")])
18
+
19
+ print(f"Found {len(training_problems)} training problems")
20
+
21
+ for problem_dir in training_problems:
22
+ problem_id = problem_dir.replace("problem_", "")
23
+
24
+ # Get training score from best_program_info.json
25
+ training_score = None
26
+ training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json")
27
+
28
+ if os.path.exists(training_info_path):
29
+ try:
30
+ with open(training_info_path, 'r') as f:
31
+ training_data = json.load(f)
32
+ training_score = training_data.get("metrics", {}).get("combined_score")
33
+ except Exception as e:
34
+ print(f"Error reading training data for problem {problem_id}: {e}")
35
+
36
+ # Get testing score from evaluation_results json
37
+ testing_score = None
38
+ testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json")
39
+
40
+ if os.path.exists(testing_json_path):
41
+ try:
42
+ with open(testing_json_path, 'r') as f:
43
+ testing_data = json.load(f)
44
+ testing_score = testing_data.get("combined_score")
45
+ except Exception as e:
46
+ print(f"Error reading testing data for problem {problem_id}: {e}")
47
+
48
+ results.append({
49
+ "problem_id": problem_id,
50
+ "training_score": training_score,
51
+ "testing_score": testing_score
52
+ })
53
+
54
+ # Write to CSV
55
+ with open(output_csv, 'w', newline='') as csvfile:
56
+ fieldnames = ["problem_id", "training_score", "testing_score"]
57
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
58
+
59
+ writer.writeheader()
60
+ writer.writerows(results)
61
+
62
+ print(f"\nResults written to {output_csv}")
63
+ print(f"Total problems: {len(results)}")
64
+ print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}")
65
+ print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}")
66
+ print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}")
benchmarks/frontier-cs-eval/config.yaml ADDED
@@ -0,0 +1,57 @@
1
+ # Frontier-CS Benchmark
2
+ # Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s <strategy> -i 50
3
+
4
+ max_iterations: 100
5
+ checkpoint_interval: 10
6
+ log_level: INFO
7
+
8
+ llm:
9
+ models:
10
+ - name: "gpt-5"
11
+ weight: 1.0
12
+ api_base: https://api.openai.com/v1
13
+ temperature: 0.7
14
+ # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
15
+ max_tokens: 32000
16
+ timeout: 600
17
+ # To use Gemini: override with --model gemini-3-pro-preview
18
+
19
+ prompt:
20
+ system_message: |
21
+ You are an expert competitive programmer specializing in algorithmic optimization.
22
+
23
+ PROBLEM STATEMENT:
24
+ {problem_statement}
25
+
26
+ CONSTRAINTS:
27
+ {problem_constraints}
28
+
29
+ OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better).
30
+ Your solution must be valid C++ code that compiles and runs correctly.
31
+
32
+ KEY STRATEGIES:
33
+ - Analyze the problem structure carefully before coding
34
+ - Consider time and space complexity constraints
35
+ - Use efficient data structures (vectors, maps, sets, priority queues)
36
+ - Implement clean, well-structured code
37
+ - Handle edge cases properly
38
+ - Optimize hot loops and critical sections
39
+
40
+ COMMON TECHNIQUES:
41
+ - Dynamic programming for optimization problems
42
+ - Greedy algorithms with proper ordering
43
+ - Graph algorithms (BFS, DFS, shortest paths)
44
+ - Binary search for monotonic functions
45
+ - Divide and conquer approaches
46
+ - Heuristic search (simulated annealing, genetic algorithms, local search)
47
+
48
+ OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout.
49
+
50
+ evaluator:
51
+ timeout: 300
52
+ max_retries: 3
53
+ cascade_evaluation: false
54
+
55
+ diff_based_generation: true
56
+ max_solution_length: 50000
57
+ random_seed: 42
benchmarks/frontier-cs-eval/evaluator.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ Evaluator for Frontier-CS algorithmic problems.
3
+
4
+ This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
5
+ against Frontier-CS benchmark problems using the local judge server.
6
+ """
7
+
8
+ import traceback
9
+ from pathlib import Path
10
+ import logging
11
+ import sys
12
+ import os
13
+ import random
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Support multiple judge servers for load balancing
18
+ DEFAULT_JUDGE_URL = "http://localhost:8081"
19
+ JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
20
+ JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()]
21
+
22
+ def get_judge_url() -> str:
23
+ """Get a judge URL using random selection for load balancing."""
24
+ return random.choice(JUDGE_URLS)
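+ # e.g. JUDGE_URLS="http://localhost:8081,http://localhost:8082" (second port
+ # illustrative) spreads evaluations uniformly at random across both judges.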
25
+
26
+ # Add Frontier-CS to path
27
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
28
+ if str(frontier_cs_path) not in sys.path:
29
+ sys.path.insert(0, str(frontier_cs_path))
30
+
31
+ try:
32
+ from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
33
+ from frontier_cs.runner.base import EvaluationStatus
34
+ except ImportError as e:
35
+ logger.error(f"Failed to import Frontier-CS: {e}")
36
+ logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
37
+ raise
38
+
39
+ def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
40
+ """
41
+ Evaluate a C++ solution for a Frontier-CS algorithmic problem.
42
+
43
+ Args:
44
+ program_path: Path to the C++ solution file
45
+ problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.)
46
+ If None, will be read from FRONTIER_CS_PROBLEM env var or config
47
+
48
+ Returns:
49
+ dict with evaluation results:
50
+ - combined_score: The score from the judge (higher is better)
51
+ - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
52
+ - status: Evaluation status string
53
+ - message: Any error or status messages
54
+ - problem_id: The problem ID
55
+ - program_path: Path to the evaluated program
56
+ - score_unbounded: Unbounded score if available
57
+ - metadata: Additional evaluation metadata
58
+ """
59
+ # Get problem_id from parameter, environment, or kwargs
60
+ if problem_id is None:
62
+ problem_id = os.environ.get('FRONTIER_CS_PROBLEM')
63
+ if problem_id is None:
64
+ problem_id = kwargs.get('frontier_cs_problem', '0')
65
+
66
+ logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")
67
+
68
+ try:
69
+ # Initialize evaluator with judge server (load balanced if multiple configured)
70
+ judge_url = get_judge_url()
71
+ logger.info(f"Using judge server: {judge_url}")
72
+ evaluator = FrontierCSEvaluator(
73
+ backend="docker",
74
+ judge_url=judge_url,
75
+ register_cleanup=False,
76
+ )
77
+
78
+ # Read the solution code
79
+ solution_path = Path(program_path)
80
+ if not solution_path.exists():
81
+ error_msg = f"Solution file not found: {program_path}"
82
+ logger.error(error_msg)
83
+ return {
84
+ "combined_score": 0.0,
85
+ "runs_successfully": 0.0,
86
+ "status": "error",
87
+ "message": error_msg,
88
+ "problem_id": problem_id,
89
+ "program_path": program_path,
90
+ }
91
+
92
+ # Extract code and remove any EVOLVE-BLOCK markers
93
+ code = solution_path.read_text().replace(
94
+ "// EVOLVE-BLOCK-START", ""
95
+ ).replace(
96
+ "// EVOLVE-BLOCK-END", ""
97
+ ).strip()
98
+
99
+ logger.info(f"Code extracted from {program_path}")
100
+
101
+ # Evaluate the solution
102
+ result = evaluator.evaluate(
103
+ track="algorithmic",
104
+ problem_id=problem_id,
105
+ code=code,
106
+ backend="docker",
107
+ )
108
+
109
+ logger.info(f"Evaluation completed with status: {result.status}")
110
+
111
+ # Process result
112
+ if result.status == EvaluationStatus.SUCCESS:
113
+ print(result)
114
+ score = result.score
115
+ # Use unbounded score for optimization (allows >100 if beating reference)
116
+ score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score
117
+ print(f"score={score}, score_unbounded={score_unbounded}")
118
+
119
+ # Extract only essential metadata (exclude large test case outputs)
120
+ essential_metadata = {}
121
+ if result.metadata:
122
+ essential_metadata = {
123
+ "status": result.metadata.get("status"),
124
+ "passed": result.metadata.get("passed"),
125
+ "result": result.metadata.get("result"),
126
+ "score": result.metadata.get("score"),
127
+ "scoreUnbounded": result.metadata.get("scoreUnbounded"),
128
+ }
129
+
130
+ return {
131
+ "combined_score": float(score), # Ensure it's a float
132
+ "score_unbounded": score_unbounded,
133
+ "runs_successfully": 1.0,
134
+ "status": "success",
135
+ "message": result.message or "Evaluation successful",
136
+ "problem_id": problem_id,
137
+ "program_path": program_path,
138
+ "duration_seconds": result.duration_seconds,
139
+ "metadata": essential_metadata,
140
+ }
141
+ elif result.status == EvaluationStatus.TIMEOUT:
142
+ logger.warning(f"Evaluation timed out: {result.message}")
143
+ return {
144
+ "combined_score": 0.0,
145
+ "runs_successfully": 0.0,
146
+ "status": "timeout",
147
+ "message": result.message or "Evaluation timed out",
148
+ "problem_id": problem_id,
149
+ "program_path": program_path,
150
+ }
151
+ else: # ERROR status
152
+ logger.error(f"Evaluation error: {result.message}")
153
+ return {
154
+ "combined_score": 0.0,
155
+ "runs_successfully": 0.0,
156
+ "status": "error",
157
+ "message": result.message or "Evaluation failed",
158
+ "problem_id": problem_id,
159
+ "program_path": program_path,
160
+ "logs": result.logs,
161
+ }
162
+
163
+ except Exception as e:
164
+ logger.error(f"Evaluation failed completely: {str(e)}")
165
+ logger.error(traceback.format_exc())
166
+ return {
167
+ "combined_score": 0.0,
168
+ "runs_successfully": 0.0,
169
+ "status": "error",
170
+ "message": str(e),
171
+ "problem_id": problem_id,
172
+ "program_path": program_path,
173
+ "error": str(e),
174
+ }
benchmarks/frontier-cs-eval/initial_program.cpp ADDED
@@ -0,0 +1,6 @@
1
+ #include <bits/stdc++.h>
2
+ using namespace std;
3
+ int main(){
4
+ std::cout << "Hello, World!" << std::endl;
5
+ return 0;
6
+ }
benchmarks/frontier-cs-eval/run_all_frontiercs.py ADDED
@@ -0,0 +1,70 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import subprocess
5
+ from pathlib import Path
6
+ from concurrent.futures import ProcessPoolExecutor
7
+
8
+ from dotenv import load_dotenv
9
+ load_dotenv()
10
+
11
+ SCRIPT_DIR = Path(__file__).resolve().parent
12
+
13
+ frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src"
14
+ if str(frontier_cs_path) not in sys.path:
15
+ sys.path.insert(0, str(frontier_cs_path))
16
+
17
+ from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
18
+
19
+
20
+ def run_single_problem(args):
21
+ p_id, search, iterations, env = args
22
+ print(f"\n[START] Problem ID: {p_id}")
23
+ command = [
24
+ "uv", "run", "skydiscover-run",
25
+ "initial_program.cpp", "evaluator.py",
26
+ "-c", "config.yaml",
27
+ "-s", search,
28
+ "-i", str(iterations),
29
+ "-o", f"outputs/frontier_cs/problem_{p_id}",
30
+ ]
31
+ env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)}
32
+ try:
33
+ subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR))
34
+ return f"✅ Problem {p_id} completed."
35
+ except subprocess.CalledProcessError as e:
36
+ return f"❌ Problem {p_id} failed: {e}"
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems")
41
+ parser.add_argument("--search", "-s", default="adaevolve",
42
+ help="Search algorithm (default: adaevolve)")
43
+ parser.add_argument("--iterations", "-i", type=int, default=50,
44
+ help="Iterations per problem (default: 50)")
45
+ parser.add_argument("--workers", "-w", type=int, default=6,
46
+ help="Parallel workers (default: 6)")
47
+ args = parser.parse_args()
48
+
49
+ runner = AlgorithmicLocalRunner()
50
+ problems_data = runner.list_problems()
51
+ problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int)
52
+
53
+ print(f"Running {len(problem_ids)} problems with {args.workers} workers "
54
+ f"(search={args.search}, iterations={args.iterations})...")
55
+
56
+ env = os.environ.copy()
57
+ task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids]
58
+
59
+ with ProcessPoolExecutor(max_workers=args.workers) as executor:
60
+ results = list(executor.map(run_single_problem, task_args))
61
+
62
+ print("\n" + "=" * 30)
63
+ print("ALL RUNS COMPLETE")
64
+ print("=" * 30)
65
+ for result in results:
66
+ print(result)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py ADDED
@@ -0,0 +1,404 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import logging
5
+ import threading
6
+ from pathlib import Path
7
+ from typing import Dict, List, Tuple
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ # Set up logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Add Frontier-CS to path
18
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
19
+ if str(frontier_cs_path) not in sys.path:
20
+ sys.path.insert(0, str(frontier_cs_path))
21
+
22
+ try:
23
+ from frontier_cs.evaluator import FrontierCSEvaluator
24
+ from frontier_cs.runner.base import EvaluationStatus
25
+ except ImportError as e:
26
+ logger.error(f"Failed to import Frontier-CS: {e}")
27
+ logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
28
+ sys.exit(1)
29
+
30
+
31
+ class BestProgramEvaluator:
32
+ """Evaluates all best_program.cpp files in the outputs directory."""
33
+
34
+ def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8):
35
+ """
36
+ Initialize the evaluator.
37
+
38
+ Args:
39
+ outputs_dir: Path to the outputs directory containing problem folders
40
+ judge_url: URL of the judge server
41
+ num_workers: Number of parallel workers for evaluation
42
+ """
43
+ self.outputs_dir = Path(outputs_dir)
44
+ self.judge_url = judge_url
45
+ self.num_workers = num_workers
46
+
47
+ # Use thread-local storage for evaluator instances (avoid race condition)
48
+ self._evaluator_local = threading.local()
49
+
50
+ self.results = []
51
+
52
+ # Create results directory in the script's directory
53
+ self.results_dir = Path(__file__).resolve().parent / "evaluation_results"
54
+ self.results_dir.mkdir(exist_ok=True)
55
+ logger.info(f"Results will be saved to {self.results_dir}")
56
+ logger.info(f"Using {self.num_workers} parallel workers with thread-local evaluators")
57
+
58
+ def _get_evaluator(self) -> 'FrontierCSEvaluator':
59
+ """
60
+ Get the evaluator for the current thread.
61
+ Creates a new instance if this thread hasn't created one yet.
62
+ This avoids race conditions from sharing a single evaluator across threads.
63
+ """
64
+ if not hasattr(self._evaluator_local, 'evaluator'):
65
+ self._evaluator_local.evaluator = FrontierCSEvaluator(
66
+ backend="docker",
67
+ judge_url=self.judge_url,
68
+ )
69
+ logger.debug(f"Created new evaluator for thread {threading.current_thread().name}")
70
+ return self._evaluator_local.evaluator
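+ # Note: threading.local() gives each worker thread its own FrontierCSEvaluator,
+ # so concurrent calls to evaluate_program() never share per-instance state
+ # (no explicit locking needed).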
71
+
72
+ def find_best_programs(self) -> Dict[str, Path]:
73
+ """
74
+ Find all best_program.cpp files in the outputs directory.
75
+
76
+ Returns:
77
+ Dict mapping problem_id to best_program.cpp path
78
+ """
79
+ best_programs = {}
80
+
81
+ # Look for frontier_cs subdirectory
82
+ frontier_cs_dir = self.outputs_dir / "frontier_cs"
83
+ if not frontier_cs_dir.exists():
84
+ logger.error(f"frontier_cs directory not found at {frontier_cs_dir}")
85
+ return best_programs
86
+
87
+ # Iterate through problem directories
88
+ for problem_dir in sorted(frontier_cs_dir.iterdir()):
89
+ if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"):
90
+ continue
91
+
92
+ # Extract problem ID
93
+ problem_id = problem_dir.name.replace("problem_", "")
94
+
95
+ # Look for best_program.cpp
96
+ best_program_path = problem_dir / "best" / "best_program.cpp"
97
+ if best_program_path.exists():
98
+ best_programs[problem_id] = best_program_path
99
+ logger.info(f"Found best_program.cpp for problem {problem_id}")
100
+ else:
101
+ logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}")
102
+
103
+ return best_programs
104
+
105
+ def evaluate_program(self, problem_id: str, program_path: Path) -> Dict:
106
+ """
107
+ Evaluate a single best_program.cpp file.
108
+
109
+ Args:
110
+ problem_id: The Frontier-CS problem ID
111
+ program_path: Path to the best_program.cpp file
112
+
113
+ Returns:
114
+ Dictionary with evaluation results
115
+ """
116
+ logger.info(f"Evaluating problem {problem_id}: {program_path}")
117
+
118
+ try:
119
+ # Read the solution code
120
+ if not program_path.exists():
121
+ error_msg = f"Solution file not found: {program_path}"
122
+ logger.error(error_msg)
123
+ return {
124
+ "problem_id": problem_id,
125
+ "program_path": str(program_path),
126
+ "combined_score": 0.0,
127
+ "runs_successfully": 0.0,
128
+ "status": "error",
129
+ "message": error_msg,
130
+ }
131
+
132
+ # Read the code
133
+             code = program_path.read_text().replace(
+                 "// EVOLVE-BLOCK-START", ""
+             ).replace(
+                 "// EVOLVE-BLOCK-END", ""
+             ).strip()
+
+             logger.info(f"Code extracted from {program_path}, length: {len(code)} characters")
+
+             # Evaluate the solution (use thread-local evaluator)
+             evaluator = self._get_evaluator()
+             result = evaluator.evaluate(
+                 track="algorithmic",
+                 problem_id=problem_id,
+                 code=code,
+                 backend="docker",
+             )
+
+             logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}")
+
+             # Log the result object and its properties
+             logger.info(f"Judger output for problem {problem_id}:")
+             logger.info(f"  Status: {result.status}")
+             logger.info(f"  Message: {result.message}")
+             if hasattr(result, 'score'):
+                 logger.info(f"  Score: {result.score}")
+             if hasattr(result, 'duration_seconds'):
+                 logger.info(f"  Duration: {result.duration_seconds}s")
+             if hasattr(result, 'metadata'):
+                 logger.info(f"  Metadata: {result.metadata}")
+             logger.info(f"  Full result object: {result}")
+
+             # Process result
+             if result.status == EvaluationStatus.SUCCESS:
+                 score = result.score
+                 logger.info(f"Problem {problem_id}: Score = {score}")
+
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": float(score),
+                     "runs_successfully": 1.0,
+                     "status": "success",
+                     "message": result.message or "Evaluation successful",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                     "metadata": result.metadata if hasattr(result, 'metadata') else None,
+                 }
+             elif result.status == EvaluationStatus.TIMEOUT:
+                 logger.warning(f"Problem {problem_id}: Evaluation timed out")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "timeout",
+                     "message": f"Evaluation timed out: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             elif result.status == EvaluationStatus.COMPILATION_ERROR:
+                 logger.warning(f"Problem {problem_id}: Compilation error")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "compilation_error",
+                     "message": f"Compilation error: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             else:
+                 logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": str(result.status),
+                     "message": f"Evaluation failed: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+
+         except Exception as e:
+             logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}")
+             logger.error(f"Exception type: {type(e).__name__}")
+             import traceback
+             logger.error(traceback.format_exc())
+
+             return {
+                 "problem_id": problem_id,
+                 "program_path": str(program_path),
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "exception",
+                 "message": str(e),
+             }
+
+     def run_all_evaluations(self) -> List[Dict]:
+         """
+         Run evaluations for all best_program.cpp files sequentially (one at a time).
+
+         Returns:
+             List of evaluation results
+         """
+         logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}")
+
+         best_programs = self.find_best_programs()
+         logger.info(f"Found {len(best_programs)} best_program.cpp files")
+
+         if not best_programs:
+             logger.warning("No best_program.cpp files found!")
+             return []
+
+         # Sort problems by ID for consistent ordering
+         sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0]))
+
+         # Evaluate each program sequentially (no parallelization)
+         results = []
+         total = len(sorted_problems)
+         for idx, (problem_id, program_path) in enumerate(sorted_problems, 1):
+             logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})")
+             try:
+                 result = self.evaluate_program(problem_id, program_path)
+
+                 # CRITICAL: Ensure problem_id matches
+                 if result.get("problem_id") != problem_id:
+                     logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}")
+                     result["problem_id"] = problem_id  # Force correct problem_id
+
+                 results.append(result)
+                 self.results.append(result)
+
+                 logger.info(f"[SAVE] Saving problem {problem_id} result to file")
+                 # Save result immediately after evaluation
+                 self.save_problem_result(result)
+
+             except Exception as e:
+                 logger.error(f"Exception evaluating problem {problem_id}: {str(e)}")
+                 import traceback
+                 logger.error(traceback.format_exc())
+
+                 error_result = {
+                     "problem_id": problem_id,
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "exception",
+                     "message": str(e),
+                 }
+                 results.append(error_result)
+                 self.results.append(error_result)
+                 self.save_problem_result(error_result)
+
+         return results
+
+     def save_results(self, output_file: str = "evaluation_results.json"):
+         """
+         Save evaluation results to a JSON file.
+
+         Args:
+             output_file: Path to save the results
+         """
+         output_path = Path(output_file)
+         with open(output_path, 'w') as f:
+             json.dump(self.results, f, indent=2)
+         logger.info(f"Results saved to {output_path}")
+
+     def save_problem_result(self, result: Dict):
+         """
+         Save individual problem result to a separate file.
+
+         Args:
+             result: The evaluation result for a single problem
+         """
+         problem_id = result.get("problem_id", "unknown")
+         result_file = self.results_dir / f"problem_{problem_id}.json"
+
+         with open(result_file, 'w') as f:
+             json.dump(result, f, indent=2)
+         logger.info(f"Problem {problem_id} result saved to {result_file}")
+
+     def print_summary(self):
+         """Print a summary of the evaluation results."""
+         if not self.results:
+             logger.info("No results to summarize")
+             return
+
+         logger.info("\n" + "="*80)
+         logger.info("EVALUATION SUMMARY")
+         logger.info("="*80)
+
+         successful = [r for r in self.results if r.get("status") == "success"]
+         timeout = [r for r in self.results if r.get("status") == "timeout"]
+         compilation_error = [r for r in self.results if r.get("status") == "compilation_error"]
+         other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]]
+
+         logger.info(f"Total problems evaluated: {len(self.results)}")
+         logger.info(f"Successful: {len(successful)}")
+         logger.info(f"Timeouts: {len(timeout)}")
+         logger.info(f"Compilation errors: {len(compilation_error)}")
+         logger.info(f"Other errors: {len(other_error)}")
+
+         if successful:
+             scores = [r["combined_score"] for r in successful]
+             logger.info(f"\nSuccessful evaluation scores:")
+             logger.info(f"  Average score: {sum(scores) / len(scores):.2f}")
+             logger.info(f"  Min score: {min(scores):.2f}")
+             logger.info(f"  Max score: {max(scores):.2f}")
+
+             logger.info(f"\nTop 5 problems by score:")
+             top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5]
+             for i, result in enumerate(top_5, 1):
+                 logger.info(f"  {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}")
+
+         logger.info("="*80 + "\n")
+
+
+ def main():
+     """Main entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Evaluate all best_program.cpp files in the outputs directory"
+     )
+
+     # Default outputs directory is two levels up from this script
+     default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs"
+
+     parser.add_argument(
+         "--outputs-dir",
+         type=str,
+         default=str(default_outputs_dir),
+         help="Path to the outputs directory (default: ../../outputs from script location)"
+     )
+     parser.add_argument(
+         "--judge-url",
+         type=str,
+         default="http://localhost:8081",
+         help="URL of the judge server (default: http://localhost:8081)"
+     )
+     parser.add_argument(
+         "--output-file",
+         type=str,
+         default="evaluation_results.json",
+         help="Path to save the evaluation results (default: evaluation_results.json)"
+     )
+     parser.add_argument(
+         "--workers",
+         type=int,
+         default=8,
+ help="Number of parallel workers for evaluation (default: 8)"
+     )
+
+     args = parser.parse_args()
+
+     # Run evaluations
+     evaluator = BestProgramEvaluator(
+         outputs_dir=args.outputs_dir,
+         judge_url=args.judge_url,
+         num_workers=args.workers
+     )
+
+     results = evaluator.run_all_evaluations()
+     evaluator.save_results(args.output_file)
+     evaluator.print_summary()
+
+     logger.info(f"Evaluation complete. Results saved to {args.output_file}")
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/gpu_mode/mla_decode/config.yaml ADDED
@@ -0,0 +1,355 @@
+ # GPU Mode: MLA Decode (Multi-Head Latent Attention) Triton Kernel
+
+ max_iterations: 100
+ checkpoint_interval: 1
+ log_level: "INFO"
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+       api_base: https://api.openai.com/v1
+       temperature: 1.0
+       # top_p: 0.95  # omitted by default; some providers (e.g., Anthropic) reject requests that set both temperature and top_p
+       max_tokens: 32000
+       timeout: 600
+
+ prompt:
+   system_message: |
+     You are an expert Triton engineer tasked with translating PyTorch code into highly optimized Triton kernel code.
+
+     Below is a PyTorch implementation of the multi-head latent attention (MLA) module. You will want to implement a Triton kernel for the operations in the forward call:
+
+     ```python
+     import math
+     from dataclasses import dataclass
+     import torch
+     from torch import nn
+     import torch.nn.functional as F
+
+     class RoPE(nn.Module):
+         def __init__(self, d_model: int):
+             super().__init__()
+             self.d_model = d_model
+             theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+             self.register_buffer("theta", theta)
+
+         def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+             x1, x2 = x.chunk(2, dim=-1)
+             return torch.cat((-x2, x1), dim=-1)
+
+         def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+             seq_len = x.size(-2)
+             d_model = x.size(-1)
+             assert d_model == self.d_model
+             seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+             idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+             idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+             cos = idx_theta2.cos().to(torch.bfloat16)
+             sin = idx_theta2.sin().to(torch.bfloat16)
+             return x * cos + self.rotate_half(x) * sin
+
+     class KVCache(nn.Module):
+         def __init__(self, kv_cache_shape: tuple) -> None:
+             super().__init__()
+             self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16, device='cuda'))
+             self.seq_len = 0
+             self.zero()
+
+         def zero(self) -> None:
+             self.data.zero_()
+
+         def get_data(self) -> torch.Tensor:
+             return self.data
+
+         def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+             assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+             self.data = self.data.to(c_kv.dtype)
+             self.data[
+                 :, self.seq_len : self.seq_len + c_kv.size(1), :
+             ] = c_kv
+             self.seq_len += c_kv.size(1)
+
+             return self.data[:, :self.seq_len], self.seq_len
+
+     @dataclass
+     class Config:
+         batch_size: int
+         dim: int
+         n_heads: int
+         q_lora_rank: int
+         kv_lora_rank: int
+         qk_nope_head_dim: int
+         qk_rope_head_dim: int
+         v_head_dim: int
+         seq_len: int
+         max_seq_len: int
+         kv_cache_shape: tuple
+         Q_proj_down_weight: torch.Tensor
+         Q_proj_up_weight: torch.Tensor
+         KV_proj_down_weight: torch.Tensor
+         KV_proj_up_weight: torch.Tensor
+         wo_weight: torch.Tensor
+
+     class MLA(nn.Module):
+         def __init__(self, config: Config):
+             super().__init__()
+             self.dim = config.dim
+             self.n_heads = config.n_heads
+             self.q_lora_rank = config.q_lora_rank
+             self.kv_lora_rank = config.kv_lora_rank
+             self.nope_head_dim = config.qk_nope_head_dim
+             self.rope_head_dim = config.qk_rope_head_dim
+             self.v_head_dim = config.v_head_dim
+             # Down-projection matrices
+             self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, bias=False, dtype=torch.bfloat16)
+             self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, bias=False, dtype=torch.bfloat16)
+
+             # Up-projection and rope projection matrices
+             self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+             self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+
+             # RoPE on half embeddings
+             self.q_rope = RoPE(self.rope_head_dim)
+             self.k_rope = RoPE(self.rope_head_dim)
+
+             # Output projection
+             self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+             self.eps = 1e-6
+
+         def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+             # seq_len = 1 always here
+             batch_size, seq_len, model_dim = x.size()
+
+             ## Step 1: Handle down-projection + KV cache ##
+
+             q_lora = self.Q_proj_down(x)
+             kv_lora = self.KV_proj_down(x)
+             kv_lora, kv_len = kv_cache(kv_lora)
+             query_pos = kv_len - 1
+
+             ## Step 2: Up-project and prepare NoPE + RoPE ##
+
+             # Handle queries Q first
+             q_nope_and_rope = self.Q_proj_up(q_lora).view(
+                 batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+             q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+             # Handle keys and values K/V. V does not need RoPE
+             kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+             kv_nope = self.KV_proj_up(kv_nope).view(
+                 batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+             k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+             ## Step 3: Handle RoPE Stream ##
+
+             # Compute RoPE for queries and combine with no-RoPE part
+             q_rope = q_rope.permute(0, 2, 1, 3)  # bs x n_heads x seq_len x rope_head_dim
+             q_rope = self.q_rope(q_rope, start_pos=query_pos)
+
+             q_nope = q_nope.permute(0, 2, 1, 3)  # bs x n_heads x seq_len x nope_head_dim
+             q = torch.concat([q_nope, q_rope], dim=-1)
+
+             # Compute RoPE for keys and combine with no-RoPE part
+             k_rope = k_rope[:, None, :, :]
+             k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+             k_nope = k_nope.permute(0, 2, 1, 3)  # bs x n_heads x kv_len x nope_head_dim
+             k = torch.concat([k_nope, k_rope], dim=-1)
+
+             ## Step 4: Compute Multi-head Attention ##
+
+             v = v.permute(0, 2, 1, 3)  # bs x n_heads x kv_len x v_head_dim
+             scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+             attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+             y = torch.matmul(attn, v).view(batch_size, 1, -1)
+             y = self.wo(y)
+
+             return y, kv_cache.get_data()
+     ```
+
+     Your function should be defined as 'custom_kernel' (skeleton provided below)
+
+     ```python
+     ### DO NOT CHANGE THIS IMPORT STATEMENTS BLOCK ###
+     import os
+     import math
+     from typing import Tuple
+     import torch
+     import torch.nn.functional as F
+     import triton
+     from reference import KVCache, Config  # Definitions of the KVCache and Config classes are shown above. You must import them this way; do not rewrite them yourself.
+     ### END OF IMPORT STATEMENTS BLOCK ###
+
+     ### Import other packages here if needed
+
+     def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, KVCache]:
+         config, x, kv_cache = data
+
+         bs = config.batch_size
+         sl = config.seq_len
+         pl = kv_cache.seq_len
+         msl = config.max_seq_len
+         nh = config.n_heads
+         d = config.dim
+         dq = config.q_lora_rank
+         dkv = config.kv_lora_rank
+         dnope = config.qk_nope_head_dim
+         drope = config.qk_rope_head_dim
+         dv = config.v_head_dim
+
+         wDQ = config.Q_proj_down_weight
+         wDKV = config.KV_proj_down_weight
+         wUQ = config.Q_proj_up_weight
+         wUKV = config.KV_proj_up_weight
+         wO = config.wo_weight
+
+         # Perform MLA operations to process data into output and updated kv_cache
+
+         return output, kv_cache.data
+     ```
+
+     with the following signature:
+
+     Input:
+     - `data`: Tuple of (config: Config, x: torch.Tensor, kv_cache: KVCache)
+       - config: An instance of class `Config` containing model configurations and weights
+       - x: Input tensor of shape [batch_size, seq_len, dim]
+       - kv_cache: An instance of KVCache class for caching the keys and values
+
+     Output:
+     - output: Output tensor [batch_size, seq_len, dim]
+     - kv_cache.data: The data field of the updated `KVCache` instance with the new keys and values added
+
+     To warm you up on writing optimized Triton code, here is example code that is correct for your task but very unoptimized. Your code should be as optimized as possible while remaining correct.
+
+     ```python
+     import os
+     import math
+     from typing import Tuple
+     import torch
+     import torch.nn.functional as F
+     import triton
+     import triton.language as tl
+     from reference import KVCache, Config
+
+     @triton.jit
+     def rope_swap_halves_kernel(
+         x_ptr,
+         cos_ptr, sin_ptr,
+         B: tl.constexpr,
+         T: tl.constexpr,
+         D: tl.constexpr,
+         stride_xb, stride_xt, stride_xd,
+         stride_cos_t, stride_cos_d,
+         stride_sin_t, stride_sin_d,
+         BLOCK_HALF: tl.constexpr,
+     ):
+         pid = tl.program_id(0)
+         bt = pid
+         b = bt // T
+         t = bt - b * T
+         half = D // 2
+         off = tl.arange(0, BLOCK_HALF)
+         mask = off < half
+         x_base = x_ptr + b * stride_xb + t * stride_xt
+         x0_ptr = x_base + off * stride_xd
+         x1_ptr = x_base + (half + off) * stride_xd
+         cos_base = cos_ptr + t * stride_cos_t
+         sin_base = sin_ptr + t * stride_sin_t
+         c_ptr = cos_base + off * stride_cos_d
+         s_ptr = sin_base + off * stride_sin_d
+         x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32)
+         x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32)
+         c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32)
+         s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32)
+         out0 = x0 * c - x1 * s
+         out1 = x1 * c + x0 * s
+         tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask)
+         tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask)
+
+     # ... (see initial_program.py for full working baseline)
+     ```
+
+     Below are the different configs that your kernel will be tested on:
+
+     Common configs:
+     - {"batch_size": 128, "seq_len": 1, "kv_lora_rank": 512, "qk_rope_head_dim": 64, "v_head_dim": 128, "n_heads": 128, "dim": 7168, "q_lora_rank": 1536, "max_seq_len": 8192}
+
+     For correctness check:
+     - {"prefill": 128}
+     - {"prefill": 512}
+     - {"prefill": 1024}
+     - {"prefill": 2048}
+
+     For performance benchmark (optimize runtime for these):
+     - {"prefill": 6144}
+
+     Rules:
+     - The tensor arguments passed in will already be on your CUDA device.
+     - The weights for all parameters in the MLA will be given as input.
+     - All weights and data will be in `torch.bfloat16` format.
+     - Define all of your code in one final ```python ``` block.
+     - The entrypoint to your code must be named 'custom_kernel'.
+     - You will be using Triton 3.4.0 and your kernels will be run on an NVIDIA H200 GPU.
+     - Consider optimizing multiple operations with Triton, not just softmax (e.g., RoPE, attention).
+     - You are allowed to use torch.compile().
+
+     Important rules in Triton 3.4.0:
+     - `tl.load` does not have an argument called `dtype`. Never use it like `tl.load(..., dtype=...)`.
+     - Triton dtypes are not callable, so never use them like `tl.float16(1.0)`, `tl.float32(0.0)`.
+     - `tl.arange(start, end)`:
+       - range length (end - start) must be power-of-2
+       - start, end must be of type `tl.constexpr`
+     - `tl.range(start, end, step, num_stages)`:
+       - keep loop index type stable, don't reassign it
+       - start, end, step do not have to be `tl.constexpr` but must stay scalar integer types
+       - num_stages must be `tl.constexpr`
+     - Do not write something like x[0] or offs[0] inside a Triton kernel. Triton tensors are SIMD vectors; scalar indexing like [0] is not generally supported.
+
+     Here's a simple example that correctly follows these rules:
+
+     ```python
+     import torch
+     import triton
+     import triton.language as tl
+
+     @triton.jit
+     def kernel_right(
+         x_ptr, y_ptr, out_ptr,
+         n_elements: tl.constexpr,
+         BLOCK: tl.constexpr,
+         ROW_STEP: tl.constexpr,
+         NUM_STAGES: tl.constexpr,
+     ):
+         pid = tl.program_id(axis=0)
+         offs = pid * BLOCK + tl.arange(0, BLOCK)
+         mask = offs < n_elements
+         x = tl.load(x_ptr + offs, mask=mask, other=0.0)
+         y = tl.load(y_ptr + offs, mask=mask, other=0.0)
+         one_f32 = tl.full([], 1.0, tl.float32)
+         acc = tl.zeros((BLOCK,), dtype=tl.float32)
+         acc = tl.cast(x, tl.float32) + tl.cast(y, tl.float32) + one_f32
+         base = tl.full([], pid * BLOCK, tl.int32)
+         x0 = tl.load(x_ptr + base, mask=(base < n_elements), other=0.0)
+         x0_vec = tl.full((BLOCK,), x0, tl.float32)
+         out_vec = acc + x0_vec
+         n_rows = tl.full([], 4, tl.int32)
+         extra = tl.zeros((BLOCK,), dtype=tl.float32)
+         for r in tl.range(0, n_rows, ROW_STEP, num_stages=NUM_STAGES):
+             shift = r * tl.full([], 1, tl.int32)
+             offs_r = offs + shift
+             xr = tl.load(x_ptr + offs_r, mask=(offs_r < n_elements), other=0.0)
+             extra += tl.cast(xr, tl.float32)
+         out_vec = out_vec + extra
+         tl.store(out_ptr + offs, tl.cast(out_vec, tl.float16), mask=mask)
+     ```
+ evaluator:
+   timeout: 600
+   max_retries: 3
+   cascade_evaluation: true
+   cascade_thresholds: [0.4, 0.3]
+
+ diff_based_generation: true
+ max_solution_length: 60000
+ random_seed: 42
benchmarks/gpu_mode/mla_decode/initial_program.py ADDED
@@ -0,0 +1,245 @@
+ # EVOLVE-BLOCK-START
+ """
+ Initial MLA Decode submission — optimised baseline with Triton softmax and RoPE kernels.
+ """
+
+ import os
+ import math
+ from typing import Tuple
+ import torch
+ import torch.nn.functional as F
+ import triton
+ import triton.language as tl
+ from reference import KVCache, Config
+
+
+ @triton.jit
+ def rope_swap_halves_kernel(
+     x_ptr,
+     cos_ptr, sin_ptr,
+     B: tl.constexpr,
+     T: tl.constexpr,
+     D: tl.constexpr,
+     stride_xb, stride_xt, stride_xd,
+     stride_cos_t, stride_cos_d,
+     stride_sin_t, stride_sin_d,
+     BLOCK_HALF: tl.constexpr,
+ ):
+     pid = tl.program_id(0)
+     bt = pid
+     b = bt // T
+     t = bt - b * T
+
+     half = D // 2
+
+     off = tl.arange(0, BLOCK_HALF)
+     mask = off < half
+
+     x_base = x_ptr + b * stride_xb + t * stride_xt
+     x0_ptr = x_base + off * stride_xd
+     x1_ptr = x_base + (half + off) * stride_xd
+
+     cos_base = cos_ptr + t * stride_cos_t
+     sin_base = sin_ptr + t * stride_sin_t
+
+     c_ptr = cos_base + off * stride_cos_d
+     s_ptr = sin_base + off * stride_sin_d
+
+     x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32)
+     x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32)
+     c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32)
+     s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32)
+
+     out0 = x0 * c - x1 * s
+     out1 = x1 * c + x0 * s
+
+     tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask)
+     tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask)
+
+
+ def rope_inplace_query(q_rope: torch.Tensor, cos_q: torch.Tensor, sin_q: torch.Tensor):
+     assert q_rope.is_cuda
+     assert q_rope.shape[-1] % 2 == 0
+     bs, nh, d_rope = q_rope.shape
+
+     half = d_rope // 2
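+     # tl.arange requires a power-of-2 length, so round the half-dimension up to the next power of two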
+ BLOCK_HALF = 1 << (half - 1).bit_length()
67
+
68
+ grid = (bs * nh,)
69
+
70
+ rope_swap_halves_kernel[grid](
71
+ q_rope,
72
+ cos_q, sin_q,
73
+ B=bs, T=nh, D=d_rope,
74
+ stride_xb=q_rope.stride(0),
75
+ stride_xt=q_rope.stride(1),
76
+ stride_xd=q_rope.stride(2),
77
+ stride_cos_t=0, stride_cos_d=cos_q.stride(0),
78
+ stride_sin_t=0, stride_sin_d=sin_q.stride(0),
79
+ BLOCK_HALF=BLOCK_HALF,
80
+ num_warps=4,
81
+ )
82
+
83
+
84
+ _rope_cache = {}
85
+
86
+
87
+ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
88
+ half = x.shape[-1] // 2
89
+ return torch.cat((-x[..., half:], x[..., :half]), dim=-1)
90
+
91
+
92
+ def _get_rope_tables(dim: int, max_seq_len: int, device: torch.device):
93
+ key = (dim, max_seq_len, device)
94
+ if key not in _rope_cache:
95
+ half = dim // 2
96
+ theta = (10000.0 ** (-torch.arange(half, dtype=torch.float32, device=device) / half)).to(
97
+ torch.bfloat16
98
+ )
99
+ pos = torch.arange(max_seq_len, dtype=torch.int64, device=device).unsqueeze_(1)
100
+ idx = pos * theta[None, :]
101
+ idx = torch.cat([idx, idx], dim=-1)
102
+ _rope_cache[key] = (idx.cos().to(torch.bfloat16), idx.sin().to(torch.bfloat16))
103
+ return _rope_cache[key]
104
+
105
+
106
+ @triton.jit
107
+ def _softmax_kernel(
108
+ out_ptr, in_ptr,
109
+ stride_out, stride_in,
110
+ n_cols,
111
+ BLOCK_SIZE: tl.constexpr,
112
+ NUM_STAGES: tl.constexpr,
113
+ ):
114
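+     # Numerically stable softmax over one row in three passes:
+     # (1) scan for the row max, (2) store shifted exponentials and accumulate their sum,
+     # (3) re-read the stored exponentials and normalise by the sum.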
+     row = tl.program_id(0)
+     row_off_in = row * stride_in
+     row_off_out = row * stride_out
+
+     max_val = tl.full([BLOCK_SIZE], -float("inf"), tl.float32)
+     col = tl.arange(0, BLOCK_SIZE)
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf'))
+         max_val = tl.maximum(max_val, tl.cast(val, tl.float32))
+     row_max = tl.max(max_val)
+
+     sum_val = tl.full([BLOCK_SIZE], 0.0, tl.float32)
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf'))
+         exp_val = tl.exp(tl.cast(val, tl.float32) - row_max)
+         tl.store(out_ptr + row_off_out + cur, tl.cast(exp_val, tl.bfloat16), mask=mask)
+         sum_val += exp_val
+     row_sum = tl.sum(sum_val)
+
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(out_ptr + row_off_out + cur, mask=mask, other=0.0)
+         norm = tl.cast(val, tl.float32) / row_sum
+         tl.store(out_ptr + row_off_out + cur, tl.cast(norm, tl.bfloat16), mask=mask)
+
+
+ def _triton_softmax(x: torch.Tensor) -> torch.Tensor:
+     assert x.is_cuda and x.dtype == torch.bfloat16
+     n_rows, n_cols = x.shape
+
+     if n_cols <= 32:
+         BLOCK_SIZE = 32
+     elif n_cols <= 64:
+         BLOCK_SIZE = 64
+     elif n_cols <= 128:
+         BLOCK_SIZE = 128
+     else:
+         BLOCK_SIZE = 1 << (n_cols - 1).bit_length()
+         BLOCK_SIZE = min(BLOCK_SIZE, 1024)
+
+     out = torch.empty_like(x)
+     grid = (n_rows,)
+     _softmax_kernel[grid](
+         out, x,
+         out.stride(0), x.stride(0),
+         n_cols,
+         BLOCK_SIZE=BLOCK_SIZE,
+         NUM_STAGES=2,
+         num_warps=4,
+     )
+     return out
+
+
+ def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Optimised forward step of the Multi-head Latent Attention (MLA) module.
+     """
+     config, x, kv_cache = data
+
+     bs = config.batch_size
+     sl = config.seq_len
+     nh = config.n_heads
+     dq = config.q_lora_rank
+     dkv = config.kv_lora_rank
+     d_nope = config.qk_nope_head_dim
+     d_rope = config.qk_rope_head_dim
+     dv = config.v_head_dim
+     msl = config.max_seq_len
+
+     wDQ = config.Q_proj_down_weight
+     wDKV = config.KV_proj_down_weight
+     wUQ = config.Q_proj_up_weight
+     wUKV = config.KV_proj_up_weight
+     wO = config.wo_weight
+
+     q_lora = F.linear(x, wDQ)
+     kv_lora_input = F.linear(x, wDKV)
+
+     kv_lora, kv_len = kv_cache(kv_lora_input)
+     query_pos = kv_len - 1
+
+     q_up = F.linear(q_lora.squeeze(1), wUQ)
+     q_up = q_up.view(bs, nh, d_nope + d_rope)
+     q_nope = q_up[..., :d_nope]
+     q_rope = q_up[..., d_nope:]
+
+     kv_nope_input = kv_lora[..., :dkv]
+     k_rope_input = kv_lora[..., dkv:]
+
+     cos_table, sin_table = _get_rope_tables(d_rope, msl, x.device)
+
+     cos_q = cos_table[query_pos].view(d_rope).contiguous()
+     sin_q = sin_table[query_pos].view(d_rope).contiguous()
+     rope_inplace_query(q_rope, cos_q, sin_q)
+
+     cos_k = cos_table[:kv_len]
+     sin_k = sin_table[:kv_len]
+     k_rope = k_rope_input * cos_k + _rotate_half(k_rope_input) * sin_k
+
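+     # Weight absorption: fold the per-head K up-projection into the query so scores
+     # can be computed directly against the compressed latent cache, avoiding
+     # materialising full per-head keys of length kv_len.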
+     wUKV_view = wUKV.view(nh, d_nope + dv, dkv)
+     wK = wUKV_view[:, :d_nope, :]
+     q_nope_latent = torch.einsum('bhd,hdk->bhk', q_nope, wK)
+
+     kv_nope_T = kv_nope_input.transpose(1, 2)
+     scores_nope = torch.matmul(q_nope_latent, kv_nope_T)
+
+     scores_rope = torch.matmul(q_rope, k_rope.transpose(-2, -1))
+
+     scale = 1.0 / math.sqrt(d_nope + d_rope)
+     scores = (scores_nope + scores_rope) * scale
+
+     scores_flat = scores.reshape(bs * nh, kv_len)
+     attn_flat = _triton_softmax(scores_flat)
+     attn = attn_flat.view(bs, nh, kv_len)
+
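+     # Value-side absorption: aggregate the latent cache with the attention weights
+     # first, then apply the per-head V up-projection to the much smaller result.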
+     M = torch.matmul(attn, kv_nope_input)
+
+     wV = wUKV_view[:, d_nope:, :]
+     wV_T = wV.permute(0, 2, 1)
+     y_head = torch.einsum('bhd,hdk->bhk', M, wV_T)
+
+     y = y_head.reshape(bs, nh * dv)
+     y = y.unsqueeze(1)
+     output = F.linear(y, wO)
+
+     return output, kv_cache.data
+ # EVOLVE-BLOCK-END
benchmarks/gpu_mode/mla_decode/reference.py ADDED
@@ -0,0 +1,520 @@
+ """
2
+ Reference implementation for MLA Decode (Multi-Head Latent Attention) Triton kernel.
3
+ Same test cases, benchmarks, generate_input, ref_kernel, and check_implementation.
4
+ """
+
+ import math
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ # ---------------------------------------------------------------------------
+ # Scoring and benchmark configuration (read by shared_eval.py)
+ # ---------------------------------------------------------------------------
+
+ SCORE_SCALE = 3000.0
+
+ # MLA uses wall-clock timing, 1% rel error, no wall clock timeout, torch.no_grad()
+ BENCH_USE_CUDA_EVENTS = False
+ BENCH_REL_ERROR = 0.01
+ BENCH_WALL_TIMEOUT_NS = None
+ BENCH_NO_GRAD = True
+ BENCH_MAX_REPEATS = 100
+ BENCH_MAX_TIME_NS = 10e9
+ BENCH_WARMUP_STYLE = 'timed_calls'
+
+ # ---------------------------------------------------------------------------
+ # Model classes (needed by both reference and submissions)
+ # ---------------------------------------------------------------------------
+
+
+ class RoPE(nn.Module):
+     def __init__(self, d_model: int):
+         super().__init__()
+         self.d_model = d_model
+         theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+         self.register_buffer("theta", theta)
+
+     def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+         seq_len = x.size(-2)
+         d_model = x.size(-1)
+         assert d_model == self.d_model
+         seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+         idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+         idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+         cos = idx_theta2.cos().to(torch.bfloat16)
+         sin = idx_theta2.sin().to(torch.bfloat16)
+         return x * cos + self.rotate_half(x) * sin
+
+
+ class KVCache(nn.Module):
+     def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+         super().__init__(**kwargs)
+         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+         self.seq_len = 0
+         self.zero()
+
+     def zero(self) -> None:
+         self.data.zero_()
+
+     def get_data(self) -> torch.Tensor:
+         return self.data
+
+     def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+         assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+         self.data = self.data.to(c_kv.dtype)
+         self.data[
+             :, self.seq_len: self.seq_len + c_kv.size(1), :
+         ] = c_kv
+         self.seq_len += c_kv.size(1)
+
+         return self.data[:, :self.seq_len], self.seq_len
+
+
+ @dataclass
+ class Config:
+     batch_size: int
+     dim: int
+     n_heads: int
+     q_lora_rank: int
+     kv_lora_rank: int
+     qk_nope_head_dim: int
+     qk_rope_head_dim: int
+     v_head_dim: int
+     seq_len: int
+     max_seq_len: int
+     kv_cache_shape: tuple
+     Q_proj_down_weight: torch.Tensor
+     Q_proj_up_weight: torch.Tensor
+     KV_proj_down_weight: torch.Tensor
+     KV_proj_up_weight: torch.Tensor
+     wo_weight: torch.Tensor
+
+
+ class MLA(nn.Module):
+     def __init__(self, config: Config):
+         super().__init__()
+         self.dim = config.dim
+         self.n_heads = config.n_heads
+         self.q_lora_rank = config.q_lora_rank
+         self.kv_lora_rank = config.kv_lora_rank
+         self.nope_head_dim = config.qk_nope_head_dim
+         self.rope_head_dim = config.qk_rope_head_dim
+         self.v_head_dim = config.v_head_dim
+         self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+         self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.q_rope = RoPE(self.rope_head_dim)
+         self.k_rope = RoPE(self.rope_head_dim)
+         self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+         self.eps = 1e-6
+
+     def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+         batch_size, seq_len, model_dim = x.size()
+
+         q_lora = self.Q_proj_down(x)
+         kv_lora = self.KV_proj_down(x)
+         kv_lora, kv_len = kv_cache(kv_lora)
+         query_pos = kv_len - 1
+
+         q_nope_and_rope = self.Q_proj_up(q_lora).view(
+             batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+         q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+         kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+         kv_nope = self.KV_proj_up(kv_nope).view(
+             batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+         k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+         q_rope = q_rope.permute(0, 2, 1, 3)
+         q_rope = self.q_rope(q_rope, start_pos=query_pos)
+
+         q_nope = q_nope.permute(0, 2, 1, 3)
+         q = torch.concat([q_nope, q_rope], dim=-1)
+
+         k_rope = k_rope[:, None, :, :]
+         k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+         k_nope = k_nope.permute(0, 2, 1, 3)
+         k = torch.concat([k_nope, k_rope], dim=-1)
+
+         v = v.permute(0, 2, 1, 3)
+         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+         y = torch.matmul(attn, v).view(batch_size, 1, -1)
+         y = self.wo(y)
+
+         return y, kv_cache.get_data()
+
+
+ # ---------------------------------------------------------------------------
+ # Test / benchmark cases — from discover task.yml
+ # ---------------------------------------------------------------------------
+
+ TEST_CASES = [
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 128, "seed": 9247},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 512, "seed": 2197},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 1024, "seed": 9107},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 2048, "seed": 5291},
+ ]
+
+ BENCHMARK_CASES = [
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 4096, "seed": 9817},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 6144, "seed": 5291},
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Input generation
+ # ---------------------------------------------------------------------------
+
+
+ def generate_input(batchsize, dim, dq, prefill, seed):
+     gen = torch.Generator(device='cuda')
+     gen.manual_seed(seed)
+
+     Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+     wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
+
+     config = Config(
+         batch_size=batchsize,
+         dim=dim,
+         q_lora_rank=dq,
+         n_heads=128,
+         kv_lora_rank=512,
+         qk_nope_head_dim=128,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         seq_len=1,
+         max_seq_len=8192,
+         kv_cache_shape=(batchsize, 8192, 512 + 64),
+         Q_proj_down_weight=Q_proj_down_weight,
+         Q_proj_up_weight=Q_proj_up_weight,
+         KV_proj_down_weight=KV_proj_down_weight,
+         KV_proj_up_weight=KV_proj_up_weight,
+         wo_weight=wo_weight,
+     )
+     x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+
+     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
+     pre_filled_cache = torch.randn(
+         (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim),
+         dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache(pre_filled_cache)
+
+     return config, x, kv_cache
+
+
+ # ---------------------------------------------------------------------------
+ # Reference kernel
+ # ---------------------------------------------------------------------------
+
+
+ def ref_kernel(data):
+     config, x, kv_cache = data
+
+     model = MLA(config).to('cuda')
+     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+     model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+     model.wo.weight = nn.Parameter(config.wo_weight)
+
+     output, kv_data = model(x, kv_cache)
+     return output, kv_data
+
+
+ # ---------------------------------------------------------------------------
+ # Correctness checking
+ # ---------------------------------------------------------------------------
+
+
+ @torch.no_grad()
+ def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5):
+     if received.shape != expected.shape:
+         return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+
+     diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+     tolerance = atol + rtol * torch.abs(expected.to(torch.float32))
+     tol_mismatched = diff > tolerance
+     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+     posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+     neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+     mismatched = torch.logical_or(
+         torch.logical_or(tol_mismatched, nan_mismatched),
+         torch.logical_or(posinf_mismatched, neginf_mismatched),
+     )
+
+     mismatched_indices = torch.nonzero(mismatched)
+     num_mismatched = mismatched.count_nonzero().item()
+
+     if num_mismatched >= 1:
+         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+         for index in mismatched_indices[:max_print]:
+             i = tuple(index.tolist())
+             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+         if num_mismatched > max_print:
+             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+         return False, mismatch_details
+
+     return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+ def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3):
+     """Check submission output against reference. Returns (passed: bool, msg: str)."""
+     import gc
+     output_mla, output_kv = submission_output
+
+     # Move submission output to CPU and free GPU memory before running ref kernel
+     output_mla_cpu = output_mla.cpu()
+     output_kv_cpu = output_kv.cpu()
+     del output_mla, output_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     config, x, kv_cache = data
+     with torch.no_grad():
+         expected_mla, expected_kv = ref_kernel((config, x, kv_cache))
+
+     # Move ref output to CPU and free GPU memory before comparison
+     expected_mla_cpu = expected_mla.cpu()
+     expected_kv_cpu = expected_kv.cpu()
+     del expected_mla, expected_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol)
+     good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol)
+
+     if not good_mla:
+         return False, "MLA output mismatch: " + " ".join(reasons_mla)
+     if not good_kv:
+         return False, "KV cache mismatch: " + " ".join(reasons_kv)
+
+     return True, "Match"
+
+
+ # ---------------------------------------------------------------------------
+ # Self-contained reference code for Modal remote execution
+ # ---------------------------------------------------------------------------
+
+ MODAL_REFERENCE_CODE = r'''
+ import math
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+
+ class RoPE(nn.Module):
+     def __init__(self, d_model: int):
+         super().__init__()
+         self.d_model = d_model
+         theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+         self.register_buffer("theta", theta)
+
+     def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+         seq_len = x.size(-2)
+         d_model = x.size(-1)
+         assert d_model == self.d_model
+         seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+         idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+         idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+         cos = idx_theta2.cos().to(torch.bfloat16)
+         sin = idx_theta2.sin().to(torch.bfloat16)
+         return x * cos + self.rotate_half(x) * sin
+
+
+ class KVCache(nn.Module):
+     def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+         super().__init__(**kwargs)
+         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+         self.seq_len = 0
+         self.zero()
+
+     def zero(self) -> None:
+         self.data.zero_()
+
+     def get_data(self) -> torch.Tensor:
+         return self.data
+
+     def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+         assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+         self.data = self.data.to(c_kv.dtype)
+         self.data[:, self.seq_len: self.seq_len + c_kv.size(1), :] = c_kv
+         self.seq_len += c_kv.size(1)
+         return self.data[:, :self.seq_len], self.seq_len
+
+
+ @dataclass
+ class Config:
+     batch_size: int
+     dim: int
+     n_heads: int
+     q_lora_rank: int
+     kv_lora_rank: int
+     qk_nope_head_dim: int
+     qk_rope_head_dim: int
+     v_head_dim: int
+     seq_len: int
+     max_seq_len: int
+     kv_cache_shape: tuple
+     Q_proj_down_weight: torch.Tensor
+     Q_proj_up_weight: torch.Tensor
+     KV_proj_down_weight: torch.Tensor
+     KV_proj_up_weight: torch.Tensor
+     wo_weight: torch.Tensor
+
+
+ class MLA(nn.Module):
+     def __init__(self, config: Config):
+         super().__init__()
+         self.dim = config.dim
+         self.n_heads = config.n_heads
+         self.q_lora_rank = config.q_lora_rank
+         self.kv_lora_rank = config.kv_lora_rank
+         self.nope_head_dim = config.qk_nope_head_dim
+         self.rope_head_dim = config.qk_rope_head_dim
+         self.v_head_dim = config.v_head_dim
+         self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+         self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.q_rope = RoPE(self.rope_head_dim)
+         self.k_rope = RoPE(self.rope_head_dim)
+         self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+         self.eps = 1e-6
+
+     def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+         batch_size, seq_len, model_dim = x.size()
+         q_lora = self.Q_proj_down(x)
+         kv_lora = self.KV_proj_down(x)
+         kv_lora, kv_len = kv_cache(kv_lora)
+         query_pos = kv_len - 1
+         q_nope_and_rope = self.Q_proj_up(q_lora).view(
+             batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+         q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+         kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+         kv_nope = self.KV_proj_up(kv_nope).view(
+             batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+         k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+         q_rope = q_rope.permute(0, 2, 1, 3)
+         q_rope = self.q_rope(q_rope, start_pos=query_pos)
+         q_nope = q_nope.permute(0, 2, 1, 3)
+         q = torch.concat([q_nope, q_rope], dim=-1)
+         k_rope = k_rope[:, None, :, :]
+         k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+         k_nope = k_nope.permute(0, 2, 1, 3)
+         k = torch.concat([k_nope, k_rope], dim=-1)
+         v = v.permute(0, 2, 1, 3)
+         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+         y = torch.matmul(attn, v).view(batch_size, 1, -1)
+         y = self.wo(y)
+         return y, kv_cache.get_data()
+
+
+ def ref_kernel(data):
+     config, x, kv_cache = data
+     model = MLA(config).to('cuda')
+     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+     model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+     model.wo.weight = nn.Parameter(config.wo_weight)
+     output, kv_data = model(x, kv_cache)
+     return output, kv_data
+
+
+ def generate_input(batchsize, dim, dq, prefill, seed):
+     gen = torch.Generator(device='cuda')
+     gen.manual_seed(seed)
+     Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+     wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
+     config = Config(
+         batch_size=batchsize, dim=dim, q_lora_rank=dq, n_heads=128,
+         kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64,
+         v_head_dim=128, seq_len=1, max_seq_len=8192,
+         kv_cache_shape=(batchsize, 8192, 512 + 64),
+         Q_proj_down_weight=Q_proj_down_weight, Q_proj_up_weight=Q_proj_up_weight,
+         KV_proj_down_weight=KV_proj_down_weight, KV_proj_up_weight=KV_proj_up_weight,
+         wo_weight=wo_weight,
+     )
+     x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
+     pre_filled_cache = torch.randn(
+         (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim),
+         dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache(pre_filled_cache)
+     return config, x, kv_cache
+
+
+ @torch.no_grad()
+ def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5):
+     if received.shape != expected.shape:
+         return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+     diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+     tolerance = atol + rtol * torch.abs(expected.to(torch.float32))
+     tol_mismatched = diff > tolerance
+     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+     posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+     neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+     mismatched = torch.logical_or(
+         torch.logical_or(tol_mismatched, nan_mismatched),
+         torch.logical_or(posinf_mismatched, neginf_mismatched),
+     )
+     mismatched_indices = torch.nonzero(mismatched)
+     num_mismatched = mismatched.count_nonzero().item()
+     if num_mismatched >= 1:
+         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+         for index in mismatched_indices[:max_print]:
+             i = tuple(index.tolist())
+             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+         if num_mismatched > max_print:
+             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+         return False, mismatch_details
+     return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+ def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3):
+     import gc
+     output_mla, output_kv = submission_output
+     # Move submission output to CPU and free GPU memory before running ref kernel
+     output_mla_cpu = output_mla.cpu()
+     output_kv_cpu = output_kv.cpu()
+     del output_mla, output_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+     config, x, kv_cache = data
+     with torch.no_grad():
+         expected_mla, expected_kv = ref_kernel((config, x, kv_cache))
+     # Move ref output to CPU and free GPU memory before comparison
+     expected_mla_cpu = expected_mla.cpu()
+     expected_kv_cpu = expected_kv.cpu()
+     del expected_mla, expected_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+     good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol)
+     good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol)
+     if not good_mla:
+         return False, "MLA output mismatch: " + " ".join(reasons_mla)
+     if not good_kv:
+         return False, "KV cache mismatch: " + " ".join(reasons_kv)
+     return True, "Match"
+ '''
benchmarks/gpu_mode/mla_decode/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ triton
+ torch
benchmarks/gpu_mode/trimul/initial_program.py ADDED
@@ -0,0 +1,84 @@
+ # EVOLVE-BLOCK-START
+ """
+ Initial TriMul submission — PyTorch baseline with dummy Triton kernel.
+ """
+
+ import torch
+ from torch import nn, einsum
+ import triton
+ import triton.language as tl
+
+
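+ # No-op placeholder kernel; the baseline below runs entirely in PyTorch.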
+ @triton.jit
+ def _dummy_kernel(x_ptr, BLOCK_SIZE: tl.constexpr):
+     pid = tl.program_id(0)
+     pass
+
+
+ class TriMul(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         hidden_dim: int,
+     ):
+         super().__init__()
+
+         self.norm = nn.LayerNorm(dim)
+
+         self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+
+         self.left_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+
+         self.to_out_norm = nn.LayerNorm(hidden_dim)
+         self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
+
+     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         batch_size, seq_len, _, dim = x.shape
+
+         x = self.norm(x)
+         x = x.to(torch.float32)
+
+         left = self.left_proj(x.to(torch.float32))
+         right = self.right_proj(x.to(torch.float32))
+
+         mask = mask.unsqueeze(-1)
+         left = left * mask
+         right = right * mask
+
+         left_gate = self.left_gate(x.to(torch.float32)).sigmoid()
+         right_gate = self.right_gate(x.to(torch.float32)).sigmoid()
+         out_gate = self.out_gate(x.to(torch.float32)).sigmoid()
+
+         left = left * left_gate
+         right = right * right_gate
+
+         out = einsum('... i k d, ... j k d -> ... i j d', left.to(torch.bfloat16), right.to(torch.bfloat16))
+
+         out = out.to(torch.float32)
+         out = self.to_out_norm(out)
+         out = out * out_gate
+         return self.to_out(out)
+
+
+ def custom_kernel(data):
+     input_tensor, mask, weights, config = data
+     trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
+
+     trimul.norm.weight = nn.Parameter(weights['norm.weight'].to(torch.float32))
+     trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'].to(torch.float32))
+     trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'].to(torch.float32))
+     trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'].to(torch.float32))
+     trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'].to(torch.float32))
+     trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
+     trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
+     trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+     trimul.norm.bias = nn.Parameter(weights['norm.bias'].to(torch.float32))
+     trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'].to(torch.float32))
+
+     output = trimul(input_tensor, mask).to(torch.float32)
+
+     return output
+ # EVOLVE-BLOCK-END
benchmarks/image_gen/README.md ADDED
@@ -0,0 +1,40 @@
+ # Image Generation Benchmark
+
+ This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via LLM, crossover via context images drawn from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives the actual parent and context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed.
+
+ ## Benchmark: Sky Festival
+
+ **Directory:** `sky_festival/`
+
+ The system must generate a floating sky-festival image whose details match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`.
+
+ **Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points. The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image.
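+
+ As a minimal sketch (mirroring `CATEGORY_MAXES` and the normalization in `sky_festival/evaluator.py`), the judged category points are clamped to their maxima, summed to a 0-100 total, and divided by 100 to give `combined_score`:
+
+ ```python
+ CATEGORY_MAXES = {"cloud_shapes": 15, "balloons": 20, "floating_island": 10,
+                   "table_items": 20, "characters": 15,
+                   "decorations_creatures": 10, "rainbow_lighting": 10}
+
+ def combined_score(judged: dict) -> float:
+     # Clamp each category to [0, max], sum, then normalize to [0, 1].
+     total = sum(max(0.0, min(float(judged.get(cat, 0)), mx))
+                 for cat, mx in CATEGORY_MAXES.items())
+     return total / 100.0
+ ```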
+
+ ## Setup
+
+ 1. **Set your API key:**
+
+    ```bash
+    export OPENAI_API_KEY=...
+    ```
+
+    Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API.
+
+ ## Run
+
+ ```bash
+ cd benchmarks/image_gen/sky_festival
+
+ # AdaEvolve
+ uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ # EvoX
+ uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric |
+ | `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` |
benchmarks/image_gen/sky_festival/evaluator.py ADDED
@@ -0,0 +1,220 @@
+ """
+ Sky Festival evaluator — GPT-5 LLM-as-a-judge.
+
+ Scores VLM-generated images against a 100-point rubric using GPT-5 vision.
+ Returns combined_score normalized to [0, 1].
+
+ The framework passes the image path via a sidecar file:
+     <program_path>.image_path -> absolute path to the generated image
+
+ Requirements:
+     pip install openai
+ Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5)
+ """
+
15
+ import base64
16
+ import json
17
+ import logging
18
+ import os
19
+ import re
20
+ from typing import Dict, Union
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5")
25
+
26
+ SYSTEM_PROMPT = """\
27
+ You are an extremely strict image evaluation judge. You score images against a precise rubric.
28
+ You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON.
29
+ Be harsh — most AI-generated images fail these criteria. Award points only when clearly met.
30
+ If you cannot verify a requirement (e.g., too small to see), award 0 for that item."""
31
+
32
+ RUBRIC_PROMPT = """\
33
+ Score this image against the following rubric for a "Floating Sky Festival" scene.
34
+ Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met.
35
+
36
+ ## Category 1: Cloud Counting and Shapes (15 pts)
37
+ - Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0)
38
+ - At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10)
39
+
40
+ ## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts)
41
+ - Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0)
42
+ - Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern)
43
+ - Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count)
44
+ - Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0)
45
+
46
+ ## Category 3: Floating Island and Trees (10 pts)
47
+ - Floating island visible suspended in air: 3 pts
48
+ - Exactly 4 different trees on the island: 4 pts (3 or 5 = 0)
49
+ - Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts
50
+
51
+ ## Category 4: Party Table Items — Counting and Arrangement (20 pts)
52
+ - 3-tier cake with candle present: 3 pts
53
+ - Cake text "100 YEARS" legible on middle tier: 3 pts
54
+ - Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts
55
+ - Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts
56
+ - Stack of exactly 8 golden plates: 3 pts
57
+ - Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts
58
+
59
+ ## Category 5: Characters — Count, Identity, and Details (15 pts)
60
+ - Exactly 6 characters seated at the table (3 per side): 5 pts
61
+ - Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends)
62
+ - Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail)
63
+
64
+ ## Category 6: Decorations and Flying Creatures (10 pts)
65
+ - Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts
66
+ - Exactly 7 paper lanterns in different colors: 3 pts
67
+ - Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures)
68
+
69
+ ## Category 7: Rainbow, Lighting, and Overall Composition (10 pts)
70
+ - Complete semicircular rainbow with 7 color bands in correct order: 4 pts
71
+ - Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts
72
+ - Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts
73
+
74
+ Respond with ONLY this JSON (no other text):
75
+ {
76
+ "cloud_shapes": <0-15>,
77
+ "balloons": <0-20>,
78
+ "floating_island": <0-10>,
79
+ "table_items": <0-20>,
80
+ "characters": <0-15>,
81
+ "decorations_creatures": <0-10>,
82
+ "rainbow_lighting": <0-10>,
83
+ "reasoning": "<brief 2-3 sentence explanation>"
84
+ }"""
85
+
86
+ # Category maximum scores for validation
87
+ CATEGORY_MAXES = {
88
+ "cloud_shapes": 15,
89
+ "balloons": 20,
90
+ "floating_island": 10,
91
+ "table_items": 20,
92
+ "characters": 15,
93
+ "decorations_creatures": 10,
94
+ "rainbow_lighting": 10,
95
+ }
96
+
97
+ _client = None
98
+
99
+
100
+ def _get_client():
101
+ global _client
102
+ if _client is None:
103
+ from openai import OpenAI
104
+ _client = OpenAI()
105
+ return _client
106
+
107
+
108
+ def _encode_image(image_path: str) -> str:
109
+ with open(image_path, "rb") as f:
110
+ return base64.b64encode(f.read()).decode("utf-8")
111
+
112
+
113
+ def _judge_image(image_path: str) -> Dict[str, Union[float, str]]:
114
+ """Call GPT-5 to score the image. Retries once on failure."""
115
+ client = _get_client()
116
+ b64 = _encode_image(image_path)
117
+
118
+ ext = os.path.splitext(image_path)[1].lstrip(".").lower()
119
+ mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png")
120
+ data_url = f"data:{mime};base64,{b64}"
121
+
122
+ messages = [
123
+ {"role": "system", "content": SYSTEM_PROMPT},
124
+ {
125
+ "role": "user",
126
+ "content": [
127
+ {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}},
128
+ {"type": "text", "text": RUBRIC_PROMPT},
129
+ ],
130
+ },
131
+ ]
132
+
133
+ last_error = None
134
+ for attempt in range(2):
135
+ try:
136
+ response = client.chat.completions.create(
137
+ model=JUDGE_MODEL,
138
+ messages=messages,
139
+ max_completion_tokens=16384,
140
+ )
141
+ content = response.choices[0].message.content or ""
142
+ raw = content.strip()
143
+ logger.info(f"Judge raw response (first 300 chars): {raw[:300]}")
144
+
145
+ # Extract JSON from markdown code block if present
146
+ if "```" in raw:
147
+ m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
148
+ if m:
149
+ raw = m.group(1).strip()
150
+
151
+ # Find JSON object in response
152
+ start = raw.find("{")
153
+ end = raw.rfind("}") + 1
154
+ if start >= 0 and end > start:
155
+ raw = raw[start:end]
156
+
157
+ result = json.loads(raw)
158
+
159
+ # Validate and clamp scores
160
+ scores = {}
161
+ for cat, max_val in CATEGORY_MAXES.items():
162
+ val = result.get(cat, 0)
163
+ if not isinstance(val, (int, float)):
164
+ val = 0
165
+ scores[cat] = max(0, min(max_val, float(val)))
166
+
167
+ scores["reasoning"] = str(result.get("reasoning", ""))
168
+ return scores
169
+
170
+ except Exception as e:
171
+ last_error = e
172
+ logger.warning(f"Judge attempt {attempt + 1} failed: {e}")
173
+
174
+ logger.error(f"GPT-5 judge failed after retries: {last_error}")
175
+ return {cat: 0.0 for cat in CATEGORY_MAXES}
176
+
177
+
178
+ def evaluate(program_path: str) -> Dict[str, Union[float, str]]:
179
+ """Score a VLM-generated image using GPT-5 as judge.
180
+
181
+ Args:
182
+ program_path: Path to the text file (VLM reasoning).
183
+ A sidecar file ``<program_path>.image_path`` contains the
184
+ absolute path to the generated image.
185
+
186
+ Returns:
187
+ Dictionary with combined_score (0-1), per-category scores, and image_path.
188
+ """
189
+ # Read image path from sidecar
190
+ sidecar = program_path + ".image_path"
191
+ image_path = None
192
+ if os.path.exists(sidecar):
193
+ with open(sidecar) as f:
194
+ image_path = f.read().strip()
195
+
196
+ if not image_path or not os.path.exists(image_path):
197
+ logger.warning("No image found for scoring")
198
+ return {"combined_score": 0.0, "error": "No image to score"}
199
+
200
+ # Score with GPT-5
201
+ scores = _judge_image(image_path)
202
+
203
+ # Compute total out of 100, normalize to 0-1
204
+ total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES)
205
+ combined = round(total / 100.0, 4)
206
+
207
+ result = {"combined_score": combined, "image_path": image_path}
208
+
209
+ # Add per-category scores (normalized to 0-1 for each category)
210
+ for cat, max_val in CATEGORY_MAXES.items():
211
+ result[cat] = round(scores.get(cat, 0) / max_val, 4)
212
+
213
+ # Also store raw scores
214
+ result["raw_total"] = round(total, 1)
215
+
216
+ reasoning = scores.get("reasoning", "")
217
+ if reasoning:
218
+ result["judge_reasoning"] = reasoning
219
+
220
+ return result
benchmarks/math/README.md ADDED
@@ -0,0 +1,43 @@
+ # Math Benchmarks
+
+ Mathematical optimization and algorithm evolution problems.
+
+ ## Problems
+
+ ### Signal processing & geometry (from SkyDiscover demos)
+
+ - [signal_processing](signal_processing/) — Real-time adaptive filtering for non-stationary time series
+ - [circle_packing](circle_packing/) — Pack 26 circles in a unit square to maximize sum of radii (AlphaEvolve B.12)
+
+ ### AlphaEvolve mathematical problems
+
+ 12 problems from [AlphaEvolve Appendices A and B](https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/AlphaEvolve.pdf). All evaluators are normalized to **maximize** the target metric.
+
+ **Appendix A:**
+ - [matmul](matmul/) — Faster algorithm for matrix multiplication (A)
+
+ **Appendix B:**
+ 1. [first_autocorr_ineq](first_autocorr_ineq/) — Upper bound on autoconvolution constant (B.1)
+ 2. [second_autocorr_ineq](second_autocorr_ineq/) — Lower bound on autoconvolution norm constant (B.2)
+ 3. [third_autocorr_ineq](third_autocorr_ineq/) — Upper bound on absolute autoconvolution constant (B.3)
+ 4. [uncertainty_ineq](uncertainty_ineq/) — Upper bound on Fourier uncertainty constant (B.4)
+ 5. [erdos_min_overlap](erdos_min_overlap/) — Upper bound on Erdos minimum overlap constant (B.5)
+ 6. [sums_diffs_finite_sets](sums_diffs_finite_sets/) — Lower bound on sums/differences of finite sets (B.6)
+ 7. [hexagon_packing](hexagon_packing/) — Pack unit hexagons in a regular hexagon, n=11,12 (B.7)
+ 8. [minimizing_max_min_dist](minimizing_max_min_dist/) — Minimize max/min distance ratio, n=16 d=2 and n=14 d=3 (B.8)
+ 9. [heilbronn_triangle](heilbronn_triangle/) — Heilbronn problem for triangles, n=11 (B.9)
+ 10. [heilbronn_convex](heilbronn_convex/) — Heilbronn problem for convex regions, n=13,14 (B.10)
+ 11. [circle_packing_rect](circle_packing_rect/) — Pack circles in a rectangle of perimeter 4 (B.13)
+
+ ## Run
+
+ ```bash
+ uv run skydiscover-run \
+   benchmarks/math/signal_processing/initial_program.py \
+   benchmarks/math/signal_processing/evaluator.py \
+   -c benchmarks/math/signal_processing/config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ Each problem directory contains `initial_program.py`, `evaluator.py`, and either `config.yaml` or per-search configs. Some multi-variant problems have numbered subdirectories (e.g., `heilbronn_convex/13/`, `hexagon_packing/11/`).
benchmarks/math/circle_packing/README.md ADDED
@@ -0,0 +1,38 @@
+ # Circle Packing
+
+ Pack 26 non-overlapping circles in a unit square to maximize the sum of their radii (AlphaEvolve B.12). Target: 2.635.
+
+ ## Problem
+
+ - Pack exactly 26 circles inside a unit square
+ - No circles may overlap
+ - Each circle must lie entirely within the square
+ - Maximize the sum of all radii (see the feasibility sketch after this list)
+
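+ A minimal sketch of the feasibility check (the evaluator's `validate_packing` enforces the same constraints with small numerical tolerances):
+
+ ```python
+ import numpy as np
+
+ def is_valid(centers: np.ndarray, radii: np.ndarray, tol: float = 1e-6) -> bool:
+     # Containment: each circle fits inside the unit square.
+     inside = ((centers - radii[:, None] >= -tol).all()
+               and (centers + radii[:, None] <= 1 + tol).all())
+     # Non-overlap: pairwise center distance >= sum of radii.
+     d = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=-1)
+     need = radii[:, None] + radii[None, :]
+     overlap = (d < need - tol) & ~np.eye(len(radii), dtype=bool)
+     return inside and not overlap.any()
+ ```
+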
+ ## Run
+
+ ```bash
+ # From repo root
+ uv run skydiscover-run \
+   benchmarks/math/circle_packing/initial_program.py \
+   benchmarks/math/circle_packing/evaluator.py \
+   -c benchmarks/math/circle_packing/config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ A `codebase/reference/` directory is provided with geometric insights (hex grids, optimization patterns, packing strategies) that can be used with agentic mode (`--agentic`).
+
+ ## Scoring
+
+ - **combined_score**: `sum_of_radii / 2.635` (ratio to the AlphaEvolve target)
+ - Evaluator validates no overlaps and boundary constraints
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.py` | Seed: simple ring-based circle arrangement |
+ | `evaluator.py` | Validates constraints, computes sum-of-radii ratio to target |
+ | `config.yaml` | LLM and evaluator settings |
+ | `codebase/reference/` | Geometric reference material for agentic mode |
benchmarks/math/circle_packing/codebase/reference/hex_grid.py ADDED
@@ -0,0 +1,43 @@
+ """
+ Hexagonal grid initialization for circle packing.
+
+ A hexagonal (offset) grid provides a good starting arrangement
+ because it's the densest regular packing pattern. Alternate rows
+ are offset by half the spacing, which reduces wasted space.
+ """
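+ # Usage sketch: hexagonal_grid(26) returns a (26, 2) array of (x, y) centers in [0.1, 0.9]^2.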
+
+ import numpy as np
+
+
+ def hexagonal_grid(n, margin=0.1):
+     """
+     Generate n points on a hexagonal grid inside [margin, 1-margin]^2.
+
+     Args:
+         n: number of points to generate
+         margin: distance from edges to keep clear
+
+     Returns:
+         np.array of shape (n, 2) with (x, y) coordinates
+     """
+     usable = 1.0 - 2 * margin
+     cols = int(np.ceil(np.sqrt(n * 2 / np.sqrt(3))))
+     rows = int(np.ceil(n / cols))
+
+     dx = usable / max(cols - 1, 1)
+     dy = usable / max(rows - 1, 1)
+
+     points = []
+     for row in range(rows):
+         for col in range(cols):
+             if len(points) >= n:
+                 break
+             x = margin + col * dx
+             if row % 2 == 1:
+                 x += dx / 2  # offset for hex pattern
+             y = margin + row * dy
+             x = np.clip(x, margin, 1 - margin)
+             y = np.clip(y, margin, 1 - margin)
+             points.append([x, y])
+
+     return np.array(points[:n])
benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Common patterns for constrained geometric optimization using scipy.
+
+ This module shows how to use scipy.optimize.minimize with inequality
+ constraints and the SLSQP solver — useful for any problem where you
+ need to maximize/minimize an objective subject to geometric constraints.
+ """
+
+ import numpy as np
+ from scipy.optimize import minimize
+
+
+ def example_constrained_optimization():
+     """
+     Template: pack n objects by optimizing positions + sizes jointly.
+
+     Decision vector: x = [pos_0, pos_1, ..., pos_{n-1}, size_0, ..., size_{n-1}]
+     Objective: maximize sum(sizes) => minimize -sum(sizes)
+     Constraints: non-overlap + boundary containment (all >= 0)
+     """
+     n = 10  # number of objects
+
+     # --- Objective: negative sum of sizes (we minimize, so negate to maximize) ---
+     def objective(x):
+         sizes = x[2 * n:]
+         return -np.sum(sizes)
+
+     # --- Constraints as a single function returning array of values >= 0 ---
+     def constraints_fn(x):
+         positions = x[:2 * n].reshape(n, 2)
+         sizes = x[2 * n:]
+
+         c = []
+         # Pairwise non-overlap: dist(i,j) - size_i - size_j >= 0
+         for i in range(n):
+             for j in range(i + 1, n):
+                 dist = np.linalg.norm(positions[i] - positions[j])
+                 c.append(dist - sizes[i] - sizes[j])
+
+         # Boundary: each object stays inside [0, 1] x [0, 1]
+         for i in range(n):
+             c.append(positions[i, 0] - sizes[i])      # left
+             c.append(1 - positions[i, 0] - sizes[i])  # right
+             c.append(positions[i, 1] - sizes[i])      # bottom
+             c.append(1 - positions[i, 1] - sizes[i])  # top
+
+         return np.array(c)
+
+     # --- Initial guess ---
+     x0_pos = np.random.rand(n, 2) * 0.6 + 0.2  # avoid edges
+     x0_sizes = np.full(n, 0.05)
+     x0 = np.concatenate([x0_pos.flatten(), x0_sizes])
+
+     # --- Bounds ---
+     pos_bounds = [(0, 1)] * (2 * n)
+     size_bounds = [(0.01, 0.25)] * n
+     bounds = pos_bounds + size_bounds
+
+     # --- Solve ---
+     result = minimize(
+         objective,
+         x0,
+         method="SLSQP",
+         bounds=bounds,
+         constraints={"type": "ineq", "fun": constraints_fn},
+         options={"maxiter": 1000, "ftol": 1e-9},
+     )
+
+     opt_positions = result.x[:2 * n].reshape(n, 2)
+     opt_sizes = result.x[2 * n:]
+     return opt_positions, opt_sizes, -result.fun  # return positive sum
+
+
+ def multi_start_optimization(objective, constraint_fn, bounds, n_starts=5):
+     """
+     Run SLSQP from multiple random starts and keep the best.
+
+     This helps escape local optima — the solver is gradient-based
+     and sensitive to the initial guess.
+     """
+     best_result = None
+     for _ in range(n_starts):
+         x0 = np.array([np.random.uniform(lo, hi) for lo, hi in bounds])
+         result = minimize(
+             objective,
+             x0,
+             method="SLSQP",
+             bounds=bounds,
+             constraints={"type": "ineq", "fun": constraint_fn},
+             options={"maxiter": 500, "ftol": 1e-8},
+         )
+         if best_result is None or result.fun < best_result.fun:
+             best_result = result
+     # Return after all starts have run, keeping the lowest objective value.
+     return best_result
benchmarks/math/circle_packing/codebase/reference/packing_strategies.md ADDED
@@ -0,0 +1,45 @@
+ # Circle Packing Strategies for n=26 in a Unit Square
+
+ ## Key Insight
+ Naive geometric placement (rings, grids) gives sum_radii ~ 1.0.
+ Using numerical optimization (scipy.optimize) with proper constraint formulation
+ can push sum_radii above 2.5.
+
+ ## Why Optimization Works Better Than Manual Placement
+
+ Manual placement fixes circle positions, then computes maximum radii.
+ This leaves gaps because positions aren't optimized for the radii they produce.
+
+ **Joint optimization** treats both positions (x,y for each circle) AND radii
+ as decision variables, optimizing them simultaneously. This is the key insight.
+
+ Decision vector: [x0, y0, x1, y1, ..., x25, y25, r0, r1, ..., r25]
+ Total variables: 26*2 + 26 = 78
+
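+ A minimal sketch of packing/unpacking this decision vector (helper names are illustrative):
+
+ ```python
+ import numpy as np
+
+ N = 26
+
+ def pack(centers, radii):
+     # centers: (26, 2), radii: (26,) -> flat 78-vector [x0, y0, x1, y1, ..., r0, ...]
+     return np.concatenate([centers.ravel(), radii])
+
+ def unpack(x):
+     centers = x[:2 * N].reshape(N, 2)
+     radii = x[2 * N:]
+     return centers, radii
+ ```
+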
+ ## Constraint Formulation
+
+ 1. **Non-overlap**: For every pair (i,j): distance(center_i, center_j) >= r_i + r_j
+ 2. **Boundary**: For every circle i: x_i - r_i >= 0, x_i + r_i <= 1, y_i - r_i >= 0, y_i + r_i <= 1
+ 3. **Positive radii**: r_i > 0 for all i (use bounds, not constraints)
+
+ ## Recommended Solver
+
+ scipy.optimize.minimize with method="SLSQP":
+ - Handles inequality constraints natively
+ - Works with bounds on variables
+ - Good for smooth, continuous problems like circle packing
+ - Sensitive to initial guess — use multiple starts or a good heuristic (minimal call sketch below)
+
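+ A minimal call sketch under these recommendations (`objective`, `constraints_fn`, `x0`, and `bounds` as constructed in `optimization_patterns.py`):
+
+ ```python
+ from scipy.optimize import minimize
+
+ result = minimize(
+     objective,                 # returns -sum(radii)
+     x0,                        # 78-dim initial guess (e.g., hex grid + small radii)
+     method="SLSQP",
+     bounds=bounds,             # positions in [0, 1], radii in (0.01, 0.2)
+     constraints={"type": "ineq", "fun": constraints_fn},  # all values must be >= 0
+     options={"maxiter": 1000, "ftol": 1e-9},
+ )
+ ```
+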
+ ## Initial Guess Strategy
+
+ A hexagonal grid initial guess works well:
+ - Place circles on offset rows (hex pattern)
+ - Start with equal small radii (e.g., 0.05)
+ - Let the optimizer adjust both positions and radii
+
+ ## Performance Tips
+
+ - Set maxiter=1000 or higher for 26 circles
+ - Use ftol=1e-8 or smaller for precise solutions
+ - Radii bounds: (0.01, 0.2) is a reasonable range for n=26
+ - The objective is -sum(radii) (minimize negative to maximize)
benchmarks/math/circle_packing/config.yaml ADDED
@@ -0,0 +1,54 @@
+ # Math benchmark: circle_packing
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 16384
+   timeout: 600
+ prompt:
+   system_message: 'You are an expert mathematician specializing in circle packing problems and computational geometry. Your
+     task is to improve a constructor function that directly produces a specific arrangement of 26 circles in a unit square,
+     maximizing the sum of their radii. The AlphaEvolve paper achieved a sum of 2.635 for n=26.
+
+
+     Key geometric insights:
+
+     - Circle packings often follow hexagonal patterns in the densest regions
+
+     - Maximum density for infinite circle packing is pi/(2*sqrt(3)) ≈ 0.9069
+
+     - Edge effects make square container packing harder than infinite packing
+
+     - Circles can be placed in layers or shells when confined to a square
+
+     - Similar radius circles often form regular patterns, while varied radii allow better space utilization
+
+     - Perfect symmetry may not yield the optimal packing due to edge effects
+
+
+     Focus on designing an explicit constructor that places each circle in a specific position, rather than an iterative search
+     algorithm.
+
+     '
+ evaluator:
+   timeout: 360
+   cascade_evaluation: true
+   cascade_thresholds:
+     - 0.3
+     - 0.6
+
+ # Live monitor dashboard
+ monitor:
+   enabled: true
+   port: 8765
+   host: "127.0.0.1"
+
+ # Human feedback
+ human_feedback_enabled: true
benchmarks/math/circle_packing/evaluator.py ADDED
@@ -0,0 +1,338 @@
+ """
+ Evaluator for circle packing example (n=26) with improved timeout handling
+ """
+
+ import numpy as np
+ import time
+ import os
+ import subprocess
+ import tempfile
+ import traceback
+ import sys
+ import pickle
+
+
+ class TimeoutError(Exception):
+     """Raised when the evaluated program exceeds its time budget (intentionally shadows the builtin)."""
+     pass
+
+
+ def timeout_handler(signum, frame):
+     """Handle timeout signal (legacy signal-based path; unused by the subprocess approach below)"""
+     raise TimeoutError("Function execution timed out")
+
+
+ def validate_packing(centers, radii):
+     """
+     Validate that circles don't overlap and are inside the unit square
+
+     Args:
+         centers: np.array of shape (n, 2) with (x, y) coordinates
+         radii: np.array of shape (n) with radius of each circle
+
+     Returns:
+         True if valid, False otherwise
+     """
+     n = centers.shape[0]
+
+     # Check for NaN values
+     if np.isnan(centers).any():
+         print("NaN values detected in circle centers")
+         return False
+
+     if np.isnan(radii).any():
+         print("NaN values detected in circle radii")
+         return False
+
+     # Check if radii are nonnegative and not nan
+     for i in range(n):
+         if radii[i] < 0:
+             print(f"Circle {i} has negative radius {radii[i]}")
+             return False
+         elif np.isnan(radii[i]):
+             print(f"Circle {i} has nan radius")
+             return False
+
+     # Check if circles are inside the unit square
+     for i in range(n):
+         x, y = centers[i]
+         r = radii[i]
+         if x - r < -1e-6 or x + r > 1 + 1e-6 or y - r < -1e-6 or y + r > 1 + 1e-6:
+             print(f"Circle {i} at ({x}, {y}) with radius {r} is outside the unit square")
+             return False
+
+     # Check for overlaps
+     for i in range(n):
+         for j in range(i + 1, n):
+             dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2))
+             if dist < radii[i] + radii[j] - 1e-6:  # Allow for tiny numerical errors
+                 print(f"Circles {i} and {j} overlap: dist={dist}, r1+r2={radii[i]+radii[j]}")
+                 return False
+
+     return True
+
+
+ def run_with_timeout(program_path, timeout_seconds=20):
+     """
+     Run the program in a separate process with timeout
+     using a simple subprocess approach
+
+     Args:
+         program_path: Path to the program file
+         timeout_seconds: Maximum execution time in seconds
+
+     Returns:
+         centers, radii, sum_radii tuple from the program
+     """
+     # Create a temporary file to execute
+     with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
+         # Write a script that executes the program and saves results
+         script = f"""
+ import sys
+ import numpy as np
+ import os
+ import pickle
+ import traceback
+
+ # Add the directory to sys.path
+ sys.path.insert(0, os.path.dirname('{program_path}'))
+
+ # Debugging info
+ print(f"Running in subprocess, Python version: {{sys.version}}")
+ print(f"Program path: {program_path}")
+
+ try:
+     # Import the program
+     spec = __import__('importlib.util').util.spec_from_file_location("program", '{program_path}')
+     program = __import__('importlib.util').util.module_from_spec(spec)
+     spec.loader.exec_module(program)
+
+     # Run the packing function
+     print("Calling run_packing()...")
+     centers, radii, sum_radii = program.run_packing()
+     print(f"run_packing() returned successfully: sum_radii = {{sum_radii}}")
+
+     # Save results to a file
+     results = {{
+         'centers': centers,
+         'radii': radii,
+         'sum_radii': sum_radii
+     }}
+
+     with open('{temp_file.name}.results', 'wb') as f:
+         pickle.dump(results, f)
+     print(f"Results saved to {temp_file.name}.results")
+
+ except Exception as e:
+     # If an error occurs, save the error instead
+     print(f"Error in subprocess: {{str(e)}}")
+     traceback.print_exc()
+     with open('{temp_file.name}.results', 'wb') as f:
+         pickle.dump({{'error': str(e)}}, f)
+     print(f"Error saved to {temp_file.name}.results")
+ """
+         temp_file.write(script.encode())
+         temp_file_path = temp_file.name
+
+     results_path = f"{temp_file_path}.results"
+
+     try:
+         # Run the script with timeout
+         process = subprocess.Popen(
+             [sys.executable, temp_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+         )
+
+         try:
+             stdout, stderr = process.communicate(timeout=timeout_seconds)
+             exit_code = process.returncode
+
+             # Always print output for debugging purposes
+             print(f"Subprocess stdout: {stdout.decode()}")
+             if stderr:
+                 print(f"Subprocess stderr: {stderr.decode()}")
+
+             # Still raise an error for non-zero exit codes, but only after printing the output
+             if exit_code != 0:
+                 raise RuntimeError(f"Process exited with code {exit_code}")
+
+             # Load the results
+             if os.path.exists(results_path):
+                 with open(results_path, "rb") as f:
+                     results = pickle.load(f)
+
+                 # Check if an error was returned
+                 if "error" in results:
+                     raise RuntimeError(f"Program execution failed: {results['error']}")
+
+                 return results["centers"], results["radii"], results["sum_radii"]
+             else:
+                 raise RuntimeError("Results file not found")
+
+         except subprocess.TimeoutExpired:
+             # Kill the process if it times out
+             process.kill()
+             process.wait()
+             raise TimeoutError(f"Process timed out after {timeout_seconds} seconds")
+
+     finally:
+         # Clean up temporary files
+         if os.path.exists(temp_file_path):
+             os.unlink(temp_file_path)
+         if os.path.exists(results_path):
+             os.unlink(results_path)
+
+
+ def evaluate(program_path):
+     """
+     Evaluate the program by running it once and checking the sum of radii
+
+     Args:
+         program_path: Path to the program file
+
+     Returns:
+         Dictionary of metrics
+     """
+     # Target value from the paper
+     TARGET_VALUE = 2.635  # AlphaEvolve result for n=26
+
+     try:
+         # For constructor-based approaches, a single evaluation is sufficient
+         # since the result is deterministic
+         start_time = time.time()
+
+         # Use subprocess to run with timeout
+         centers, radii, reported_sum = run_with_timeout(
+             program_path, timeout_seconds=600  # Single timeout
+         )
+
+         end_time = time.time()
+         eval_time = end_time - start_time
+
+         # Ensure centers and radii are numpy arrays
+         if not isinstance(centers, np.ndarray):
+             centers = np.array(centers)
+         if not isinstance(radii, np.ndarray):
+             radii = np.array(radii)
+
+         # Check for NaN values before validation
+         if np.isnan(centers).any() or np.isnan(radii).any():
+             print("NaN values detected in solution")
+             return {
+                 "sum_radii": 0.0,
+                 "target_ratio": 0.0,
+                 "validity": 0.0,
+                 "eval_time": float(time.time() - start_time),
+                 "combined_score": 0.0,
+             }
+
+         # Validate solution
+         valid = validate_packing(centers, radii)
+
+         # Check shape and size
+         shape_valid = centers.shape == (26, 2) and radii.shape == (26,)
+         if not shape_valid:
+             print(
+                 f"Invalid shapes: centers={centers.shape}, radii={radii.shape}, expected (26, 2) and (26,)"
+             )
+             valid = False
+
+         # Calculate sum
+         sum_radii = np.sum(radii) if valid else 0.0
+
+         # Make sure reported_sum matches the calculated sum
+         if abs(sum_radii - reported_sum) > 1e-6:
+             print(f"Warning: Reported sum {reported_sum} doesn't match calculated sum {sum_radii}")
+
+         # Target ratio (how close we are to the target)
+         target_ratio = sum_radii / TARGET_VALUE if valid else 0.0
+
+         # Validity score
+         validity = 1.0 if valid else 0.0
+
+         # Combined score - higher is better
+         combined_score = target_ratio * validity
+
+         print(
+             f"Evaluation: valid={valid}, sum_radii={sum_radii:.6f}, target={TARGET_VALUE}, ratio={target_ratio:.6f}, time={eval_time:.2f}s"
+         )
+
+         return {
+             "sum_radii": float(sum_radii),
+             "target_ratio": float(target_ratio),
+             "validity": float(validity),
+             "eval_time": float(eval_time),
+             "combined_score": float(combined_score),
+         }
+
+     except Exception as e:
+         print(f"Evaluation failed completely: {str(e)}")
+         traceback.print_exc()
+         return {
+             "sum_radii": 0.0,
+             "target_ratio": 0.0,
+             "validity": 0.0,
+             "eval_time": 0.0,
+             "combined_score": 0.0,
+         }
+
+
+ # Stage-based evaluation for cascade evaluation
+ def evaluate_stage1(program_path):
+     """
+     First stage evaluation - quick validation check
+     """
+     try:
+         # Use the simplified subprocess approach
+         try:
+             centers, radii, sum_radii = run_with_timeout(program_path, timeout_seconds=600)
+
+             # Ensure centers and radii are numpy arrays
+             if not isinstance(centers, np.ndarray):
+                 centers = np.array(centers)
+             if not isinstance(radii, np.ndarray):
+                 radii = np.array(radii)
+
+             # Validate solution (shapes and constraints)
+             shape_valid = centers.shape == (26, 2) and radii.shape == (26,)
+             if not shape_valid:
+                 print(f"Invalid shapes: centers={centers.shape}, radii={radii.shape}")
+                 return {"validity": 0.0, "error": "Invalid shapes"}
+
+             valid = validate_packing(centers, radii)
+
+             # Calculate sum
+             actual_sum = np.sum(radii) if valid else 0.0
+
+             # Target from paper
+             target = 2.635
+
+             # Simple combined score for stage 1
+             combined_score = (actual_sum / target) if valid else 0.0
+
+             # Return evaluation metrics
+             return {
+                 "validity": 1.0 if valid else 0.0,
+                 "sum_radii": float(actual_sum),
+                 "target_ratio": float(actual_sum / target if valid else 0.0),
+                 "combined_score": float(combined_score),
+             }
+
+         except TimeoutError as e:
+             print(f"Stage 1 evaluation timed out: {e}")
+             return {"validity": 0.0, "combined_score": 0.0, "error": "Timeout"}
+         except Exception as e:
+             print(f"Stage 1 evaluation failed: {e}")
+             print(traceback.format_exc())
+             return {"validity": 0.0, "combined_score": 0.0, "error": str(e)}
+
+     except Exception as e:
+         print(f"Stage 1 evaluation failed completely: {e}")
+         print(traceback.format_exc())
+         return {"validity": 0.0, "combined_score": 0.0, "error": str(e)}
+
+
+ def evaluate_stage2(program_path):
+     """
+     Second stage evaluation - full evaluation
+     """
+     # Full evaluation as in the main evaluate function
+     return evaluate(program_path)
benchmarks/math/circle_packing/evaluator/Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY evaluator.py .
+ COPY evaluate.sh .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/circle_packing/evaluator/evaluate.sh ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) is accepted but ignored — pure optimization has no data split.
+
+ echo "[$(date '+%H:%M:%S')] eval start: $PROGRAM" >> /tmp/eval.log
+ python /benchmark/evaluator.py "$PROGRAM"