JustinTX committed on
Commit b0e88cf · verified · 1 Parent(s): af83196

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +5 -0
  2. assets/architecture.png +3 -0
  3. assets/benchmarks.png +3 -0
  4. assets/comparison.png +3 -0
  5. assets/logo_vector.png +3 -0
  6. assets/scaling_comparison.png +3 -0
  7. benchmarks/ADRS/README.md +63 -0
  8. benchmarks/ADRS/eplb/config.yaml +37 -0
  9. benchmarks/ADRS/llm_sql/README.md +56 -0
  10. benchmarks/ADRS/llm_sql/initial_program.py +365 -0
  11. benchmarks/ADRS/prism/config.yaml +24 -0
  12. benchmarks/ADRS/prism/evaluator/evaluate.sh +7 -0
  13. benchmarks/ADRS/prism/evaluator/evaluator.py +259 -0
  14. benchmarks/ADRS/prism/initial_program.py +75 -0
  15. benchmarks/ADRS/prism/initial_program_naive.py +30 -0
  16. benchmarks/arc_benchmark/README.md +108 -0
  17. benchmarks/arc_benchmark/config.yaml +51 -0
  18. benchmarks/arc_benchmark/convert_arc_agi2_data.py +63 -0
  19. benchmarks/arc_benchmark/evaluator/Dockerfile +13 -0
  20. benchmarks/arc_benchmark/evaluator/evaluate.sh +7 -0
  21. benchmarks/arc_benchmark/evaluator/evaluator.py +407 -0
  22. benchmarks/arc_benchmark/evaluator/requirements.txt +1 -0
  23. benchmarks/arc_benchmark/evaluator/wrapper.py +98 -0
  24. benchmarks/arc_benchmark/generate_config.py +101 -0
  25. benchmarks/arc_benchmark/initial_program.py +42 -0
  26. benchmarks/arc_benchmark/post_discovery_eval.py +157 -0
  27. benchmarks/frontier-cs-eval/README.md +72 -0
  28. benchmarks/frontier-cs-eval/analyze_results.py +105 -0
  29. benchmarks/frontier-cs-eval/combine_results.py +66 -0
  30. benchmarks/frontier-cs-eval/config.yaml +57 -0
  31. benchmarks/frontier-cs-eval/evaluator.py +174 -0
  32. benchmarks/frontier-cs-eval/initial_program.cpp +6 -0
  33. benchmarks/frontier-cs-eval/run_all_frontiercs.py +70 -0
  34. benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py +404 -0
  35. benchmarks/gpu_mode/mla_decode/config.yaml +355 -0
  36. benchmarks/gpu_mode/mla_decode/initial_program.py +245 -0
  37. benchmarks/gpu_mode/mla_decode/reference.py +520 -0
  38. benchmarks/gpu_mode/mla_decode/requirements.txt +2 -0
  39. benchmarks/gpu_mode/trimul/initial_program.py +84 -0
  40. benchmarks/image_gen/README.md +40 -0
  41. benchmarks/image_gen/sky_festival/evaluator.py +220 -0
  42. benchmarks/math/README.md +43 -0
  43. benchmarks/math/circle_packing/README.md +38 -0
  44. benchmarks/math/circle_packing/codebase/reference/hex_grid.py +43 -0
  45. benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py +94 -0
  46. benchmarks/math/circle_packing/codebase/reference/packing_strategies.md +45 -0
  47. benchmarks/math/circle_packing/config.yaml +54 -0
  48. benchmarks/math/circle_packing/evaluator.py +338 -0
  49. benchmarks/math/circle_packing/evaluator/Dockerfile +11 -0
  50. benchmarks/math/circle_packing/evaluator/evaluate.sh +8 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/logo_vector.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/benchmarks.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/scaling_comparison.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/architecture.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/comparison.png filter=lfs diff=lfs merge=lfs -text
assets/architecture.png ADDED

Git LFS Details

  • SHA256: 3b10c6bfb1734211abab7fa2e53b36931428d842ade3c96cbef255543b3889d8
  • Pointer size: 131 Bytes
  • Size of remote file: 278 kB
assets/benchmarks.png ADDED

Git LFS Details

  • SHA256: 42a69cb4c8119b79901ecfcdf93088e932643d6e0890d3c984dead40c407dc5b
  • Pointer size: 131 Bytes
  • Size of remote file: 758 kB
assets/comparison.png ADDED

Git LFS Details

  • SHA256: 8d68074ff5106764b1328b23ef5e949332aab3541172f8d91e2580d6f168e184
  • Pointer size: 131 Bytes
  • Size of remote file: 399 kB
assets/logo_vector.png ADDED

Git LFS Details

  • SHA256: d74ce6a1024e519a5afc85706133e31bafeb06b48b603a11284845b549cb586e
  • Pointer size: 131 Bytes
  • Size of remote file: 891 kB
assets/scaling_comparison.png ADDED

Git LFS Details

  • SHA256: d2aa00d9f59b5e14fc10d2569b872632fb992ab61fcfbba2ae946bef9deb22d8
  • Pointer size: 131 Bytes
  • Size of remote file: 297 kB
benchmarks/ADRS/README.md ADDED
@@ -0,0 +1,63 @@
1
+ # ADRS: AI-Driven Research for Systems
2
+
3
+ This directory contains the systems optimization benchmarks from the **AI-Driven Research for Systems (ADRS)** initiative at UC Berkeley.
4
+
5
+ ADRS investigates how AI — large language models, evolutionary algorithms, and multi-agent architectures — can autonomously design, optimize, and evaluate computer systems. Instead of treating systems research as a purely manual process, ADRS frames it as a closed-loop optimization problem: propose candidate algorithms, evaluate them against system-level objectives, analyze failure modes, adapt the search strategy, and iterate.
6
+
7
+ Each benchmark below defines a concrete systems task with a provided evaluator, initial program, and configuration. Solutions are evolved using SkyDiscover's evolutionary search loop.
8
+
9
+ ## Benchmarks
10
+
11
+ ### Cloudcast — Multi-Cloud Data Transfer
12
+
13
+ **Directory:** `cloudcast/`
14
+
15
+ Given a network of cloud regions with heterogeneous egress pricing and bandwidth, broadcast a dataset from a source region to multiple destinations at minimum total cost. The evolved algorithm must construct routing topologies (e.g., relay trees, Steiner-like structures) that exploit shared intermediate hops across transfers.
16
+
17
+ ### Expert Parallelism Load Balancer (EPLB)
18
+
19
+ **Directory:** `eplb/`
20
+
21
+ In Mixture-of-Experts (MoE) model inference, a small subset of experts handles each token, leading to GPU load imbalance when certain experts become disproportionately popular. This task evolves an algorithm that decides how many replicas each expert should have and how to assign them across GPUs, optimizing both load-balance quality and rebalancing runtime.
22
+
23
+ ### Model Placement (Prism)
24
+
25
+ **Directory:** `prism/`
26
+
27
+ Assign multiple LLM models to a fixed GPU cluster (80 GB per GPU) such that the worst-case KV-cache pressure ratio across GPUs is minimized. Lower pressure means more memory headroom for serving, improving throughput and stability under varying request loads.
28
+
29
+ ### LLM-SQL — Column Reordering for Prefix Caching
30
+
31
+ **Directory:** `llm_sql/`
32
+
33
+ When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data.
34
+
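+ As a toy illustration (a hypothetical serialization format, for intuition only), putting the shared column first gives consecutive rows a common prompt prefix that a cache can reuse:
+
+ ```python
+ rows = [
+     {"country": "US", "city": "SF"},
+     {"country": "US", "city": "LA"},
+ ]
+
+ def serialize(row, column_order):
+     # One prompt per row; shared leading columns become a shared, cacheable prefix
+     return "|".join(f"{col}={row[col]}" for col in column_order)
+
+ print(serialize(rows[0], ["country", "city"]))  # country=US|city=SF
+ print(serialize(rows[1], ["country", "city"]))  # country=US|city=LA
+ ```
+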
35
+ ### Transaction Scheduling (TXN)
36
+
37
+ **Directory:** `txn_scheduling/`
38
+
39
+ Given a set of database transactions with read/write dependencies on shared keys, find an execution ordering that minimizes the total makespan. The evolved scheduler must respect conflict constraints (read-write and write-write on the same key) while compressing the overall completion time.
40
+
41
+ ### Telemetry Repair
42
+
43
+ **Coming soon.** The Telemetry Repair benchmark is under active development and will be released in a future update.
44
+
45
+ ## Quick Start
46
+
47
+ Each benchmark directory contains:
48
+ - `initial_program.py` — the seed solution for evolution
49
+ - `evaluator.py` — the scoring function
50
+ - `config.yaml` — run configuration
51
+
52
+ Run any benchmark from the repo root:
53
+
54
+ ```bash
55
+ uv run skydiscover-run \
56
+ benchmarks/ADRS/cloudcast/initial_program.py \
57
+ benchmarks/ADRS/cloudcast/evaluator.py \
58
+ -c benchmarks/ADRS/cloudcast/config.yaml \
59
+ -s [your_algorithm] \
60
+ -i 100
61
+ ```
62
+
63
+ See the individual benchmark directories for task-specific setup instructions (e.g., dataset downloads, GPU dependencies).
benchmarks/ADRS/eplb/config.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # Expert Parallelism Load Balancer (EPLB) — MoE Expert Rearrangement
2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
3
+ # NOTE: Requires expert-load.json — see README.md for download instructions.
4
+ language: python
5
+ diff_based_generation: true
6
+ max_iterations: 100
7
+ checkpoint_interval: 5
8
+ max_solution_length: 60000
9
+
10
+ llm:
11
+ api_base: https://api.openai.com/v1
12
+ models:
13
+ - name: "gpt-5"
14
+ weight: 1.0
15
+ max_tokens: 32000
16
+ timeout: 600
17
+
18
+ prompt:
19
+ system_message: |-
20
+ You are an expert programmer specializing in optimization algorithms. Your task
21
+ is to improve the Mixture-of-Experts (MoE) Expert Parallelism Load Balancer
22
+ (EPLB) expert rearrangement algorithm.
23
+
24
+ The algorithm takes the load metrics recorded by the vLLM server and
25
+ rearranges the experts to balance the load. It can replicate some experts
26
+ to achieve better load balancing.
27
+
28
+ Your goal is two-fold:
29
+ 1. Improve the algorithm to achieve better load balancing, and
30
+ 2. Improve the algorithm to be more efficient, i.e. reduce the execution time
31
+ of the algorithm itself, since perfect load balancing is NP-hard.
32
+
33
+ The current algorithm is implemented in the `rebalance_experts` function.
34
+
35
+ evaluator:
36
+ timeout: 360
37
+
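For orientation, here is a minimal sketch of the kind of greedy heuristic such a rebalancer might start from. The name `rebalance_experts` comes from the system message above; the signature, the input shapes, and the omission of expert replication are assumptions for illustration only:

```python
def rebalance_experts(expert_loads: list[float], num_gpus: int) -> list[list[int]]:
    """Greedy sketch: assign experts, heaviest first, to the lightest GPU.

    expert_loads[i] is the observed load of expert i. Returns, for each GPU,
    the list of expert indices placed on it (replication omitted for brevity).
    """
    assignment: list[list[int]] = [[] for _ in range(num_gpus)]
    gpu_load = [0.0] * num_gpus
    for expert in sorted(range(len(expert_loads)), key=lambda i: -expert_loads[i]):
        target = min(range(num_gpus), key=gpu_load.__getitem__)  # lightest GPU
        assignment[target].append(expert)
        gpu_load[target] += expert_loads[expert]
    return assignment
```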
benchmarks/ADRS/llm_sql/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # LLM-SQL — Column Reordering for Prefix Caching
2
+
3
+ When rows of a table are serialized into LLM prompts sequentially, consecutive rows that share leading column values can reuse cached prefixes. This task evolves a column-reordering strategy that maximizes prefix-cache hit rates across multiple real-world datasets without altering the underlying data.
4
+
5
+ ## Setup
6
+
7
+ 1. **Download the datasets** (~69 MB total):
8
+
9
+ ```bash
10
+ cd benchmarks/ADRS/llm_sql
11
+ bash download_dataset.sh
12
+ ```
13
+
14
+ This downloads 5 CSV datasets into `datasets/`:
15
+ - `movies.csv` — Rotten Tomatoes movie reviews (~9 MB)
16
+ - `beer.csv` — Beer review dataset (~2.5 MB)
17
+ - `BIRD.csv` — BIRD text-to-SQL dataset (~34 MB)
18
+ - `PDMX.csv` — PDMX metadata dataset (~7.4 MB)
19
+ - `products.csv` — Amazon product catalog (~16 MB)
20
+
21
+ 2. **Set your API key:**
22
+
23
+ ```bash
24
+ export OPENAI_API_KEY=...
25
+ ```
26
+
27
+ ## Run
28
+
29
+ From the repo root:
30
+
31
+ ```bash
32
+ uv run skydiscover-run \
33
+ benchmarks/ADRS/llm_sql/initial_program.py \
34
+ benchmarks/ADRS/llm_sql/evaluator.py \
35
+ -c benchmarks/ADRS/llm_sql/config.yaml \
36
+ -s [your_algorithm] \
37
+ -i 100
38
+ ```
39
+
40
+ ## Scoring
41
+
42
+ Combined score: `0.95 * average_hit_rate + 0.05 * (12 - min(12, avg_runtime)) / 12`
43
+
44
+ - **Hit rate** (95% weight): prefix-cache hit count normalized across 5 datasets
45
+ - **Runtime** (5% weight): wall-clock seconds for the reordering algorithm
46
+
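+ Written out as code, the formula is simply (a sketch; the names are illustrative, not the evaluator's actual identifiers):
+
+ ```python
+ def combined_score(average_hit_rate: float, avg_runtime: float) -> float:
+     # Runtime term is 1.0 at 0 s and decays linearly to 0.0 at >= 12 s
+     runtime_term = (12 - min(12.0, avg_runtime)) / 12
+     return 0.95 * average_hit_rate + 0.05 * runtime_term
+ ```
+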
47
+ ## Files
48
+
49
+ | File | Description |
50
+ |------|-------------|
51
+ | `initial_program.py` | Baseline `Evolved` class with `reorder()` method to evolve |
52
+ | `evaluator.py` | Scores programs on prefix hit rate and runtime across 5 datasets |
53
+ | `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) |
54
+ | `solver.py` | Base `Algorithm` class and greedy baseline |
55
+ | `utils.py` | Prefix hit count evaluation utilities |
56
+ | `download_dataset.sh` | Script to download required CSV datasets |
benchmarks/ADRS/llm_sql/initial_program.py ADDED
@@ -0,0 +1,365 @@
1
+ # EVOLVE-BLOCK-START
2
+ import pandas as pd
3
+ from solver import Algorithm
4
+ from typing import Tuple, List, Dict
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from functools import lru_cache
7
+ from collections import Counter
8
+ import networkx as nx
9
+
10
+
11
+ class Evolved(Algorithm):
12
+ """
13
+ GGR algorithm
14
+ """
15
+
16
+ def __init__(self, df: pd.DataFrame = None):
17
+ self.df = df
18
+
19
+ self.dep_graph = None # NOTE: not used, for one way dependency
20
+
21
+ self.num_rows = 0
22
+ self.num_cols = 0
23
+ self.column_stats = None
24
+ self.val_len = None
25
+ self.row_stop = None
26
+ self.col_stop = None
27
+ self.base = 2000
28
+
29
+ def find_max_group_value(self, df: pd.DataFrame, value_counts: Dict, early_stop: int = 0) -> str:
30
+ # NOTE: recalculate value counts and length for each value
31
+ value_counts = Counter(df.stack())
32
+ weighted_counts = {val: self.val_len[val] * (count - 1) for val, count in value_counts.items()} # if count > 1
33
+ if not weighted_counts:
34
+ return None
35
+ max_group_val, max_weighted_count = max(weighted_counts.items(), key=lambda x: x[1])
36
+ if max_weighted_count < early_stop:
37
+ return None
38
+ return max_group_val
39
+
40
+ def reorder_columns_for_value(self, row, value, column_names, grouped_rows_len: int = 1):
41
+ # cols_with_value will now use attribute access instead of indexing with row[]
42
+ cols_with_value = []
43
+ for idx, col in enumerate(column_names):
44
+ if hasattr(row, col) and getattr(row, col) == value:
45
+ cols_with_value.append(col)
46
+ elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) == value:
47
+ cols_with_value.append(col)
48
+ else:
49
+ attr_name = f"_{idx}"
50
+ if hasattr(row, attr_name) and getattr(row, attr_name) == value:
51
+ cols_with_value.append(attr_name)
52
+
53
+ if self.dep_graph is not None and grouped_rows_len > 1:
54
+ # NOTE: experimental
55
+ reordered_cols = []
56
+ for col in cols_with_value:
57
+ dependent_cols = self.get_dependent_columns(col)
58
+
59
+ # check if dependent columns are in row, and if column exists in row attributes
60
+ valid_dependent_cols = []
61
+ for idx, dep_col in enumerate(dependent_cols):
62
+ if hasattr(row, dep_col):
63
+ valid_dependent_cols.append(dep_col)
64
+ elif hasattr(row, dep_col.replace(" ", "_")):
65
+ valid_dependent_cols.append(dep_col)
66
+ else:
67
+ attr_name = f"_{idx}"
68
+ if hasattr(row, attr_name):
69
+ valid_dependent_cols.append(dep_col)
70
+
71
+ reordered_cols.extend([col] + valid_dependent_cols)
72
+ cols_without_value = [col for col in column_names if col not in reordered_cols]
73
+ reordered_cols.extend(cols_without_value)
74
+ assert len(reordered_cols) == len(
75
+ column_names
76
+ ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}"
77
+ return [getattr(row, col) for col in reordered_cols], cols_with_value
78
+ else:
79
+ cols_without_value = []
80
+ for idx, col in enumerate(column_names):
81
+ if hasattr(row, col) and getattr(row, col) != value:
82
+ cols_without_value.append(col)
83
+ elif hasattr(row, col.replace(" ", "_")) and getattr(row, col.replace(" ", "_")) != value:
84
+ cols_without_value.append(col)
85
+ else:
86
+ # Handle some edge cases
87
+ attr_name = f"_{idx}"
88
+ if hasattr(row, attr_name) and getattr(row, attr_name) != value:
89
+ cols_without_value.append(attr_name)
90
+
91
+ reordered_cols = cols_with_value + cols_without_value
92
+ assert len(reordered_cols) == len(
93
+ column_names
94
+ ), f"Reordered cols len: {len(reordered_cols)} Original cols len: {len(column_names)}"
95
+ return [getattr(row, col) for col in reordered_cols], cols_with_value
96
+
97
+ def get_dependent_columns(self, col: str) -> List[str]:
98
+ if self.dep_graph is None or not self.dep_graph.has_node(col):
99
+ return []
100
+ return list(nx.descendants(self.dep_graph, col))
101
+
102
+ @lru_cache(maxsize=None)
103
+ def get_cached_dependent_columns(self, col: str) -> List[str]:
104
+ return self.get_dependent_columns(col)
105
+
106
+ def fixed_reorder(self, df: pd.DataFrame, row_sort: bool = True) -> Tuple[pd.DataFrame, List[List[str]]]:
107
+ num_rows, column_stats = self.calculate_col_stats(df, enable_index=True)
108
+ reordered_columns = [col for col, _, _, _ in column_stats]
109
+ reordered_df = df[reordered_columns]
110
+
111
+ assert reordered_df.shape == df.shape
112
+ column_orderings = [reordered_columns] * num_rows
113
+
114
+ if row_sort:
115
+ reordered_df = reordered_df.sort_values(by=reordered_columns, axis=0)
116
+
117
+ return reordered_df, column_orderings
118
+
119
+ def column_recursion(self, result_df, max_value, grouped_rows, row_stop, col_stop, early_stop):
120
+ cols_settled = []
121
+ with ThreadPoolExecutor() as executor:
122
+ futures = [
123
+ executor.submit(self.reorder_columns_for_value, row, max_value, grouped_rows.columns.tolist(), len(grouped_rows))
124
+ for row in grouped_rows.itertuples(index=False)
125
+ ]
126
+ for i, future in enumerate(as_completed(futures)):
127
+ reordered_row, cols_settled = future.result()
128
+ result_df.loc[i] = reordered_row
129
+
130
+ grouped_value_counts = Counter()
131
+
132
+ if not result_df.empty:
133
+ # Group by the first column
134
+ grouped_result_df = result_df.groupby(result_df.columns[0])
135
+ grouped_value_counts = Counter(grouped_rows.stack()) # this is still faster than updating from cached value counts
136
+
137
+ for _, group in grouped_result_df:
138
+ if group[group.columns[0]].iloc[0] != max_value:
139
+ continue
140
+
141
+ dependent_cols = self.get_cached_dependent_columns(group.columns[0])
142
+ length_of_settle_cols = len(cols_settled)
143
+
144
+ if dependent_cols:
145
+ assert length_of_settle_cols >= 1, f"Dependent columns should be no less than 1, but got {length_of_settle_cols}"
146
+
147
+ # test the first length_of_settle_cols columns, each column has nunique == 1
148
+ for col in group.columns[:length_of_settle_cols]:
149
+ assert group[col].nunique() == 1, f"Column {col} should have nunique == 1, but got {group[col].nunique()}"
150
+
151
+ # drop all the settled columns and reorder the rest
152
+ group_remainder = group.iloc[:, length_of_settle_cols:]
153
+ else:
154
+ group_remainder = group.iloc[:, 1:]
155
+
156
+ grouped_remainder_value_counts = Counter(group_remainder.stack())
157
+
158
+ reordered_group_remainder, _ = self.recursive_reorder(
159
+ group_remainder, grouped_remainder_value_counts, early_stop=early_stop, row_stop=row_stop, col_stop=col_stop + 1
160
+ )
161
+ # Update the group with the reordered columns
162
+ if dependent_cols:
163
+ group.iloc[:, length_of_settle_cols:] = reordered_group_remainder.values
164
+ else:
165
+ group.iloc[:, 1:] = reordered_group_remainder.values
166
+
167
+ result_df.update(group)
168
+ break
169
+
170
+ return result_df, grouped_value_counts
171
+
172
+ def recursive_reorder(
173
+ self,
174
+ df: pd.DataFrame,
175
+ value_counts: Dict,
176
+ early_stop: int = 0,
177
+ original_columns: List[str] = None,
178
+ row_stop: int = 0,
179
+ col_stop: int = 0,
180
+ ) -> Tuple[pd.DataFrame, List[List[str]]]:
181
+ if df.empty or len(df.columns) == 0 or len(df) == 0:
182
+ return df, []
183
+
184
+ if self.row_stop is not None and row_stop >= self.row_stop:
185
+ return self.fixed_reorder(df)
186
+
187
+ if self.col_stop is not None and col_stop >= self.col_stop:
188
+ return self.fixed_reorder(df)
189
+
190
+ if original_columns is None:
191
+ original_columns = df.columns.tolist()
192
+
193
+ # Find the max group value using updated counts
194
+ max_value = self.find_max_group_value(df, value_counts, early_stop=early_stop)
195
+ if max_value is None:
196
+ # If there is no max value, then fall back to fixed reorder
197
+ return self.fixed_reorder(df)
198
+
199
+ grouped_rows = df[df.isin([max_value]).any(axis=1)]
200
+ remaining_rows = df[~df.isin([max_value]).any(axis=1)]
201
+
202
+ # If there is no grouped rows, return the original DataFrame
203
+ if grouped_rows.empty:
204
+ return self.fixed_reorder(df)
205
+
206
+ result_df = pd.DataFrame(columns=df.columns)
207
+
208
+ reordered_remaining_rows = pd.DataFrame(columns=df.columns) # Initialize empty dataframe first
209
+
210
+ # Column Recursion
211
+ result_df, grouped_value_counts = self.column_recursion(result_df, max_value, grouped_rows, row_stop, col_stop, early_stop)
212
+
213
+ remaining_value_counts = value_counts - grouped_value_counts # Approach 1 - update remaining value counts with subtraction
214
+
215
+ # Row Recursion
216
+ reordered_remaining_rows, _ = self.recursive_reorder(
217
+ remaining_rows, remaining_value_counts, early_stop=early_stop, row_stop=row_stop + 1, col_stop=col_stop
218
+ )
219
+ old_column_names = result_df.columns.tolist()
220
+ result_cols_reset = result_df.reset_index(drop=True)
221
+ result_rows_reset = reordered_remaining_rows.reset_index(drop=True)
222
+ final_result_df = pd.DataFrame(result_cols_reset.values.tolist() + result_rows_reset.values.tolist())
223
+
224
+ if row_stop == 0 and col_stop == 0:
225
+ final_result_df.columns = old_column_names
226
+ final_result_df.columns = final_result_df.columns.tolist()[:-1] + ["original_index"]
227
+
228
+ return final_result_df, []
229
+
230
+ def recursive_split_and_reorder(self, df: pd.DataFrame, original_columns: List[str] = None, early_stop: int = 0):
231
+ """
232
+ Recursively split the DataFrame into halves until the size is <= self.base (2000 by default), then apply the recursive reorder function.
233
+ """
234
+ if len(df) <= self.base:
235
+ initial_value_counts = Counter(df.stack())
236
+ return self.recursive_reorder(df, initial_value_counts, early_stop, original_columns, row_stop=0, col_stop=0)[0]
237
+
238
+ mid_index = len(df) // 2
239
+ df_top_half = df.iloc[:mid_index]
240
+ df_bottom_half = df.iloc[mid_index:]
241
+
242
+ with ThreadPoolExecutor() as executor:
243
+ future_top = executor.submit(self.recursive_split_and_reorder, df_top_half, original_columns, early_stop)
244
+ future_bottom = executor.submit(self.recursive_split_and_reorder, df_bottom_half, original_columns, early_stop)
245
+
246
+ reordered_top_half = future_top.result()
247
+ reordered_bottom_half = future_bottom.result()
248
+
249
+ assert reordered_bottom_half.shape == df_bottom_half.shape
250
+ reordered_df = pd.concat([reordered_top_half, reordered_bottom_half], axis=0, ignore_index=True)
251
+
252
+ assert reordered_df.shape == df.shape
253
+
254
+ return reordered_df
255
+
256
+ @lru_cache(maxsize=None)
257
+ def calculate_length(self, value):
258
+ if isinstance(value, bool):
259
+ return 4**2
260
+ if isinstance(value, (int, float)):
261
+ return len(str(value)) ** 2
262
+ if isinstance(value, str):
263
+ return len(value) ** 2
264
+ return 0
265
+
266
+ def reorder(
267
+ self,
268
+ df: pd.DataFrame,
269
+ early_stop: int = 0,
270
+ row_stop: int = None,
271
+ col_stop: int = None,
272
+ col_merge: List[List[str]] = [],
273
+ one_way_dep: List[Tuple[str, str]] = [],
274
+ distinct_value_threshold: float = 0.8,
275
+ parallel: bool = True,
276
+ ) -> Tuple[pd.DataFrame, List[List[str]]]:
277
+ # Prepare
278
+ initial_df = df.copy()
279
+ if col_merge:
280
+ self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True)
281
+ reordered_columns = [col for col, _, _, _ in self.column_stats]
282
+ for col_to_merge in col_merge:
283
+ final_col_order = [col for col in reordered_columns if col in col_to_merge]
284
+ df = self.merging_columns(df, final_col_order, prepended=False)
285
+ self.num_rows, self.column_stats = self.calculate_col_stats(df, enable_index=True)
286
+ self.column_stats = {col: (num_groups, avg_len, score) for col, num_groups, avg_len, score in self.column_stats}
287
+
288
+ # One way dependency statistics [not used]
289
+ if one_way_dep is not None and len(one_way_dep) > 0:
290
+ self.dep_graph = nx.DiGraph()
291
+ for dep in one_way_dep:
292
+ col1 = [col for col in df.columns if dep[0] in col]
293
+ col2 = [col for col in df.columns if dep[1] in col]
294
+ assert len(col1) == 1, f"Expected one column to match {dep[0]}, but got {len(col1)}"
295
+ assert len(col2) == 1, f"Expected one column to match {dep[1]}, but got {len(col2)}"
296
+ col1 = col1[0]
297
+ col2 = col2[0]
298
+ self.dep_graph.add_edge(col1, col2)
299
+
300
+ # Discard too distinct columns by threshold [optional]
301
+ nunique_threshold = len(df) * distinct_value_threshold
302
+ columns_to_discard = [col for col in df.columns if df[col].nunique() > nunique_threshold]
303
+ columns_to_discard = sorted(columns_to_discard, key=lambda x: self.column_stats[x][2], reverse=True)
304
+ columns_to_recurse = [col for col in df.columns if col not in columns_to_discard]
305
+ df["original_index"] = range(len(df))
306
+ discarded_columns_df = df[columns_to_discard + ["original_index"]]
307
+ df_to_recurse = df[columns_to_recurse + ["original_index"]]
308
+ recurse_df = df_to_recurse
309
+
310
+ self.column_stats = {col: stats for col, stats in self.column_stats.items() if col not in columns_to_discard}
311
+ initial_value_counts = Counter(recurse_df.stack())
312
+ self.val_len = {val: self.calculate_length(val) for val in initial_value_counts.keys()}
313
+
314
+ self.row_stop = row_stop if row_stop else len(recurse_df)
315
+ self.col_stop = col_stop if col_stop else len(recurse_df.columns.tolist())
316
+ print("*" * 80)
317
+ print(f"DF columns = {df.columns}")
318
+ # print(f"Early stop = {early_stop}")
319
+ # print(f"Row recursion stop depth = {self.row_stop}, Column recursion stop depth = {self.col_stop}")
320
+ print("*" * 80)
321
+
322
+ # Early stop and fall back
323
+ recurse_df, _ = self.fixed_reorder(recurse_df)
324
+
325
+ # Recursive reordering
326
+ self.num_cols = len(recurse_df.columns)
327
+ if parallel:
328
+ reordered_df = self.recursive_split_and_reorder(recurse_df, original_columns=columns_to_recurse, early_stop=early_stop)
329
+ else:
330
+ reordered_df, _ = self.recursive_reorder(
331
+ recurse_df,
332
+ initial_value_counts,
333
+ early_stop=early_stop,
334
+ )
335
+
336
+ assert (
337
+ reordered_df.shape == recurse_df.shape
338
+ ), f"Reordered DataFrame shape {reordered_df.shape} does not match original DataFrame shape {recurse_df.shape}"
339
+ assert recurse_df["original_index"].is_unique, "Passed in recurse index contains duplicates!"
340
+ assert reordered_df["original_index"].is_unique, "Reordered index contains duplicates!"
341
+
342
+ if len(columns_to_discard) > 0:
343
+ final_df = pd.merge(reordered_df, discarded_columns_df, on="original_index", how="left")
344
+ else:
345
+ final_df = reordered_df
346
+
347
+ final_df = final_df.drop(columns=["original_index"])
348
+
349
+ if not col_merge:
350
+ assert (
351
+ final_df.shape == initial_df.shape
352
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
353
+ else:
354
+ assert (
355
+ final_df.shape[0] == initial_df.shape[0]
356
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {initial_df.shape}"
357
+ assert (
358
+ final_df.shape[1] == recurse_df.shape[1] + len(columns_to_discard) - 1
359
+ ), f"Final DataFrame shape {final_df.shape} does not match original DataFrame shape {recurse_df.shape}"
360
+
361
+ # sort rows lexicographically by all columns to get the final order
362
+ final_df = final_df.sort_values(by=final_df.columns.to_list(), axis=0)
363
+ return final_df, []
364
+
365
+ # EVOLVE-BLOCK-END
benchmarks/ADRS/prism/config.yaml ADDED
@@ -0,0 +1,24 @@
1
+ # Prism (GPU Model Placement) - Minimize the Maximum KV Cache Pressure (KVPR) Across GPUs
2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
3
+ language: python
4
+ diff_based_generation: true
5
+ max_iterations: 100
6
+ checkpoint_interval: 5
7
+ max_solution_length: 60000
8
+
9
+ llm:
10
+ api_base: https://api.openai.com/v1
11
+ models:
12
+ - name: "gpt-5"
13
+ weight: 1.0
14
+ max_tokens: 32000
15
+ timeout: 600
16
+
17
+ prompt:
18
+ system_message: |-
19
+ You are an expert in model placement on GPUs. Your task is to improve a model placement algorithm by improving the function named compute_model_placement in the initial program, which places models onto the available GPUs.
20
+ The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring the models fit into the GPUs' memory. KVPR is the KV cache pressure of a GPU; it indicates how crowded the GPU is. For a specific GPU, KVPR is computed as sum(model.req_rate/model.slo for model in models) / (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are the models placed on that GPU. The generated program should be as simple as possible, and the code should execute correctly without errors.
21
+
22
+ evaluator:
23
+ timeout: 360
24
+
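For reference, the KVPR formula from the system message, written out as a minimal Python sketch (GPU_MEM_SIZE and the Model fields follow `evaluator/evaluator.py`):

```python
GPU_MEM_SIZE = 80  # GB, as in evaluator/evaluator.py

def gpu_kvpr(models) -> float:
    """KV cache pressure of one GPU, per the formula in the system message."""
    weighted_req_rate = sum(m.req_rate / m.slo for m in models)
    free_mem = GPU_MEM_SIZE - sum(m.model_size for m in models)
    # No memory headroom left: pressure is effectively unbounded
    return float("inf") if free_mem <= 0 else weighted_req_rate / free_mem
```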
benchmarks/ADRS/prism/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ PROGRAM="$1"
5
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
6
+
7
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/ADRS/prism/evaluator/evaluator.py ADDED
@@ -0,0 +1,259 @@
1
+ import importlib.util
2
+ import numpy as np
3
+ import time
4
+ import concurrent.futures
5
+ import traceback
6
+ from dataclasses import dataclass
7
+
8
+ GPU_MEM_SIZE = 80 # GB
9
+ MIN_INT = float('-inf')  # sentinel lower than any real KVPR value
10
+
11
+ @dataclass
12
+ class Model:
13
+ model_name: str
14
+ model_size: int
15
+ req_rate: int
16
+ slo: int
17
+ cur_gpu_id: int
18
+
19
+
20
+ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30):
21
+ """
22
+ Run a function with a timeout using concurrent.futures
23
+ """
24
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
25
+ future = executor.submit(func, *args, **kwargs)
26
+ try:
27
+ result = future.result(timeout=timeout_seconds)
28
+ return result
29
+ except concurrent.futures.TimeoutError:
30
+ raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
31
+
32
+
33
+ def safe_float(value):
34
+ """Convert a value to float safely"""
35
+ try:
36
+ if np.isnan(value) or np.isinf(value):
37
+ return 0.0
38
+ return float(value)
39
+ except (TypeError, ValueError):
40
+ return 0.0
41
+
42
+ def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool:
43
+ """
44
+ Verify whether the models fit into GPU memory
45
+ """
46
+ # Check if the placement data is valid
47
+ if placement_data is None:
48
+ return False
49
+
50
+ # Check that the models placed on each GPU fit within its memory
51
+ for gpu_id, models in placement_data.items():
52
+ if sum(model.model_size for model in models) > GPU_MEM_SIZE:
53
+ return False
54
+
55
+ return True
56
+
57
+
58
+ def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float:
59
+ """
60
+ Calculate the KVCache pressure
61
+ """
62
+ max_kvpr = MIN_INT
63
+ for gpu_id, models in placement_data.items():
64
+ total_model_size = sum(model.model_size for model in models)
65
+ total_weighted_req_rate = sum(model.req_rate / model.slo for model in models)
66
+ if GPU_MEM_SIZE - total_model_size > 0:
67
+ kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size)
68
+ else:
69
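+ # No free memory headroom: treat this GPU's pressure as effectively infinite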
+ kvpr = 1000000
70
+ max_kvpr = max(max_kvpr, kvpr)
71
+
72
+ return max_kvpr
73
+
74
+
75
+ def generate_test_gpu_models(num_tests=50):
76
+ """
77
+ Generate multiple test cases (GPU counts and model lists) with different characteristics
78
+ """
79
+ test_cases = []
80
+ np.random.seed(42)
81
+
82
+ for i in range(num_tests):
83
+ gpu_num = np.random.randint(5, 10)
84
+ gpu_models = []
85
+ for j in range(gpu_num*2):
86
+ model_size = np.random.randint(10, 30)
87
+ req_rate = np.random.randint(1, 10)
88
+ slo = np.random.randint(5, 10)
89
+ gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j))
90
+
91
+ test_cases.append((gpu_num, gpu_models))
92
+
93
+ return test_cases
94
+
95
+ def evaluate(program_path):
96
+ """
97
+ Main evaluation function that tests the model placement algorithm
98
+ on multiple generated test cases and calculates the composite performance metric.
99
+ """
100
+ try:
101
+ # Load the program
102
+ spec = importlib.util.spec_from_file_location("program", program_path)
103
+ program = importlib.util.module_from_spec(spec)
104
+ spec.loader.exec_module(program)
105
+
106
+ # Check if required function exists
107
+ if not hasattr(program, "compute_model_placement"):
108
+ return {
109
+ "max_kvpr": 0.0,
110
+ "success_rate": 0.0,
111
+ "combined_score": 0.0,
112
+ "error": "Missing compute_model_placement function",
113
+ }
114
+
115
+ # Generate test gpu and models
116
+ test_gpu_models = generate_test_gpu_models()
117
+
118
+ # Collect metrics across all tests
119
+ all_kvpr = []
120
+ all_metrics = []
121
+ successful_runs = 0
122
+
123
+ for i, (gpu_num, gpu_models) in enumerate(test_gpu_models):
124
+ try:
125
+ # Run the algorithm with timeout
126
+ start_time = time.time()
127
+
128
+ # Call the program's main function
129
+ result = run_with_timeout(
130
+ program.compute_model_placement,
131
+ kwargs={
132
+ 'gpu_num': gpu_num,
133
+ 'models': gpu_models
134
+ },
135
+ timeout_seconds=10
136
+ )
137
+
138
+ execution_time = time.time() - start_time
139
+
140
+ # Validate result format
141
+ if not isinstance(result, dict):
142
+ return {
143
+ "max_kvpr": 0.0,
144
+ "success_rate": 0.0,
145
+ "combined_score": 0.0,
146
+ "error": f"Placement {i}: Expected dict, got {type(result).__name__}",
147
+ }
148
+
149
+ # Validate all models are placed
150
+ placed_models = []
151
+ for gpu_id, assigned_models in result.items():
152
+ if not isinstance(assigned_models, list):
153
+ return {
154
+ "max_kvpr": 0.0,
155
+ "success_rate": 0.0,
156
+ "combined_score": 0.0,
157
+ "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}",
158
+ }
159
+ placed_models.extend(assigned_models)
160
+
161
+ if len(placed_models) != len(gpu_models):
162
+ return {
163
+ "max_kvpr": 0.0,
164
+ "success_rate": 0.0,
165
+ "combined_score": 0.0,
166
+ "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}",
167
+ }
168
+
169
+ # Check for duplicate placements (by object identity)
170
+ placed_ids = [id(m) for m in placed_models]
171
+ if len(set(placed_ids)) != len(placed_ids):
172
+ return {
173
+ "max_kvpr": 0.0,
174
+ "success_rate": 0.0,
175
+ "combined_score": 0.0,
176
+ "error": f"Duplicate models detected",
177
+ }
178
+
179
+ # Check placed models are the exact input objects
180
+ original_ids = {id(m) for m in gpu_models}
181
+ if set(placed_ids) != original_ids:
182
+ return {
183
+ "max_kvpr": 0.0,
184
+ "success_rate": 0.0,
185
+ "combined_score": 0.0,
186
+ "error": "Placed models don't match input models (missing or foreign models)",
187
+ }
188
+
189
+ # Verify GPU memory constraints
190
+ if not verify_gpu_mem_constraint(result):
191
+ return {
192
+ "max_kvpr": 0.0,
193
+ "success_rate": 0.0,
194
+ "combined_score": 0.0,
195
+ "error": f"GPU memory constraint violated",
196
+ }
197
+
198
+ # Calculate metrics using the generated test signal
199
+ max_kvpr = calculate_kvcache_pressure(result)
200
+
201
+ # Store metrics
202
+ metrics = {
203
+ 'max_kvpr': safe_float(max_kvpr),
204
+ 'execution_time': safe_float(execution_time),
205
+ }
206
+
207
+ all_kvpr.append(safe_float(max_kvpr))
208
+ all_metrics.append(metrics)
209
+ successful_runs += 1
210
+
211
+ except TimeoutError:
212
+ print(f"Placement {i}: Timeout")
213
+ continue
214
+ except Exception as e:
215
+ print(f"Placement {i}: Error - {str(e)}")
216
+ continue
217
+
218
+ # If no successful runs, return minimal scores
219
+ if successful_runs == 0:
220
+ return {
221
+ "max_kvpr": 0.0,
222
+ "success_rate": 0.0,
223
+ "combined_score": 0.0,
224
+ "error": "All test signals failed"
225
+ }
226
+
227
+ print(all_metrics)
228
+ # Calculate aggregate metrics
229
+ avg_kvpr = np.mean(all_kvpr)
230
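+ # Report the inverse of the average max KVPR so that lower pressure yields a higher score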
+ if avg_kvpr != 0:
231
+ avg_kvpr = 1.0 / avg_kvpr
232
+ avg_execution_time = np.mean([m['execution_time'] for m in all_metrics])
233
+ success_rate = successful_runs / len(test_gpu_models)
234
+
235
+ return {
236
+ "max_kvpr": safe_float(avg_kvpr),
237
+ "execution_time": safe_float(avg_execution_time),
238
+ "success_rate": safe_float(success_rate),
239
+ "combined_score": safe_float(avg_kvpr) + safe_float(success_rate),
240
+ }
241
+
242
+ except Exception as e:
243
+ print(f"Evaluation failed: {str(e)}")
244
+ print(traceback.format_exc())
245
+ return {
246
+ "max_kvpr": 0.0,
247
+ "success_rate": 0.0,
248
+ "combined_score": 0.0,
249
+ "error": str(e)
250
+ }
251
+
252
+
253
+ if __name__ == "__main__":
254
+ # Backwards-compat: bridges old evaluate() -> dict to the container JSON
255
+ # protocol. wrapper.py is auto-injected at build time from
256
+ # skydiscover/evaluation/wrapper.py.
257
+ from wrapper import run
258
+
259
+ run(evaluate)
benchmarks/ADRS/prism/initial_program.py ADDED
@@ -0,0 +1,75 @@
1
+ GPU_MEM_SIZE = 80 # GB
2
+
3
+ # EVOLVE-BLOCK-START
4
+
5
+ def compute_model_placement(gpu_num, models):
6
+ """
7
+ Compute a model placement that minimizes the maximum KVPR across all GPUs.
8
+
9
+ Args:
10
+ gpu_num: Number of GPUs
11
+ models: List of models to place
12
+
13
+ Returns:
14
+ A placement of models to GPUs
15
+ """
16
+
17
+ # Greedy KVPR-minimizing placement based on Algorithm 1 (without τ check)
18
+ # 1) Sort models by r_j / s_j in descending order
19
+ sorted_models = sorted(models, key=lambda m: (m.req_rate / m.slo), reverse=True)
20
+
21
+ # 2) Initialize per-GPU states
22
+ placement = {gpu_id: [] for gpu_id in range(gpu_num)}
23
+ shared_kv = [GPU_MEM_SIZE for _ in range(gpu_num)] # remaining memory per GPU
24
+ weighted_req_rate = [0.0 for _ in range(gpu_num)] # sum of r_j / s_j per GPU
25
+
26
+ # 3) Assign each model to the GPU that minimizes current KVPR while fitting in memory
27
+ for model in sorted_models:
28
+ best_idx = None
29
+ best_ratio = float('inf')
30
+
31
+ for gpu_id in range(gpu_num):
32
+ if model.model_size <= shared_kv[gpu_id] and shared_kv[gpu_id] > 0:
33
+ current_ratio = weighted_req_rate[gpu_id] / shared_kv[gpu_id]
34
+ if current_ratio < best_ratio:
35
+ best_ratio = current_ratio
36
+ best_idx = gpu_id
37
+
38
+ # Failure: if no GPU can fit, raise an error instead of overcommitting
39
+ if best_idx is None:
40
+ raise ValueError(
41
+ f"Unable to place model of size {model.model_size} GB on any GPU. "
42
+ f"Remaining per-GPU memory: {shared_kv}"
43
+ )
44
+
45
+ placement[best_idx].append(model)
46
+ weighted_req_rate[best_idx] += model.req_rate / model.slo
47
+ shared_kv[best_idx] -= model.model_size
48
+
49
+ return placement
50
+
51
+ # EVOLVE-BLOCK-END
52
+
53
+
54
+ if __name__ == "__main__":
55
+ # Test the algorithm
56
+
57
+ from evaluator import generate_test_gpu_models
58
+ from evaluator import calculate_kvcache_pressure
59
+ from evaluator import safe_float
60
+ import numpy as np
61
+
62
+ test_cases = generate_test_gpu_models()
63
+ all_kvpr = []
64
+ for i, (gpu_num, gpu_models) in enumerate(test_cases):
65
+
66
+ results = compute_model_placement(gpu_num, gpu_models)
67
+ max_kvpr = calculate_kvcache_pressure(results)
68
+ all_kvpr.append(safe_float(max_kvpr))
69
+
70
+ avg_kvpr = np.mean(all_kvpr)
71
+ if avg_kvpr != 0:
72
+ avg_kvpr = 1.0 / avg_kvpr
73
+
74
+
75
+ print(f"Max KVPR: {avg_kvpr:.3f}")
benchmarks/ADRS/prism/initial_program_naive.py ADDED
@@ -0,0 +1,30 @@
1
+ # EVOLVE-BLOCK-START
2
+
3
+ GPU_MEM_SIZE = 80 # GB
4
+
5
+ def compute_model_placement(gpu_num, models):
6
+ """
7
+ Compute a model placement that minimizes the maximum KVPR across all GPUs.
8
+
9
+ Args:
10
+ gpu_num: Number of GPUs
11
+ models: List of models to place
12
+
13
+ Returns:
14
+ A placement of models to GPUs
15
+ """
16
+
17
+ # greedy first-fit: place each model on the lowest-numbered GPU with enough free memory
18
+
19
+ placement = dict()
20
+ for gpu_id in range(gpu_num):
21
+ placement[gpu_id] = []
22
+
23
+ for model in models:
24
+ for gpu_id in range(gpu_num):
25
+ if model.model_size <= GPU_MEM_SIZE - sum(m.model_size for m in placement[gpu_id]):
26
+ placement[gpu_id].append(model)
27
+ break
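+ # NOTE: a model that fits on no GPU is silently dropped; the evaluator rejects such placements ("Not all models placed")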
28
+ return placement
29
+
30
+ # EVOLVE-BLOCK-END
benchmarks/arc_benchmark/README.md ADDED
@@ -0,0 +1,108 @@
1
+ # ARC Benchmark
2
+
3
+ Evolves ARC-AGI visual reasoning task solutions using SkyDiscover.
4
+
5
+ ## Setup
6
+
7
+ ### 1. Download ARC data
8
+
9
+ Clone the ARC-AGI-2 repo and convert the data:
10
+
11
+ ```bash
12
+ cd benchmarks/arc_benchmark
13
+ git clone https://github.com/arcprize/ARC-AGI-2.git /tmp/ARC-AGI-2
14
+ OUT_DIR=./data uv run python convert_arc_agi2_data.py /tmp/ARC-AGI-2
15
+ rm -rf /tmp/ARC-AGI-2
16
+ ```
17
+
18
+ This creates 4 files in `data/`:
19
+ - `arc-agi_training_challenges.json` (1000 tasks)
20
+ - `arc-agi_training_solutions.json`
21
+ - `arc-agi_evaluation_challenges.json` (120 tasks)
22
+ - `arc-agi_evaluation_solutions.json`
23
+
24
+ ### 2. Set your API key
25
+
26
+ ```bash
27
+ export OPENAI_API_KEY=...
28
+ ```
29
+
30
+ ## Run a single task
31
+
32
+ ARC requires a per-task config (each task has unique training examples as the prompt). Use `generate_config.py` to create one, then run with any search backend:
33
+
34
+ ```bash
35
+ cd benchmarks/arc_benchmark
36
+
37
+ # Generate task-specific config
38
+ TASK_NUM=0 ARC_TASK_FILE=training CONFIG_OUT=./config_task_0.yaml \
39
+ uv run python generate_config.py
40
+
41
+ # Run with any backend
42
+ uv run skydiscover-run initial_program.py evaluator.py \
43
+ -c config_task_0.yaml -s [your_algorithm] -i 30
44
+
45
+ # Or with another search backend, e.g. evox, openevolve, or gepa:
46
+ uv run skydiscover-run initial_program.py evaluator.py \
47
+ -c config_task_0.yaml -s [your_algorithm] -i 30
48
+ ```
49
+
50
+ ## Run all evaluation tasks
51
+
52
+ ```bash
53
+ cd benchmarks/arc_benchmark
54
+ export ARC_TASK_FILE=evaluation
55
+
56
+ NUM_TASKS=$(uv run python -c "import json; print(len(json.load(open('data/arc-agi_evaluation_challenges.json'))))")
57
+
58
+ for i in $(seq 0 $((NUM_TASKS - 1))); do
59
+ TASK_NUM=$i CONFIG_OUT=./config_task_${i}.yaml uv run python generate_config.py
60
+ TASK_NUM=$i uv run skydiscover-run initial_program.py evaluator.py \
61
+ -c config_task_${i}.yaml -s [your_algorithm] -i 30 \
62
+ -o outputs/eval_task_${i}
63
+ done
64
+ ```
65
+
66
+ ## Post-discovery test evaluation
67
+
68
+ After the discovery process, evaluate the best program on held-out test inputs:
69
+
70
+ ```bash
71
+ TASK_NUM=0 ARC_TASK_FILE=evaluation \
72
+ OUTS_DIR=./outputs/eval_task_0/adaevolve \
73
+ uv run python post_discovery_eval.py
74
+ ```
75
+
76
+ ## Config: GPT vs Gemini
77
+
78
+ Edit `config.yaml` — comment the GPT block and uncomment the Gemini block, or override with `--model`:
79
+
80
+ ```bash
81
+ uv run skydiscover-run ... -m gemini/gemini-3-pro-preview
82
+ ```
83
+
84
+ ## Files
85
+
86
+ | File | Description |
87
+ |------|-------------|
88
+ | `initial_program.py` | Seed program with two transform functions to evolve |
89
+ | `evaluator.py` | Scores programs on pass@2 + cell accuracy |
90
+ | `config.yaml` | Base config template (prompt injected by generate_config.py) |
91
+ | `generate_config.py` | Injects task-specific training examples into config as system prompt |
92
+ | `post_discovery_eval.py` | Evaluates best program on held-out test inputs |
93
+ | `convert_arc_agi2_data.py` | Converts raw ARC-AGI-2 data to benchmark format |
94
+ | `requirements.txt` | Dependencies (numpy) |
95
+
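+ The `combined_score` reported by `evaluator.py` blends exact match with partial credit; per its docstring the weights are 0.6 for pass@2 and 0.4 for cell accuracy. As a sketch (names illustrative):
+
+ ```python
+ def combined_score(pass_at_2: float, cell_accuracy: float) -> float:
+     # Partial credit keeps a gradient signal even before any perfect solve
+     return 0.6 * pass_at_2 + 0.4 * cell_accuracy
+ ```
+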
96
+ ## Environment variables
97
+
98
+ | Variable | Default | Description |
99
+ |----------|---------|-------------|
100
+ | `OPENAI_API_KEY` | (required) | API key |
101
+ | `ARC_TASK_FILE` | `training` | `training` or `evaluation` |
102
+ | `TASK_NUM` | `0` | Task index within the dataset |
103
+ | `BASE_CONFIG` | `./config.yaml` | Base config template path |
104
+ | `CONFIG_OUT` | `./config_task_{N}.yaml` | Output path for generated config |
105
+ | `DATA_ROOT` | `./data` | Path to ARC data directory |
106
+ | `MAX_ITERATIONS` | (from config) | Override `max_iterations` at runtime |
107
+ | `ARC_EVAL_INCLUDE_TEST` | `0` | Set to `1` to also run the held-out test inputs during evolution |
108
+ | `ARC_EVAL_USE_TEST_FOR_SCORE` | `0` | Set to `1` to average train and test scores into `combined_score` (only used when `ARC_EVAL_INCLUDE_TEST=1`) |
benchmarks/arc_benchmark/config.yaml ADDED
@@ -0,0 +1,51 @@
1
+ # ARC Benchmark base config
2
+ # This file is used by generate_config.py to inject a task-specific prompt.
3
+ # Switch models by editing the 'llm' section below.
4
+
5
+ # General settings
6
+ max_iterations: 30
7
+ checkpoint_interval: 10
8
+ log_level: "INFO"
9
+ random_seed: 42
10
+ diff_based_generation: true
11
+ max_solution_length: 50000
12
+
13
+ # LLM configuration (Option A: GPT-5)
14
+ llm:
15
+ models:
16
+ - name: "gpt-5"
17
+ weight: 1.0
18
+ api_base: "https://api.openai.com/v1"
19
+ temperature: 0.7
20
+ # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject requests that set both temperature and top_p
21
+ max_tokens: 32768
22
+ timeout: 3000
23
+
24
+ # Option B: Gemini 3 Pro (comment Option A and uncomment below)
25
+ # llm:
26
+ # models:
27
+ # - name: "gemini-3-pro-preview"
28
+ # weight: 1.0
29
+ # api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
30
+ # temperature: 0.7
31
+ # top_p: 0.95
32
+ # max_tokens: 32768
33
+ # timeout: 3000
34
+
35
+ # Search configuration (default: top-k)
36
+ search:
37
+ type: "topk"
38
+ database:
39
+ random_seed: 42
40
+ num_context_programs: 4
41
+
42
+ # Prompt configuration
43
+ # NOTE: generate_config.py overwrites prompt.system_message per task.
44
+ prompt:
45
+ system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG"
46
+
47
+ # Evaluator configuration
48
+ evaluator:
49
+ timeout: 360
50
+ max_retries: 3
51
+ cascade_evaluation: false
benchmarks/arc_benchmark/convert_arc_agi2_data.py ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json)
4
+ into the format expected by this benchmark:
5
+ - arc-agi_{split}_challenges.json (task_id -> { train, test with inputs only })
6
+ - arc-agi_{split}_solutions.json (task_id -> list of test output grids)
7
+
8
+ Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation):
9
+ OUT_DIR=./data python3 convert_arc_agi2_data.py .
10
+
11
+ Or with an external ARC-AGI-2 clone:
12
+ python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2
13
+ # Writes into that path by default; set OUT_DIR to write elsewhere.
14
+ """
15
+ import json
16
+ import os
17
+ import sys
18
+
19
+
20
+ def convert_split(repo_root: str, split: str, out_dir: str) -> None:
21
+ """Convert data/{split}/*.json into challenges + solutions JSON."""
22
+ split_dir = os.path.join(repo_root, "data", split)
23
+ if not os.path.isdir(split_dir):
24
+ print(f"Skip {split}: no directory {split_dir}")
25
+ return
26
+
27
+ challenges = {}
28
+ solutions = {}
29
+
30
+ for name in sorted(os.listdir(split_dir)):
31
+ if not name.endswith(".json"):
32
+ continue
33
+ task_id = name[:-5] # strip .json
34
+ path = os.path.join(split_dir, name)
35
+ with open(path, "r") as f:
36
+ task = json.load(f)
37
+ # Challenge: train as-is; test with only "input" (no output)
38
+ challenges[task_id] = {
39
+ "train": task["train"],
40
+ "test": [{"input": p["input"]} for p in task["test"]],
41
+ }
42
+ # Solutions: list of test output grids
43
+ solutions[task_id] = [p["output"] for p in task["test"]]
44
+
45
+ challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json")
46
+ solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json")
47
+ with open(challenges_path, "w") as f:
48
+ json.dump(challenges, f)
49
+ with open(solutions_path, "w") as f:
50
+ json.dump(solutions, f)
51
+ print(f"Wrote {challenges_path} ({len(challenges)} tasks)")
52
+ print(f"Wrote {solutions_path} ({len(solutions)} tasks)")
53
+
54
+
55
+ def main():
56
+ repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")
57
+ out_dir = os.getenv("OUT_DIR", repo_root)
58
+ for split in ("training", "evaluation"):
59
+ convert_split(repo_root, split, out_dir)
60
+
61
+
62
+ if __name__ == "__main__":
63
+ main()
benchmarks/arc_benchmark/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
1
+ FROM python:3.12-slim
2
+ WORKDIR /benchmark
3
+
4
+ COPY requirements.txt .
5
+ RUN pip install --no-cache-dir -r requirements.txt
6
+
7
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
8
+ # that define evaluate(program_path) -> dict. Bridges them to the container
9
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
10
+ COPY . .
11
+ RUN chmod +x evaluate.sh
12
+
13
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/arc_benchmark/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ PROGRAM="$1"
5
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
6
+
7
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/arc_benchmark/evaluator/evaluator.py ADDED
@@ -0,0 +1,407 @@
1
+ import numpy as np
2
+ from typing import List, Tuple, Dict, Any
3
+ import json
4
+ import os
5
+
6
+ try:
7
+ from skydiscover.evaluation.evaluation_result import EvaluationResult
8
+ except ImportError:
9
+ from dataclasses import dataclass, field
10
+ from typing import Union
11
+
12
+ @dataclass
13
+ class EvaluationResult:
14
+ metrics: Dict[str, float]
15
+ artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)
16
+ import importlib.util
17
+
18
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
19
+ TASK_NUM = os.getenv("TASK_NUM", "0")
20
+ DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
21
+ INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes")
22
+ USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes")
23
+
24
+
25
+ def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float:
26
+ """
27
+ Compute continuous cell-level accuracy between prediction and ground truth.
28
+ Returns a float in [0, 1]. Handles shape mismatches gracefully.
29
+ """
30
+ if pred.shape != gt.shape:
31
+ # Partial credit for getting shape partially right
32
+ shape_score = 0.0
33
+ if len(pred.shape) == len(gt.shape) == 2:
34
+ row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0
35
+ col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0
36
+ shape_score = (row_match + col_match) * 0.1 # up to 0.2 for correct dimensions
37
+ return shape_score
38
+ # Cell-level accuracy
39
+ total_cells = gt.size
40
+ if total_cells == 0:
41
+ return 1.0
42
+ correct_cells = int(np.sum(pred == gt))
43
+ return correct_cells / total_cells
44
+
45
+
46
+ def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float:
47
+ """Return the best cell accuracy across all attempts for one example."""
48
+ return max(cell_accuracy_single(a, gt) for a in attempts)
49
+
50
+
51
+ def pass_at_2_accuracy_single(
52
+ attempts: List[np.ndarray],
53
+ gt: np.ndarray
54
+ ) -> Tuple[int, Dict[int, Any]]:
55
+ """
56
+ Compute pass@2 accuracy for a single ARC test case.
57
+
58
+ Args:
59
+ attempts: List of 2 numpy arrays representing model attempts.
60
+ gt: Ground-truth output as a 2D numpy array.
61
+
62
+ Returns:
63
+ pass_at_2: int (1 if any attempt is perfectly correct, else 0)
64
+ diagnostics: dict mapping attempt index -> diagnostic info.
65
+ If sizes match, includes indices of incorrect cells.
66
+ """
67
+ assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation."
68
+
69
+ diagnostics = {}
70
+ passed = False
71
+
72
+ for i, pred in enumerate(attempts):
73
+ attempt_info = {}
74
+
75
+ # Size check
76
+ if pred.shape != gt.shape:
77
+ attempt_info["size_match"] = False
78
+ attempt_info["pred_shape"] = list(pred.shape)
79
+ attempt_info["gt_shape"] = list(gt.shape)
80
+ attempt_info["incorrect_indices"] = None
81
+ attempt_info["cell_accuracy"] = 0.0
82
+ attempt_passed = False
83
+ else:
84
+ attempt_info["size_match"] = True
85
+
86
+ # Find incorrect cells
87
+ incorrect_mask = pred != gt
88
+ incorrect_indices = np.argwhere(incorrect_mask)
89
+
90
+ attempt_info["incorrect_indices"] = incorrect_indices.tolist()
91
+ attempt_info["num_incorrect"] = int(incorrect_mask.sum())
92
+ attempt_info["num_total"] = int(gt.size)
93
+ attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size
94
+
95
+ # Perfect match
96
+ if incorrect_mask.sum() == 0:
97
+ attempt_passed = True
98
+ else:
99
+ attempt_passed = False
100
+
101
+ attempt_info["perfect_match"] = attempt_passed
102
+ passed = attempt_passed or passed
103
+
104
+ diagnostics[i] = attempt_info
105
+
106
+ pass_at_2 = 1 if passed else 0
107
+
108
+ return pass_at_2, diagnostics
109
+
110
+ def pass_at_2_accuracy_multi_test(
111
+ all_attempts: List[List[np.ndarray]],
112
+ all_gt: List[np.ndarray]
113
+ ) -> Tuple[List[int], List[Dict[int, Any]]]:
114
+ """
115
+ Compute pass@2 accuracy across multiple ARC test cases.
116
+
117
+ Args:
118
+ all_attempts: List of lists of 2 numpy arrays for each test case.
119
+ all_gt: List of ground-truth outputs as 2D numpy arrays.
120
+ """
121
+ assert len(all_attempts) == len(all_gt), "Mismatched number of test cases."
122
+
123
+ all_diagnostics = []
124
+ all_pass = []
125
+
126
+ for attempts, gt in zip(all_attempts, all_gt):
127
+ pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt)
128
+ all_pass.append(pass_at_2)
129
+ all_diagnostics.append(diagnostics)
130
+
131
+ return all_pass, all_diagnostics
132
+
133
+ def extract_failure_artifacts(diagnostics, pred=None, gt=None):
134
+ """
135
+ Extract failure artifacts from diagnostics for a given example.
136
+ Includes actual vs expected output snippets for better LLM feedback.
137
+ """
138
+ artifacts = {}
139
+ if not diagnostics["size_match"]:
140
+ artifacts["error_type"] = "SizeMismatch"
141
+ artifacts["error_message"] = (
142
+ f"Output shape {diagnostics['pred_shape']} does not match "
143
+ f"expected shape {diagnostics['gt_shape']}."
144
+ )
145
+ artifacts["suggestion"] = (
146
+ f"Your output has shape {diagnostics['pred_shape']} but the correct output "
147
+ f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions."
148
+ )
149
+ else:
150
+ num_incorrect = diagnostics['num_incorrect']
151
+ num_total = diagnostics['num_total']
152
+ accuracy = diagnostics['cell_accuracy']
153
+ artifacts["error_type"] = "IncorrectCells"
154
+ artifacts["error_message"] = (
155
+ f"{num_incorrect}/{num_total} cells incorrect "
156
+ f"(cell accuracy: {accuracy:.1%})."
157
+ )
158
+ # Show a compact diff of expected vs actual for first few wrong cells
159
+ if diagnostics['incorrect_indices'] and pred is not None and gt is not None:
160
+ wrong = diagnostics['incorrect_indices'][:8] # first 8 wrong cells
161
+ diff_lines = []
162
+ for r, c in wrong:
163
+ diff_lines.append(f" [{r},{c}]: got {int(pred[r,c])}, expected {int(gt[r,c])}")
164
+ artifacts["cell_diffs"] = "\n".join(diff_lines)
165
+ if len(diagnostics['incorrect_indices']) > 8:
166
+ artifacts["cell_diffs"] += f"\n ... and {len(diagnostics['incorrect_indices'])-8} more"
167
+ artifacts["suggestion"] = (
168
+ f"Your solution gets {accuracy:.1%} of cells correct. "
169
+ f"Review the transformation logic for the failing cells."
170
+ )
171
+
172
+ return artifacts
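+ # Example artifact for a near-miss (illustrative values):
+ # {"error_type": "IncorrectCells",
+ #  "error_message": "2/25 cells incorrect (cell accuracy: 92.0%).",
+ #  "cell_diffs": " [0,3]: got 4, expected 7\n [2,1]: got 0, expected 7",
+ #  "suggestion": "Your solution gets 92.0% of cells correct. ..."}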
173
+
174
+ def evaluate(program_path):
175
+ """
176
+ Evaluate the program on ARC task training (and optionally test) examples.
177
+
178
+ Returns a combined_score that blends:
179
+ - pass@2 (binary perfect-match, weighted 0.6)
180
+ - cell accuracy (continuous partial credit, weighted 0.4)
181
+ This gives evolution gradient signal even when no example is solved perfectly.
182
+ """
183
+ spec = importlib.util.spec_from_file_location("program_module", program_path)
184
+ program_module = importlib.util.module_from_spec(spec)
185
+ spec.loader.exec_module(program_module)
186
+
187
+ if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
188
+ print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
189
+
190
+ error_artifacts = {
191
+ "error_type": "MissingFunction",
192
+ "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
193
+ "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array."
194
+ }
195
+
196
+ return EvaluationResult(
197
+ metrics={
198
+ "runs_successfully": 0.0,
199
+ "combined_score": 0.0,
200
+ "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
201
+ },
202
+ artifacts=error_artifacts
203
+ )
204
+
205
+ # Load ARC tasks
206
+ challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json")
207
+
208
+ with open(challenge_path, 'r') as f:
209
+ tasks = json.load(f)
210
+
211
+ task_id = list(tasks.keys())[int(TASK_NUM)]
212
+ task = tasks[task_id]
213
+
214
+ train_inputs = [np.array(inp["input"]) for inp in task['train']]
215
+ train_gts = [np.array(gt["output"]) for gt in task['train']]
216
+
217
+ train_attempts = []
218
+
219
+ # Generate attempts for training data
220
+ for inp in train_inputs:
221
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
222
+ if not isinstance(attempt_1, np.ndarray):
223
+ print(f"transform_grid_attempt_1 did not return a numpy array")
224
+
225
+ error_artifacts = {
226
+ "error_type": "InvalidReturnType",
227
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
228
+ "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
229
+ }
230
+
231
+ return EvaluationResult(
232
+ metrics={
233
+ "runs_successfully": 0.0,
234
+ "combined_score": 0.0,
235
+ "error": "transform_grid_attempt_1 did not return a numpy array"
236
+ },
237
+ artifacts=error_artifacts
238
+ )
239
+
240
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
241
+ if not isinstance(attempt_2, np.ndarray):
242
+ print(f"transform_grid_attempt_2 did not return a numpy array")
243
+
244
+ error_artifacts = {
245
+ "error_type": "InvalidReturnType",
246
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
247
+ "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
248
+ }
249
+
250
+ return EvaluationResult(
251
+ metrics={
252
+ "runs_successfully": 0.0,
253
+ "combined_score": 0.0,
254
+ "error": "transform_grid_attempt_2 did not return a numpy array"
255
+ },
256
+ artifacts=error_artifacts
257
+ )
258
+ train_attempts.append([attempt_1, attempt_2])
259
+
260
+ pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts)
261
+
262
+ # Compute both binary pass@2 and continuous cell accuracy
263
+ train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train)
264
+ train_cell_acc = sum(
265
+ best_attempt_cell_accuracy(attempts, gt)
266
+ for attempts, gt in zip(train_attempts, train_gts)
267
+ ) / len(train_gts)
268
+
269
+ # Blended score: pass@2 (60%) + cell accuracy (40%) gives gradient signal
270
+ train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc
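+ # e.g. 1 of 3 examples solved perfectly with a mean best-attempt cell accuracy
+ # of 0.9: 0.6 * (1/3) + 0.4 * 0.9 = 0.56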
271
+
272
+ metrics = {
273
+ "runs_successfully": 1.0,
274
+ "combined_score": train_score,
275
+ "train_combined_score": train_score,
276
+ "train_pass_at_2_score": train_pass_score,
277
+ "train_cell_accuracy": round(train_cell_acc, 4),
278
+ }
279
+ error_artifacts = {}
280
+ for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)):
281
+ example_name = f"train_example_{i}"
282
+ metrics[f"{example_name}_pass_at_2"] = train_pass
283
+ best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i])
284
+ metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
285
+ for attempt in train_diagnostics:
286
+ attempt_pass = train_diagnostics[attempt]["perfect_match"]
287
+ metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass
288
+ if not attempt_pass:
289
+ pred = train_attempts[i][attempt]
290
+ gt = train_gts[i]
291
+ error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts(
292
+ train_diagnostics[attempt], pred=pred, gt=gt
293
+ )
294
+
295
+ # Optional: include test feedback (uses solutions if available)
296
+ if INCLUDE_TEST:
297
+ solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json")
298
+ if os.path.isfile(solution_path):
299
+ with open(solution_path, 'r') as f:
300
+ solutions = json.load(f)
301
+ task_id = list(tasks.keys())[int(TASK_NUM)]
302
+ solution = solutions.get(task_id)
303
+ if solution is not None and "test" in task:
304
+ if len(task["test"]) != len(solution):
305
+ raise ValueError(
306
+ f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
307
+ f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
308
+ f"and arc-agi_{TASK_FILE}_solutions.json were generated together."
309
+ )
310
+ test_inputs = [np.array(inp["input"]) for inp in task['test']]
311
+ test_gts = [np.array(gt) for gt in solution]
312
+
313
+ test_attempts = []
314
+ for inp in test_inputs:
315
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
316
+ if not isinstance(attempt_1, np.ndarray):
317
+ print(f"transform_grid_attempt_1 did not return a numpy array (test)")
318
+ return EvaluationResult(
319
+ metrics={
320
+ "runs_successfully": 0.0,
321
+ "combined_score": 0.0,
322
+ "error": "transform_grid_attempt_1 did not return a numpy array (test)"
323
+ },
324
+ artifacts={
325
+ "error_type": "InvalidReturnType",
326
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).",
327
+ "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array."
328
+ }
329
+ )
330
+
331
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
332
+ if not isinstance(attempt_2, np.ndarray):
333
+ print(f"transform_grid_attempt_2 did not return a numpy array (test)")
334
+ return EvaluationResult(
335
+ metrics={
336
+ "runs_successfully": 0.0,
337
+ "combined_score": 0.0,
338
+ "error": "transform_grid_attempt_2 did not return a numpy array (test)"
339
+ },
340
+ artifacts={
341
+ "error_type": "InvalidReturnType",
342
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).",
343
+ "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array."
344
+ }
345
+ )
346
+ test_attempts.append([attempt_1, attempt_2])
347
+
348
+ pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
349
+ test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test)
350
+ test_cell_acc = sum(
351
+ best_attempt_cell_accuracy(attempts, gt)
352
+ for attempts, gt in zip(test_attempts, test_gts)
353
+ ) / len(test_gts)
354
+ test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc
355
+
356
+ metrics["test_combined_score"] = test_score
357
+ metrics["test_pass_at_2_score"] = test_pass_score
358
+ metrics["test_cell_accuracy"] = round(test_cell_acc, 4)
359
+ metrics["test_included"] = 1
360
+
361
+ for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
362
+ example_name = f"test_example_{i}"
363
+ metrics[f"{example_name}_pass_at_2"] = test_pass
364
+ best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i])
365
+ metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
366
+ for attempt in test_diagnostics:
367
+ metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
368
+ if test_pass == 0:
369
+ first_failing_idx = next(
370
+ (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
371
+ 0,
372
+ )
373
+ pred = test_attempts[i][first_failing_idx]
374
+ gt = test_gts[i]
375
+ error_artifacts[f"{example_name}"] = extract_failure_artifacts(
376
+ test_diagnostics[first_failing_idx], pred=pred, gt=gt
377
+ )
378
+
379
+ if USE_TEST_IN_SCORE:
380
+ metrics["combined_score"] = (train_score + test_score) / 2.0
381
+ else:
382
+ metrics["test_included"] = 0
383
+ else:
384
+ metrics["test_included"] = 0
385
+
386
+ return EvaluationResult(
387
+ metrics=metrics,
388
+ artifacts=error_artifacts
389
+ )
390
+
391
+
392
+ def _evaluate_as_dict(program_path):
393
+ """Adapter: calls evaluate() and converts EvaluationResult to a plain dict."""
394
+ result = evaluate(program_path)
395
+ d = dict(result.metrics)
396
+ for k, v in result.artifacts.items():
397
+ d[k] = v
398
+ return d
399
+
400
+
401
+ if __name__ == "__main__":
402
+ # Backwards-compat: bridges old evaluate() -> EvaluationResult to the
403
+ # container JSON protocol. wrapper.py is copied from
404
+ # skydiscover/evaluation/wrapper.py.
405
+ from wrapper import run
406
+
407
+ run(_evaluate_as_dict)
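+ # Example invocation (illustrative): `python evaluator.py candidate_program.py`
+ # prints a single JSON line on stdout per the container protocol in wrapper.py.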
benchmarks/arc_benchmark/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
1
+ numpy
benchmarks/arc_benchmark/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
1
+ """Backwards-compat wrapper for old Python-based evaluators.
2
+
3
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
4
+ bridges that interface to the container JSON protocol expected by
5
+ ContainerizedEvaluator.
6
+
7
+ Usage — add this to the bottom of your evaluator.py::
8
+
9
+ if __name__ == "__main__":
10
+ from wrapper import run
11
+ run(evaluate)
12
+ """
13
+
14
+ import json
15
+ import sys
16
+ import traceback
17
+
18
+
19
+ def run(evaluate_fn):
20
+ """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
21
+
22
+ * Reads ``sys.argv[1]`` as the program path.
23
+ * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
24
+ don't contaminate the JSON output.
25
+ * Separates numeric metrics from non-numeric artifacts.
26
+ * Guarantees ``combined_score`` is always present in metrics.
27
+ """
28
+ if len(sys.argv) < 2:
29
+ print("Usage: evaluator.py <program_path>", file=sys.stderr)
30
+ sys.exit(1)
31
+
32
+ program_path = sys.argv[1]
33
+
34
+ # Redirect stdout → stderr during evaluation so debug prints from
35
+ # the evaluator don't contaminate the JSON output on stdout.
36
+ real_stdout = sys.stdout
37
+ sys.stdout = sys.stderr
38
+ try:
39
+ result = evaluate_fn(program_path)
40
+ except Exception as e:
41
+ sys.stdout = real_stdout
42
+ print(
43
+ json.dumps(
44
+ {
45
+ "status": "error",
46
+ "combined_score": 0.0,
47
+ "metrics": {"combined_score": 0.0},
48
+ "artifacts": {
49
+ "error": str(e),
50
+ "traceback": traceback.format_exc(),
51
+ },
52
+ }
53
+ )
54
+ )
55
+ return
56
+ sys.stdout = real_stdout
57
+
58
+ if not isinstance(result, dict):
59
+ print(
60
+ json.dumps(
61
+ {
62
+ "status": "error",
63
+ "combined_score": 0.0,
64
+ "metrics": {"combined_score": 0.0},
65
+ "artifacts": {
66
+ "error": f"evaluate() returned {type(result).__name__}, expected dict"
67
+ },
68
+ }
69
+ )
70
+ )
71
+ return
72
+
73
+ # Separate numeric metrics from non-numeric artifacts.
74
+ metrics = {}
75
+ artifacts = {}
76
+ for k, v in result.items():
77
+ if isinstance(v, bool):
78
+ metrics[k] = float(v)
79
+ elif isinstance(v, (int, float)):
80
+ metrics[k] = float(v)
81
+ elif isinstance(v, str):
82
+ artifacts[k] = v
83
+ elif isinstance(v, (list, dict)):
84
+ artifacts[k] = json.dumps(v)
85
+
86
+ if "combined_score" not in metrics:
87
+ metrics["combined_score"] = 0.0
88
+
89
+ status = "error" if "error" in artifacts else "success"
90
+ output = {
91
+ "status": status,
92
+ "combined_score": metrics["combined_score"],
93
+ "metrics": metrics,
94
+ }
95
+ if artifacts:
96
+ output["artifacts"] = artifacts
97
+
98
+ print(json.dumps(output))
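+
+ # Example stdout for a successful run (illustrative values):
+ # {"status": "success", "combined_score": 0.56,
+ #  "metrics": {"combined_score": 0.56, "runs_successfully": 1.0},
+ #  "artifacts": {"train_example_0_attempt_1_diagnostics": "{...}"}}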
benchmarks/arc_benchmark/generate_config.py ADDED
@@ -0,0 +1,101 @@
1
+ import os
2
+ import yaml
3
+ import json
4
+
5
+
6
+ def load_task_as_prompt(task_json, task_num):
7
+ with open(task_json, 'r') as f:
8
+ tasks = json.load(f)
9
+
10
+ task_id = list(tasks.keys())[int(task_num)]
11
+ task = tasks[task_id]
12
+ train_inputs = [inp["input"] for inp in task['train']]
13
+ train_outputs = [gt["output"] for gt in task['train']]
14
+
15
+ train_pairs = ""
16
+ for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)):
17
+ train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n"
18
+
19
+ prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles.
20
+ Find the common pattern that transforms each input grid into its corresponding output grid.
21
+
22
+ Your task is to write python functions that implement the MOST GENERAL transformation rule. The rule must:
23
+ - Apply consistently to ALL training examples
24
+ - Generalize to unseen inputs (critical for success)
25
+ - Be based on structural patterns, not memorized examples
26
+ - Use relative/spatial rules rather than absolute coordinates
27
+
28
+ Generalization rules (THIS IS CRITICAL):
29
+ - Infer the transformation ONLY from the training input-output pairs
30
+ - If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one
31
+ - Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules
32
+ - Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples
33
+ - Think: "What is the underlying principle?" not "What fits these specific examples?"
34
+ - Use numpy only (no external libraries)
35
+
36
+ Common failure modes to avoid:
37
+ - Overfitting to specific grid sizes or positions in training examples
38
+ - Hardcoding colors, coordinates, or counts from training data
39
+ - Assuming global properties (like separator colors) without verifying across ALL examples
40
+ - Using absolute positions when relative/structural rules would generalize better
41
+
42
+ Solution approach:
43
+ - Analyze the training examples to identify the CORE transformation principle
44
+ - Prefer block-wise, object-wise, or pattern-based rules that work locally
45
+ - If the grid has distinct regions, solve each region independently
46
+ - Build flexible rules that adapt to different input sizes and structures
47
+
48
+ Training examples:
49
+ {train_pairs}
50
+
51
+ Your task: Write 2 different Python functions that implement the general transformation rule.
52
+ - Each function takes a 2D numpy array as input and returns the transformed 2D numpy array
53
+ - The two attempts should use genuinely different strategies (e.g., different algorithmic approaches)
54
+ - Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases
55
+
56
+ CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples.
57
+
58
+ Remember to only output the modified python functions as your solution."""
59
+
60
+ return prompt
61
+
62
+ def generate_config(task_num, task_file, dataset_root=None, base_config=None):
63
+ if dataset_root is None:
64
+ dataset_root = os.getenv("DATA_ROOT")
65
+ if not dataset_root:
66
+ dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
67
+ task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json")
68
+ prompt = load_task_as_prompt(task_json, task_num)
69
+
70
+ if base_config is None:
71
+ default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")
72
+ base_config = os.getenv("BASE_CONFIG", default_base)
73
+ with open(base_config, 'r') as file:
74
+ config = yaml.safe_load(file)
75
+
76
+ config['prompt']['system_message'] = prompt
77
+ # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config)
78
+ api_key_env = os.getenv("OPENAI_API_KEY")
79
+ if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key":
80
+ config["llm"]["api_key"] = api_key_env.strip()
81
+ # Override max_iterations from env if set (e.g. by run_discovery.sh)
82
+ max_iter_env = os.getenv("MAX_ITERATIONS")
83
+ if max_iter_env is not None and str(max_iter_env).strip() != "":
84
+ try:
85
+ config["max_iterations"] = int(max_iter_env)
86
+ except ValueError:
87
+ pass
88
+
89
+ # Write to a per-task config file so parallel runs don't conflict
90
+ out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml")
91
+ with open(out_path, 'w') as file:
92
+ yaml.dump(config, file)
93
+ return out_path
94
+
95
+ if __name__ == "__main__":
96
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
97
+ TASK_NUM = os.getenv("TASK_NUM", "0")
98
+
99
+ path = generate_config(TASK_NUM, TASK_FILE)
100
+ print(path)
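+ # Example (illustrative): `TASK_NUM=7 MAX_ITERATIONS=30 python generate_config.py`
+ # embeds task 7's training pairs in the prompt and writes ./config_task_7.yaml.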
101
+
benchmarks/arc_benchmark/initial_program.py ADDED
@@ -0,0 +1,42 @@
1
+ # EVOLVE-BLOCK-START
2
+
3
+ import numpy as np
4
+
5
+ def transform_grid_attempt_1(grid):
6
+ """
7
+ Example transformation:
8
+ - Validate input (2D, integer values 0-9).
9
+ - Rotate the grid 90 degrees clockwise.
10
+ - Increment every cell by 1 modulo 10 (keeps values 0-9).
11
+ Returns a new numpy int array.
12
+ """
13
+ arr = _validate_grid(grid)
14
+ out = np.rot90(arr, k=-1) # 90 degrees clockwise
15
+ out = (out + 1) % 10
16
+ return out.astype(np.int32)
17
+
18
+ def transform_grid_attempt_2(grid):
19
+ """
20
+ Example transformation:
21
+ - Validate input (2D, integer values 0-9).
22
+ - Upsample each cell to a 2x2 block (doubling both dimensions).
23
+ - Invert colors by mapping v -> 9 - v (keeps values 0-9).
24
+ Returns a new numpy int array.
25
+ """
26
+ arr = _validate_grid(grid)
27
+ out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1)
28
+ out = 9 - out
29
+ return out.astype(np.int32)
30
+
31
+ # EVOLVE-BLOCK-END
32
+
33
+ def _validate_grid(grid):
34
+ arr = np.asarray(grid)
35
+ if arr.ndim != 2:
36
+ raise ValueError("Input must be a 2D array.")
37
+ # cast to integer type for value checks
38
+ if not np.issubdtype(arr.dtype, np.integer):
39
+ arr = arr.astype(int)
40
+ if arr.size and (arr.min() < 0 or arr.max() > 9):
41
+ raise ValueError("Array values must be integers in the range 0-9.")
42
+ return arr
benchmarks/arc_benchmark/post_discovery_eval.py ADDED
@@ -0,0 +1,157 @@
1
+ import importlib.util
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
6
+
7
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
8
+ TASK_NUM = os.getenv("TASK_NUM", "0")
9
+ OUTS_DIR = os.getenv("OUTS_DIR", "")
10
+ # Optional: path to a checkpoint dir (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that best_program.py on test set
11
+ PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")
12
+
13
+
14
+ def _program_path():
15
+ """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
16
+ if PROGRAM_DIR:
17
+ return os.path.join(PROGRAM_DIR, "best_program.py")
18
+ return os.path.join(OUTS_DIR, "best", "best_program.py")
19
+
20
+
21
+ def _result_path():
22
+ """Where to write post_evolution_evaluation_result.json."""
23
+ if PROGRAM_DIR:
24
+ return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
25
+ return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")
26
+
27
+
28
+ def load_program_module():
29
+ """Dynamically load the best_program.py module from the specified directory."""
30
+ path = _program_path()
31
+ if not os.path.isfile(path):
32
+ raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
33
+ spec = importlib.util.spec_from_file_location("program_module", path)
34
+ program_module = importlib.util.module_from_spec(spec)
35
+ spec.loader.exec_module(program_module)
36
+
37
+ return program_module
38
+
39
+ def evaluate():
40
+ """Evaluate the program module located in the specified directory."""
41
+ program_module = load_program_module()
42
+ if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
43
+ print(f"Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
44
+
45
+ error_artifacts = {
46
+ "error_type": "MissingFunction",
47
+ "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
48
+ "suggestion": "Make sure your program includes a functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take as an argument a 2D numpy array and return a 2D numpy array."
49
+ }
50
+
51
+ return dict(
52
+ metrics={
53
+ "runs_successfully": 0.0,
54
+ "combined_score": 0.0,
55
+ "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
56
+ },
57
+ artifacts=error_artifacts
58
+ )
59
+ # Load ARC tasks
60
+ data_root = os.getenv("DATA_ROOT")
61
+ if not data_root:
62
+ data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
63
+ challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
64
+ solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")
65
+
66
+ with open(challenge_path, 'r') as f:
67
+ tasks = json.load(f)
68
+ with open(solution_path, 'r') as f:
69
+ solutions = json.load(f)
70
+
71
+ task_id = list(tasks.keys())[int(TASK_NUM)]
72
+ solution = solutions[task_id]
73
+ task = tasks[task_id]
74
+
75
+ # Sanity check: test inputs and solutions must align (same task, same order)
76
+ if len(task["test"]) != len(solution):
77
+ raise ValueError(
78
+ f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
79
+ f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
80
+ f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
81
+ )
82
+
83
+ test_inputs = [np.array(inp["input"]) for inp in task['test']]
84
+ test_gts = [np.array(gt) for gt in solution]
85
+
86
+ test_attempts = []
87
+ for inp in test_inputs:
88
+ attempt_1 = program_module.transform_grid_attempt_1(inp)
89
+ if not isinstance(attempt_1, np.ndarray):
90
+ print(f"transform_grid_attempt_1 did not return a numpy array")
91
+
92
+ error_artifacts = {
93
+ "error_type": "InvalidReturnType",
94
+ "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
95
+ "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
96
+ }
97
+
98
+ return dict(
99
+ metrics={
100
+ "runs_successfully": 0.0,
101
+ "combined_score": 0.0,
102
+ "error": "transform_grid_attempt_1 did not return a numpy array"
103
+ },
104
+ artifacts=error_artifacts
105
+ )
106
+
107
+ attempt_2 = program_module.transform_grid_attempt_2(inp)
108
+ if not isinstance(attempt_2, np.ndarray):
109
+ print(f"transform_grid_attempt_2 did not return a numpy array")
110
+
111
+ error_artifacts = {
112
+ "error_type": "InvalidReturnType",
113
+ "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
114
+ "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
115
+ }
116
+
117
+ return dict(
118
+ metrics={
119
+ "runs_successfully": 0.0,
120
+ "combined_score": 0.0,
121
+ "error": "transform_grid_attempt_2 did not return a numpy array"
122
+ },
123
+ artifacts=error_artifacts
124
+ )
125
+ test_attempts.append([attempt_1, attempt_2])
126
+
127
+ pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
128
+ metrics = {
129
+ "runs_successfully": 1.0,
130
+ "combined_score": sum(pass_at_2_test) / len(pass_at_2_test),
131
+ }
132
+ error_artifacts = {}
133
+ for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
134
+ example_name = f"test_example_{i}"
135
+ metrics[f"{example_name}_pass_at_2"] = test_pass
136
+ for attempt in test_diagnostics:
137
+ metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
138
+ if test_pass == 0:
139
+ # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
140
+ first_failing = next(
141
+ (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
142
+ test_diagnostics[0],
143
+ )
144
+ error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)
145
+
146
+ return dict(
147
+ metrics=metrics,
148
+ artifacts=error_artifacts
149
+ )
150
+
151
+ if __name__ == "__main__":
152
+ evaluation_result = evaluate()
153
+ result_path = _result_path()
154
+ os.makedirs(os.path.dirname(result_path), exist_ok=True)
155
+ with open(result_path, 'w') as f:
156
+ json.dump(evaluation_result, f, indent=4)
157
+ print(f"Test-set evaluation written to {result_path}")
benchmarks/frontier-cs-eval/README.md ADDED
@@ -0,0 +1,72 @@
1
+ # Frontier-CS Benchmark
2
+
3
+ Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover.
4
+
5
+ ## Setup
6
+
7
+ ```bash
8
+ # 1. Clone Frontier-CS
9
+ cd benchmarks/frontier-cs-eval
10
+ git clone https://github.com/FrontierCS/Frontier-CS.git
11
+
12
+ # 2. Start the judge server (requires Docker)
13
+ cd Frontier-CS/algorithmic
14
+ docker compose up -d
15
+
16
+ # 3. Install dependencies (from project root)
17
+ cd ../../..
18
+ uv sync --extra frontier-cs
19
+
20
+ # 4. Set your API key
21
+ export OPENAI_API_KEY=...
22
+ ```
23
+
24
+ ## Run
25
+
26
+ Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve`
27
+
28
+
29
+ Single problem:
30
+ ```bash
31
+ cd benchmarks/frontier-cs-eval
32
+ FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \
33
+ -c config.yaml -s [search_algorithm] -i 50
34
+ ```
35
+
36
+ All problems in parallel:
37
+ ```bash
38
+ uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6
39
+ ```
40
+
41
+ ## Evaluate best programs (post-discovery)
42
+
43
+ ```bash
44
+ uv run python run_best_programs_frontiercs.py
45
+ ```
46
+
47
+ ## Analyze results
48
+
49
+ ```bash
50
+ uv run python combine_results.py # merge training/testing scores into CSV
51
+ uv run python analyze_results.py # generate plots and statistics
52
+ ```
53
+
54
+ ## Files
55
+
56
+ | File | Description |
57
+ |------|-------------|
58
+ | `initial_program.cpp` | Seed C++ program |
59
+ | `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge |
60
+ | `config.yaml` | Config with system prompt template |
61
+ | `run_all_frontiercs.py` | Parallelizes evolution across all problems |
62
+ | `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution |
63
+ | `combine_results.py` | Combines training/testing scores into CSV |
64
+ | `analyze_results.py` | Generates score analysis plots and statistics |
65
+
66
+ ## Environment variables
67
+
68
+ | Variable | Default | Description |
69
+ |----------|---------|-------------|
70
+ | `OPENAI_API_KEY` | (required) | API key |
71
+ | `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve |
72
+ | `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs |
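+
+ ## Load-balancing example
+
+ `JUDGE_URLS` accepts a comma-separated list; `evaluator.py` picks one entry at random per evaluation. A minimal sketch (the second port is illustrative):
+
+ ```bash
+ JUDGE_URLS="http://localhost:8081,http://localhost:8082" \
+ FRONTIER_CS_PROBLEM=3 uv run skydiscover-run initial_program.cpp evaluator.py \
+   -c config.yaml -s adaevolve -i 50
+ ```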
benchmarks/frontier-cs-eval/analyze_results.py ADDED
@@ -0,0 +1,105 @@
1
+ import pandas as pd
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ from pathlib import Path
5
+
6
+ # Define paths
7
+ _script_dir = str(Path(__file__).resolve().parent)
8
+ input_csv = str(Path(_script_dir) / "combined_results.csv")
9
+ output_dir = _script_dir
10
+
11
+ # Read the CSV file
12
+ df = pd.read_csv(input_csv)
13
+
14
+ # Calculate average of training and testing scores
15
+ df['average_score'] = (df['training_score'] + df['testing_score']) / 2
16
+
17
+ # Remove rows where either score is None (NaN)
18
+ df_complete = df.dropna(subset=['training_score', 'testing_score'])
19
+
20
+ print(f"\n=== Analysis Results ===")
21
+ print(f"Total problems: {len(df)}")
22
+ print(f"Problems with complete data: {len(df_complete)}")
23
+ print(f"\nTraining Scores:")
24
+ print(f" Mean: {df_complete['training_score'].mean():.4f}")
25
+ print(f" Median: {df_complete['training_score'].median():.4f}")
26
+ print(f" Std Dev: {df_complete['training_score'].std():.4f}")
27
+ print(f" Min: {df_complete['training_score'].min():.4f}")
28
+ print(f" Max: {df_complete['training_score'].max():.4f}")
29
+
30
+ print(f"\nTesting Scores:")
31
+ print(f" Mean: {df_complete['testing_score'].mean():.4f}")
32
+ print(f" Median: {df_complete['testing_score'].median():.4f}")
33
+ print(f" Std Dev: {df_complete['testing_score'].std():.4f}")
34
+ print(f" Min: {df_complete['testing_score'].min():.4f}")
35
+ print(f" Max: {df_complete['testing_score'].max():.4f}")
36
+
37
+ print(f"\nAverage Scores:")
38
+ print(f" Mean: {df_complete['average_score'].mean():.4f}")
39
+ print(f" Median: {df_complete['average_score'].median():.4f}")
40
+ print(f" Std Dev: {df_complete['average_score'].std():.4f}")
41
+
42
+ # Save the updated CSV with averages
43
+ output_csv = Path(output_dir) / "combined_results_with_averages.csv"
44
+ df.to_csv(output_csv, index=False)
45
+ print(f"\nUpdated CSV with averages saved to {output_csv}")
46
+
47
+ # Create visualizations
48
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
49
+
50
+ # 1. Scatter plot: Training vs Testing scores
51
+ ax = axes[0, 0]
52
+ ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50)
53
+ # Add diagonal line for reference (where training == testing)
54
+ lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()),
55
+ max(df_complete['training_score'].max(), df_complete['testing_score'].max())]
56
+ ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing')
57
+ ax.set_xlabel('Training Score')
58
+ ax.set_ylabel('Testing Score')
59
+ ax.set_title('Training vs Testing Scores')
60
+ ax.legend()
61
+ ax.grid(True, alpha=0.3)
62
+
63
+ # 2. Distribution comparison - histograms
64
+ ax = axes[0, 1]
65
+ ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black')
66
+ ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black')
67
+ ax.set_xlabel('Score')
68
+ ax.set_ylabel('Frequency')
69
+ ax.set_title('Distribution of Training vs Testing Scores')
70
+ ax.legend()
71
+ ax.grid(True, alpha=0.3, axis='y')
72
+
73
+ # 3. Box plot comparison
74
+ ax = axes[1, 0]
75
+ box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']]
76
+ bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average'])
77
+ ax.set_ylabel('Score')
78
+ ax.set_title('Score Comparison (Box Plot)')
79
+ ax.grid(True, alpha=0.3, axis='y')
80
+
81
+ # 4. Difference plot: Training - Testing
82
+ ax = axes[1, 1]
83
+ difference = df_complete['training_score'] - df_complete['testing_score']
84
+ ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50)
85
+ ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference')
86
+ ax.set_xlabel('Problem ID')
87
+ ax.set_ylabel('Training Score - Testing Score')
88
+ ax.set_title('Score Difference (Training - Testing)')
89
+ ax.legend()
90
+ ax.grid(True, alpha=0.3)
91
+
92
+ plt.tight_layout()
93
+ plot_path = Path(output_dir) / "results_analysis.png"
94
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
95
+ print(f"Plot saved to {plot_path}")
96
+
97
+ # Additional statistics about differences
98
+ print(f"\nScore Differences (Training - Testing):")
99
+ print(f" Mean Difference: {difference.mean():.4f}")
100
+ print(f" Median Difference: {difference.median():.4f}")
101
+ print(f" Std Dev: {difference.std():.4f}")
102
+ print(f" Problems where training > testing: {(difference > 0).sum()}")
103
+ print(f" Problems where testing > training: {(difference < 0).sum()}")
104
+
105
+ plt.show()
benchmarks/frontier-cs-eval/combine_results.py ADDED
@@ -0,0 +1,66 @@
1
+ import json
2
+ import csv
3
+ import os
4
+ from pathlib import Path
5
+
6
+ # Define paths
7
+ _script_dir = Path(__file__).resolve().parent
8
+ _repo_root = _script_dir.parent.parent
9
+ training_dir = str(_repo_root / "outputs" / "frontier_cs")
10
+ testing_dir = str(_script_dir / "evaluation_results")
11
+ output_csv = str(_script_dir / "combined_results.csv")
12
+
13
+ # Collect all problems
14
+ results = []
15
+
16
+ # Get all problem directories from training data
17
+ training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")])
18
+
19
+ print(f"Found {len(training_problems)} training problems")
20
+
21
+ for problem_dir in training_problems:
22
+ problem_id = problem_dir.replace("problem_", "")
23
+
24
+ # Get training score from best_program_info.json
25
+ training_score = None
26
+ training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json")
27
+
28
+ if os.path.exists(training_info_path):
29
+ try:
30
+ with open(training_info_path, 'r') as f:
31
+ training_data = json.load(f)
32
+ training_score = training_data.get("metrics", {}).get("combined_score")
33
+ except Exception as e:
34
+ print(f"Error reading training data for problem {problem_id}: {e}")
35
+
36
+ # Get testing score from evaluation_results json
37
+ testing_score = None
38
+ testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json")
39
+
40
+ if os.path.exists(testing_json_path):
41
+ try:
42
+ with open(testing_json_path, 'r') as f:
43
+ testing_data = json.load(f)
44
+ testing_score = testing_data.get("combined_score")
45
+ except Exception as e:
46
+ print(f"Error reading testing data for problem {problem_id}: {e}")
47
+
48
+ results.append({
49
+ "problem_id": problem_id,
50
+ "training_score": training_score,
51
+ "testing_score": testing_score
52
+ })
53
+
54
+ # Write to CSV
55
+ with open(output_csv, 'w', newline='') as csvfile:
56
+ fieldnames = ["problem_id", "training_score", "testing_score"]
57
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
58
+
59
+ writer.writeheader()
60
+ writer.writerows(results)
61
+
62
+ print(f"\nResults written to {output_csv}")
63
+ print(f"Total problems: {len(results)}")
64
+ print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}")
65
+ print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}")
66
+ print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}")
benchmarks/frontier-cs-eval/config.yaml ADDED
@@ -0,0 +1,57 @@
1
+ # Frontier-CS Benchmark
2
+ # Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s <strategy> -i 50
3
+
4
+ max_iterations: 100
5
+ checkpoint_interval: 10
6
+ log_level: INFO
7
+
8
+ llm:
9
+ models:
10
+ - name: "gpt-5"
11
+ weight: 1.0
12
+ api_base: https://api.openai.com/v1
13
+ temperature: 0.7
14
+ # top_p: 0.95 # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
15
+ max_tokens: 32000
16
+ timeout: 600
17
+ # To use Gemini: override with --model gemini-3-pro-preview
18
+
19
+ prompt:
20
+ system_message: |
21
+ You are an expert competitive programmer specializing in algorithmic optimization.
22
+
23
+ PROBLEM STATEMENT:
24
+ {problem_statement}
25
+
26
+ CONSTRAINTS:
27
+ {problem_constraints}
28
+
29
+ OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better).
30
+ Your solution must be valid C++ code that compiles and runs correctly.
31
+
32
+ KEY STRATEGIES:
33
+ - Analyze the problem structure carefully before coding
34
+ - Consider time and space complexity constraints
35
+ - Use efficient data structures (vectors, maps, sets, priority queues)
36
+ - Implement clean, well-structured code
37
+ - Handle edge cases properly
38
+ - Optimize hot loops and critical sections
39
+
40
+ COMMON TECHNIQUES:
41
+ - Dynamic programming for optimization problems
42
+ - Greedy algorithms with proper ordering
43
+ - Graph algorithms (BFS, DFS, shortest paths)
44
+ - Binary search for monotonic functions
45
+ - Divide and conquer approaches
46
+ - Heuristic search (simulated annealing, genetic algorithms, local search)
47
+
48
+ OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout.
49
+
50
+ evaluator:
51
+ timeout: 300
52
+ max_retries: 3
53
+ cascade_evaluation: false
54
+
55
+ diff_based_generation: true
56
+ max_solution_length: 50000
57
+ random_seed: 42
benchmarks/frontier-cs-eval/evaluator.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ Evaluator for Frontier-CS algorithmic problems.
3
+
4
+ This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
5
+ against Frontier-CS benchmark problems using the local judge server.
6
+ """
7
+
8
+ import traceback
9
+ from pathlib import Path
10
+ import logging
11
+ import sys
12
+ import os
13
+ import random
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Support multiple judge servers for load balancing
18
+ DEFAULT_JUDGE_URL = "http://localhost:8081"
19
+ JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
20
+ JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()]
21
+
22
+ def get_judge_url() -> str:
23
+ """Get a judge URL using random selection for load balancing."""
24
+ return random.choice(JUDGE_URLS)
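+ # e.g. JUDGE_URLS="http://localhost:8081,http://localhost:8082" (second port
+ # illustrative) spreads evaluations uniformly at random across both judges.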
25
+
26
+ # Add Frontier-CS to path
27
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
28
+ if str(frontier_cs_path) not in sys.path:
29
+ sys.path.insert(0, str(frontier_cs_path))
30
+
31
+ try:
32
+ from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
33
+ from frontier_cs.runner.base import EvaluationStatus
34
+ except ImportError as e:
35
+ logger.error(f"Failed to import Frontier-CS: {e}")
36
+ logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
37
+ raise
38
+
39
+ def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
40
+ """
41
+ Evaluate a C++ solution for a Frontier-CS algorithmic problem.
42
+
43
+ Args:
44
+ program_path: Path to the C++ solution file
45
+ problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.)
46
+ If None, will be read from FRONTIER_CS_PROBLEM env var or config
47
+
48
+ Returns:
49
+ dict with evaluation results:
50
+ - combined_score: The score from the judge (higher is better)
51
+ - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
52
+ - status: Evaluation status string
53
+ - message: Any error or status messages
54
+ - problem_id: The problem ID
55
+ - program_path: Path to the evaluated program
56
+ - score_unbounded: Unbounded score if available
57
+ - metadata: Additional evaluation metadata
58
+ """
59
+ # Get problem_id from parameter, environment, or kwargs
60
+ if problem_id is None:
62
+ problem_id = os.environ.get('FRONTIER_CS_PROBLEM')
63
+ if problem_id is None:
64
+ problem_id = kwargs.get('frontier_cs_problem', '0')
65
+
66
+ logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")
67
+
68
+ try:
69
+ # Initialize evaluator with judge server (load balanced if multiple configured)
70
+ judge_url = get_judge_url()
71
+ logger.info(f"Using judge server: {judge_url}")
72
+ evaluator = FrontierCSEvaluator(
73
+ backend="docker",
74
+ judge_url=judge_url,
75
+ register_cleanup=False,
76
+ )
77
+
78
+ # Read the solution code
79
+ solution_path = Path(program_path)
80
+ if not solution_path.exists():
81
+ error_msg = f"Solution file not found: {program_path}"
82
+ logger.error(error_msg)
83
+ return {
84
+ "combined_score": 0.0,
85
+ "runs_successfully": 0.0,
86
+ "status": "error",
87
+ "message": error_msg,
88
+ "problem_id": problem_id,
89
+ "program_path": program_path,
90
+ }
91
+
92
+ # Extract code and remove any EVOLVE-BLOCK markers
93
+ code = solution_path.read_text().replace(
94
+ "// EVOLVE-BLOCK-START", ""
95
+ ).replace(
96
+ "// EVOLVE-BLOCK-END", ""
97
+ ).strip()
98
+
99
+ logger.info(f"Code extracted from {program_path}")
100
+
101
+ # Evaluate the solution
102
+ result = evaluator.evaluate(
103
+ track="algorithmic",
104
+ problem_id=problem_id,
105
+ code=code,
106
+ backend="docker",
107
+ )
108
+
109
+ logger.info(f"Evaluation completed with status: {result.status}")
110
+
111
+ # Process result
112
+ if result.status == EvaluationStatus.SUCCESS:
113
+ print(result)
114
+ score = result.score
115
+ # Use unbounded score for optimization (allows >100 if beating reference)
116
+ score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score
117
+ print(f"score={score}, score_unbounded={score_unbounded}")
118
+
119
+ # Extract only essential metadata (exclude large test case outputs)
120
+ essential_metadata = {}
121
+ if result.metadata:
122
+ essential_metadata = {
123
+ "status": result.metadata.get("status"),
124
+ "passed": result.metadata.get("passed"),
125
+ "result": result.metadata.get("result"),
126
+ "score": result.metadata.get("score"),
127
+ "scoreUnbounded": result.metadata.get("scoreUnbounded"),
128
+ }
129
+
130
+ return {
131
+ "combined_score": float(score), # Ensure it's a float
132
+ "score_unbounded": score_unbounded,
133
+ "runs_successfully": 1.0,
134
+ "status": "success",
135
+ "message": result.message or "Evaluation successful",
136
+ "problem_id": problem_id,
137
+ "program_path": program_path,
138
+ "duration_seconds": result.duration_seconds,
139
+ "metadata": essential_metadata,
140
+ }
141
+ elif result.status == EvaluationStatus.TIMEOUT:
142
+ logger.warning(f"Evaluation timed out: {result.message}")
143
+ return {
144
+ "combined_score": 0.0,
145
+ "runs_successfully": 0.0,
146
+ "status": "timeout",
147
+ "message": result.message or "Evaluation timed out",
148
+ "problem_id": problem_id,
149
+ "program_path": program_path,
150
+ }
151
+ else: # ERROR status
152
+ logger.error(f"Evaluation error: {result.message}")
153
+ return {
154
+ "combined_score": 0.0,
155
+ "runs_successfully": 0.0,
156
+ "status": "error",
157
+ "message": result.message or "Evaluation failed",
158
+ "problem_id": problem_id,
159
+ "program_path": program_path,
160
+ "logs": result.logs,
161
+ }
162
+
163
+ except Exception as e:
164
+ logger.error(f"Evaluation failed completely: {str(e)}")
165
+ logger.error(traceback.format_exc())
166
+ return {
167
+ "combined_score": 0.0,
168
+ "runs_successfully": 0.0,
169
+ "status": "error",
170
+ "message": str(e),
171
+ "problem_id": problem_id,
172
+ "program_path": program_path,
173
+ "error": str(e),
174
+ }
benchmarks/frontier-cs-eval/initial_program.cpp ADDED
@@ -0,0 +1,6 @@
1
+ #include <bits/stdc++.h>
2
+ using namespace std;
3
+ int main(){
4
+ std::cout << "Hello, World!" << std::endl;
5
+ return 0;
6
+ }
benchmarks/frontier-cs-eval/run_all_frontiercs.py ADDED
@@ -0,0 +1,70 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import subprocess
5
+ from pathlib import Path
6
+ from concurrent.futures import ProcessPoolExecutor
7
+
8
+ from dotenv import load_dotenv
9
+ load_dotenv()
10
+
11
+ SCRIPT_DIR = Path(__file__).resolve().parent
12
+
13
+ frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src"
14
+ if str(frontier_cs_path) not in sys.path:
15
+ sys.path.insert(0, str(frontier_cs_path))
16
+
17
+ from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
18
+
19
+
20
+ def run_single_problem(args):
21
+ p_id, search, iterations, env = args
22
+ print(f"\n[START] Problem ID: {p_id}")
23
+ command = [
24
+ "uv", "run", "skydiscover-run",
25
+ "initial_program.cpp", "evaluator.py",
26
+ "-c", "config.yaml",
27
+ "-s", search,
28
+ "-i", str(iterations),
29
+ "-o", f"outputs/frontier_cs/problem_{p_id}",
30
+ ]
31
+ env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)}
32
+ try:
33
+ subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR))
34
+ return f"✅ Problem {p_id} completed."
35
+ except subprocess.CalledProcessError as e:
36
+ return f"❌ Problem {p_id} failed: {e}"
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems")
41
+ parser.add_argument("--search", "-s", default="adaevolve",
42
+ help="Search algorithm (default: adaevolve)")
43
+ parser.add_argument("--iterations", "-i", type=int, default=50,
44
+ help="Iterations per problem (default: 50)")
45
+ parser.add_argument("--workers", "-w", type=int, default=6,
46
+ help="Parallel workers (default: 6)")
47
+ args = parser.parse_args()
48
+
49
+ runner = AlgorithmicLocalRunner()
50
+ problems_data = runner.list_problems()
51
+ problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int)
52
+
53
+ print(f"Running {len(problem_ids)} problems with {args.workers} workers "
54
+ f"(search={args.search}, iterations={args.iterations})...")
55
+
56
+ env = os.environ.copy()
57
+ task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids]
58
+
59
+ with ProcessPoolExecutor(max_workers=args.workers) as executor:
60
+ results = list(executor.map(run_single_problem, task_args))
61
+
62
+ print("\n" + "=" * 30)
63
+ print("ALL RUNS COMPLETE")
64
+ print("=" * 30)
65
+ for result in results:
66
+ print(result)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py ADDED
@@ -0,0 +1,404 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import logging
5
+ import threading
6
+ from pathlib import Path
7
+ from typing import Dict, List, Tuple
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ # Set up logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
14
+ )
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Add Frontier-CS to path
18
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
19
+ if str(frontier_cs_path) not in sys.path:
20
+ sys.path.insert(0, str(frontier_cs_path))
21
+
22
+ try:
23
+ from frontier_cs.evaluator import FrontierCSEvaluator
24
+ from frontier_cs.runner.base import EvaluationStatus
25
+ except ImportError as e:
26
+ logger.error(f"Failed to import Frontier-CS: {e}")
27
+ logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
28
+ sys.exit(1)
29
+
30
+
31
+ class BestProgramEvaluator:
32
+ """Evaluates all best_program.cpp files in the outputs directory."""
33
+
34
+ def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8):
35
+ """
36
+ Initialize the evaluator.
37
+
38
+ Args:
39
+ outputs_dir: Path to the outputs directory containing problem folders
40
+ judge_url: URL of the judge server
41
+ num_workers: Number of parallel workers for evaluation
42
+ """
43
+ self.outputs_dir = Path(outputs_dir)
44
+ self.judge_url = judge_url
45
+ self.num_workers = num_workers
46
+
47
+ # Use thread-local storage for evaluator instances (avoid race condition)
48
+ self._evaluator_local = threading.local()
49
+
50
+ self.results = []
51
+
52
+ # Create results directory in the script's directory
53
+ self.results_dir = Path(__file__).resolve().parent / "evaluation_results"
54
+ self.results_dir.mkdir(exist_ok=True)
55
+ logger.info(f"Results will be saved to {self.results_dir}")
56
+ logger.info(f"Using {self.num_workers} parallel workers with thread-local evaluators")
57
+
58
+ def _get_evaluator(self) -> 'FrontierCSEvaluator':
59
+ """
60
+ Get the evaluator for the current thread.
61
+ Creates a new instance if this thread hasn't created one yet.
62
+ This avoids race conditions from sharing a single evaluator across threads.
63
+ """
64
+ if not hasattr(self._evaluator_local, 'evaluator'):
65
+ self._evaluator_local.evaluator = FrontierCSEvaluator(
66
+ backend="docker",
67
+ judge_url=self.judge_url,
68
+ )
69
+ logger.debug(f"Created new evaluator for thread {threading.current_thread().name}")
70
+ return self._evaluator_local.evaluator
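+ # Note: threading.local() gives each worker thread its own FrontierCSEvaluator,
+ # so concurrent calls to evaluate_program() never share per-instance state
+ # (no explicit locking needed).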
71
+
72
+ def find_best_programs(self) -> Dict[str, Path]:
73
+ """
74
+ Find all best_program.cpp files in the outputs directory.
75
+
76
+ Returns:
77
+ Dict mapping problem_id to best_program.cpp path
78
+ """
79
+ best_programs = {}
80
+
81
+ # Look for frontier_cs subdirectory
82
+ frontier_cs_dir = self.outputs_dir / "frontier_cs"
83
+ if not frontier_cs_dir.exists():
84
+ logger.error(f"frontier_cs directory not found at {frontier_cs_dir}")
85
+ return best_programs
86
+
87
+ # Iterate through problem directories
88
+ for problem_dir in sorted(frontier_cs_dir.iterdir()):
89
+ if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"):
90
+ continue
91
+
92
+ # Extract problem ID
93
+ problem_id = problem_dir.name.replace("problem_", "")
94
+
95
+ # Look for best_program.cpp
96
+ best_program_path = problem_dir / "best" / "best_program.cpp"
97
+ if best_program_path.exists():
98
+ best_programs[problem_id] = best_program_path
99
+ logger.info(f"Found best_program.cpp for problem {problem_id}")
100
+ else:
101
+ logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}")
102
+
103
+ return best_programs
104
+
105
+ def evaluate_program(self, problem_id: str, program_path: Path) -> Dict:
106
+ """
107
+ Evaluate a single best_program.cpp file.
108
+
109
+ Args:
110
+ problem_id: The Frontier-CS problem ID
111
+ program_path: Path to the best_program.cpp file
112
+
113
+ Returns:
114
+ Dictionary with evaluation results
115
+ """
116
+ logger.info(f"Evaluating problem {problem_id}: {program_path}")
117
+
118
+ try:
119
+ # Read the solution code
120
+ if not program_path.exists():
121
+ error_msg = f"Solution file not found: {program_path}"
122
+ logger.error(error_msg)
123
+ return {
124
+ "problem_id": problem_id,
125
+ "program_path": str(program_path),
126
+ "combined_score": 0.0,
127
+ "runs_successfully": 0.0,
128
+ "status": "error",
129
+ "message": error_msg,
130
+ }
131
+
132
+ # Read the code
133
+             code = program_path.read_text().replace(
+                 "// EVOLVE-BLOCK-START", ""
+             ).replace(
+                 "// EVOLVE-BLOCK-END", ""
+             ).strip()
+
+             logger.info(f"Code extracted from {program_path}, length: {len(code)} characters")
+
+             # Evaluate the solution (use thread-local evaluator)
+             evaluator = self._get_evaluator()
+             result = evaluator.evaluate(
+                 track="algorithmic",
+                 problem_id=problem_id,
+                 code=code,
+                 backend="docker",
+             )
+
+             logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}")
+
+             # Log the result object and its properties
+             logger.info(f"Judger output for problem {problem_id}:")
+             logger.info(f"  Status: {result.status}")
+             logger.info(f"  Message: {result.message}")
+             if hasattr(result, 'score'):
+                 logger.info(f"  Score: {result.score}")
+             if hasattr(result, 'duration_seconds'):
+                 logger.info(f"  Duration: {result.duration_seconds}s")
+             if hasattr(result, 'metadata'):
+                 logger.info(f"  Metadata: {result.metadata}")
+             logger.info(f"  Full result object: {result}")
+
+             # Process result
+             if result.status == EvaluationStatus.SUCCESS:
+                 score = result.score
+                 logger.info(f"Problem {problem_id}: Score = {score}")
+
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": float(score),
+                     "runs_successfully": 1.0,
+                     "status": "success",
+                     "message": result.message or "Evaluation successful",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                     "metadata": result.metadata if hasattr(result, 'metadata') else None,
+                 }
+             elif result.status == EvaluationStatus.TIMEOUT:
+                 logger.warning(f"Problem {problem_id}: Evaluation timed out")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "timeout",
+                     "message": f"Evaluation timed out: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             elif result.status == EvaluationStatus.COMPILATION_ERROR:
+                 logger.warning(f"Problem {problem_id}: Compilation error")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "compilation_error",
+                     "message": f"Compilation error: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             else:
+                 logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": str(result.status),
+                     "message": f"Evaluation failed: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+
+         except Exception as e:
+             logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}")
+             logger.error(f"Exception type: {type(e).__name__}")
+             import traceback
+             logger.error(traceback.format_exc())
+
+             return {
+                 "problem_id": problem_id,
+                 "program_path": str(program_path),
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "exception",
+                 "message": str(e),
+             }
+
+     def run_all_evaluations(self) -> List[Dict]:
+         """
+         Run evaluations for all best_program.cpp files sequentially (one at a time).
+
+         Returns:
+             List of evaluation results
+         """
+         logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}")
+
+         best_programs = self.find_best_programs()
+         logger.info(f"Found {len(best_programs)} best_program.cpp files")
+
+         if not best_programs:
+             logger.warning("No best_program.cpp files found!")
+             return []
+
+         # Sort problems by ID for consistent ordering
+         sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0]))
+
+         # Evaluate each program sequentially (no parallelization)
+         results = []
+         total = len(sorted_problems)
+         for idx, (problem_id, program_path) in enumerate(sorted_problems, 1):
+             logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})")
+             try:
+                 result = self.evaluate_program(problem_id, program_path)
+
+                 # CRITICAL: Ensure problem_id matches
+                 if result.get("problem_id") != problem_id:
+                     logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}")
+                     result["problem_id"] = problem_id  # Force correct problem_id
+
+                 results.append(result)
+                 self.results.append(result)
+
+                 logger.info(f"[SAVE] Saving problem {problem_id} result to file")
+                 # Save result immediately after evaluation
+                 self.save_problem_result(result)
+
+             except Exception as e:
+                 logger.error(f"Exception evaluating problem {problem_id}: {str(e)}")
+                 import traceback
+                 logger.error(traceback.format_exc())
+
+                 error_result = {
+                     "problem_id": problem_id,
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "exception",
+                     "message": str(e),
+                 }
+                 results.append(error_result)
+                 self.results.append(error_result)
+                 self.save_problem_result(error_result)
+
+         return results
+
+     def save_results(self, output_file: str = "evaluation_results.json"):
+         """
+         Save evaluation results to a JSON file.
+
+         Args:
+             output_file: Path to save the results
+         """
+         output_path = Path(output_file)
+         with open(output_path, 'w') as f:
+             json.dump(self.results, f, indent=2)
+         logger.info(f"Results saved to {output_path}")
+
+     def save_problem_result(self, result: Dict):
+         """
+         Save individual problem result to a separate file.
+
+         Args:
+             result: The evaluation result for a single problem
+         """
+         problem_id = result.get("problem_id", "unknown")
+         result_file = self.results_dir / f"problem_{problem_id}.json"
+
+         with open(result_file, 'w') as f:
+             json.dump(result, f, indent=2)
+         logger.info(f"Problem {problem_id} result saved to {result_file}")
+
+     def print_summary(self):
+         """Print a summary of the evaluation results."""
+         if not self.results:
+             logger.info("No results to summarize")
+             return
+
+         logger.info("\n" + "="*80)
+         logger.info("EVALUATION SUMMARY")
+         logger.info("="*80)
+
+         successful = [r for r in self.results if r.get("status") == "success"]
+         timeout = [r for r in self.results if r.get("status") == "timeout"]
+         compilation_error = [r for r in self.results if r.get("status") == "compilation_error"]
+         other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]]
+
+         logger.info(f"Total problems evaluated: {len(self.results)}")
+         logger.info(f"Successful: {len(successful)}")
+         logger.info(f"Timeouts: {len(timeout)}")
+         logger.info(f"Compilation errors: {len(compilation_error)}")
+         logger.info(f"Other errors: {len(other_error)}")
+
+         if successful:
+             scores = [r["combined_score"] for r in successful]
+             logger.info(f"\nSuccessful evaluation scores:")
+             logger.info(f"  Average score: {sum(scores) / len(scores):.2f}")
+             logger.info(f"  Min score: {min(scores):.2f}")
+             logger.info(f"  Max score: {max(scores):.2f}")
+
+             logger.info(f"\nTop 5 problems by score:")
+             top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5]
+             for i, result in enumerate(top_5, 1):
+                 logger.info(f"  {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}")
+
+         logger.info("="*80 + "\n")
+
+
+ def main():
+     """Main entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Evaluate all best_program.cpp files in the outputs directory"
+     )
+
+     # Default outputs directory is two levels up from this script
+     default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs"
+
+     parser.add_argument(
+         "--outputs-dir",
+         type=str,
+         default=str(default_outputs_dir),
+         help="Path to the outputs directory (default: ../../outputs from script location)"
+     )
+     parser.add_argument(
+         "--judge-url",
+         type=str,
+         default="http://localhost:8081",
+         help="URL of the judge server (default: http://localhost:8081)"
+     )
+     parser.add_argument(
+         "--output-file",
+         type=str,
+         default="evaluation_results.json",
+         help="Path to save the evaluation results (default: evaluation_results.json)"
+     )
+     parser.add_argument(
+         "--workers",
+         type=int,
+         default=8,
+ help="Number of parallel workers for evaluation (default: 8)"
+     )
+
+     args = parser.parse_args()
+
+     # Run evaluations
+     evaluator = BestProgramEvaluator(
+         outputs_dir=args.outputs_dir,
+         judge_url=args.judge_url,
+         num_workers=args.workers
+     )
+
+     results = evaluator.run_all_evaluations()
+     evaluator.save_results(args.output_file)
+     evaluator.print_summary()
+
+     logger.info(f"Evaluation complete. Results saved to {args.output_file}")
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/gpu_mode/mla_decode/config.yaml ADDED
@@ -0,0 +1,355 @@
+ # GPU Mode: MLA Decode (Multi-Head Latent Attention) Triton Kernel
+
+ max_iterations: 100
+ checkpoint_interval: 1
+ log_level: "INFO"
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+       api_base: https://api.openai.com/v1
+       temperature: 1.0
+       # top_p: 0.95  # omitted by default; some providers (e.g., Anthropic) reject requests that set both temperature and top_p
+       max_tokens: 32000
+       timeout: 600
+
+ prompt:
+   system_message: |
+     You are an expert Triton engineer tasked with translating PyTorch code into highly optimized Triton kernel code.
+
+     Below is a PyTorch implementation of the multi-head latent attention (MLA) module. You will want to implement a Triton kernel for the operations in the forward call:
+
+     ```python
+     import math
+     from dataclasses import dataclass
+     import torch
+     from torch import nn
+     import torch.nn.functional as F
+
+     class RoPE(nn.Module):
+         def __init__(self, d_model: int):
+             super().__init__()
+             self.d_model = d_model
+             theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+             self.register_buffer("theta", theta)
+
+         def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+             x1, x2 = x.chunk(2, dim=-1)
+             return torch.cat((-x2, x1), dim=-1)
+
+         def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+             seq_len = x.size(-2)
+             d_model = x.size(-1)
+             assert d_model == self.d_model
+             seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+             idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+             idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+             cos = idx_theta2.cos().to(torch.bfloat16)
+             sin = idx_theta2.sin().to(torch.bfloat16)
+             return x * cos + self.rotate_half(x) * sin
+
+     class KVCache(nn.Module):
+         def __init__(self, kv_cache_shape: tuple) -> None:
+             super().__init__()
+             self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16, device='cuda'))
+             self.seq_len = 0
+             self.zero()
+
+         def zero(self) -> None:
+             self.data.zero_()
+
+         def get_data(self) -> torch.Tensor:
+             return self.data
+
+         def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+             assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+             self.data = self.data.to(c_kv.dtype)
+             self.data[
+                 :, self.seq_len : self.seq_len + c_kv.size(1), :
+             ] = c_kv
+             self.seq_len += c_kv.size(1)
+
+             return self.data[:, :self.seq_len], self.seq_len
+
+     @dataclass
+     class Config:
+         batch_size: int
+         dim: int
+         n_heads: int
+         q_lora_rank: int
+         kv_lora_rank: int
+         qk_nope_head_dim: int
+         qk_rope_head_dim: int
+         v_head_dim: int
+         seq_len: int
+         max_seq_len: int
+         kv_cache_shape: tuple
+         Q_proj_down_weight: torch.Tensor
+         Q_proj_up_weight: torch.Tensor
+         KV_proj_down_weight: torch.Tensor
+         KV_proj_up_weight: torch.Tensor
+         wo_weight: torch.Tensor
+
+     class MLA(nn.Module):
+         def __init__(self, config: Config):
+             super().__init__()
+             self.dim = config.dim
+             self.n_heads = config.n_heads
+             self.q_lora_rank = config.q_lora_rank
+             self.kv_lora_rank = config.kv_lora_rank
+             self.nope_head_dim = config.qk_nope_head_dim
+             self.rope_head_dim = config.qk_rope_head_dim
+             self.v_head_dim = config.v_head_dim
+             # Down-projection matrices
+             self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, bias=False, dtype=torch.bfloat16)
+             self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, bias=False, dtype=torch.bfloat16)
+
+             # Up-projection and rope projection matrices
+             self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+             self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, bias=False, dtype=torch.bfloat16)
+
+             # RoPE on half embeddings
+             self.q_rope = RoPE(self.rope_head_dim)
+             self.k_rope = RoPE(self.rope_head_dim)
+
+             # Output projection
+             self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+             self.eps = 1e-6
+
+         def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+             # seq_len = 1 always here
+             batch_size, seq_len, model_dim = x.size()
+
+             ## Step 1: Handle down-projection + KV cache ##
+
+             q_lora = self.Q_proj_down(x)
+             kv_lora = self.KV_proj_down(x)
+             kv_lora, kv_len = kv_cache(kv_lora)
+             query_pos = kv_len - 1
+
+             ## Step 2: Up-project and prepare NoPE + RoPE ##
+
+             # Handle queries Q first
+             q_nope_and_rope = self.Q_proj_up(q_lora).view(
+                 batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+             q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+             # Handle keys and values K/V. V does not need RoPE
+             kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+             kv_nope = self.KV_proj_up(kv_nope).view(
+                 batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+             k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+             ## Step 3: Handle RoPE Stream ##
+
+             # Compute RoPE for queries and combine with no-RoPE part
+             q_rope = q_rope.permute(0, 2, 1, 3)  # bs x n_heads x seq_len x rope_head_dim
+             q_rope = self.q_rope(q_rope, start_pos=query_pos)
+
+             q_nope = q_nope.permute(0, 2, 1, 3)  # bs x n_heads x seq_len x nope_head_dim
+             q = torch.concat([q_nope, q_rope], dim=-1)
+
+             # Compute RoPE for keys and combine with no-RoPE part
+             k_rope = k_rope[:, None, :, :]
+             k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+             k_nope = k_nope.permute(0, 2, 1, 3)  # bs x n_heads x kv_len x nope_head_dim
+             k = torch.concat([k_nope, k_rope], dim=-1)
+
+             ## Step 4: Compute Multi-head Attention ##
+
+             v = v.permute(0, 2, 1, 3)  # bs x n_heads x kv_len x v_head_dim
+             scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+             attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+             y = torch.matmul(attn, v).view(batch_size, 1, -1)
+             y = self.wo(y)
+
+             return y, kv_cache.get_data()
+     ```
+
+     Your function should be defined as 'custom_kernel' (skeleton provided below)
+
+     ```python
+     ### DO NOT CHANGE THIS IMPORT STATEMENTS BLOCK ###
+     import os
+     import math
+     from typing import Tuple
+     import torch
+     import torch.nn.functional as F
+     import triton
+     from reference import KVCache, Config  # Definitions of the KVCache and Config classes are shown above. You must import them this way; do not rewrite them yourself.
+     ### END OF IMPORT STATEMENTS BLOCK ###
+
+     ### Import other packages here if needed
+
+     def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, KVCache]:
+         config, x, kv_cache = data
+
+         bs = config.batch_size
+         sl = config.seq_len
+         pl = kv_cache.seq_len
+         msl = config.max_seq_len
+         nh = config.n_heads
+         d = config.dim
+         dq = config.q_lora_rank
+         dkv = config.kv_lora_rank
+         dnope = config.qk_nope_head_dim
+         drope = config.qk_rope_head_dim
+         dv = config.v_head_dim
+
+         wDQ = config.Q_proj_down_weight
+         wDKV = config.KV_proj_down_weight
+         wUQ = config.Q_proj_up_weight
+         wUKV = config.KV_proj_up_weight
+         wO = config.wo_weight
+
+         # Perform MLA operations to process data into output and updated kv_cache
+
+         return output, kv_cache.data
+     ```
+
+     with the following signature:
+
+     Input:
+     - `data`: Tuple of (config: Config, x: torch.Tensor, kv_cache: KVCache)
+       - config: An instance of class `Config` containing model configurations and weights
+       - x: Input tensor of shape [batch_size, seq_len, dim]
+       - kv_cache: An instance of KVCache class for caching the keys and values
+
+     Output:
+     - output: Output tensor [batch_size, seq_len, dim]
+     - kv_cache.data: The data field of the updated `KVCache` instance with the new keys and values added
+
+     To warm you up on writing optimized Triton code, here is example code that is correct for your task but very unoptimized. Your code should be as optimized as possible while remaining correct.
+
+     ```python
+     import os
+     import math
+     from typing import Tuple
+     import torch
+     import torch.nn.functional as F
+     import triton
+     import triton.language as tl
+     from reference import KVCache, Config
+
+     @triton.jit
+     def rope_swap_halves_kernel(
+         x_ptr,
+         cos_ptr, sin_ptr,
+         B: tl.constexpr,
+         T: tl.constexpr,
+         D: tl.constexpr,
+         stride_xb, stride_xt, stride_xd,
+         stride_cos_t, stride_cos_d,
+         stride_sin_t, stride_sin_d,
+         BLOCK_HALF: tl.constexpr,
+     ):
+         pid = tl.program_id(0)
+         bt = pid
+         b = bt // T
+         t = bt - b * T
+         half = D // 2
+         off = tl.arange(0, BLOCK_HALF)
+         mask = off < half
+         x_base = x_ptr + b * stride_xb + t * stride_xt
+         x0_ptr = x_base + off * stride_xd
+         x1_ptr = x_base + (half + off) * stride_xd
+         cos_base = cos_ptr + t * stride_cos_t
+         sin_base = sin_ptr + t * stride_sin_t
+         c_ptr = cos_base + off * stride_cos_d
+         s_ptr = sin_base + off * stride_sin_d
+         x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32)
+         x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32)
+         c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32)
+         s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32)
+         out0 = x0 * c - x1 * s
+         out1 = x1 * c + x0 * s
+         tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask)
+         tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask)
+
+     # ... (see initial_program.py for full working baseline)
+     ```
+
+     Below are the different configs that your kernel will be tested on:
+
+     Common configs:
+     - {"batch_size": 128, "seq_len": 1, "kv_lora_rank": 512, "qk_rope_head_dim": 64, "v_head_dim": 128, "n_heads": 128, "dim": 7168, "q_lora_rank": 1536, "max_seq_len": 8192}
+
+     For correctness check:
+     - {"prefill": 128}
+     - {"prefill": 512}
+     - {"prefill": 1024}
+     - {"prefill": 2048}
+
+     For performance benchmark (optimize runtime for these):
+     - {"prefill": 6144}
+
+     Rules:
+     - The tensor arguments passed in will already be on your CUDA device.
+     - The weights for all parameters in the MLA will be given as input.
+     - All weights and data will be in `torch.bfloat16` format.
+     - Define all of your code in one final ```python ``` block.
+     - The entrypoint to your code must be named 'custom_kernel'.
+     - You will be using Triton 3.4.0 and your kernels will be run on an NVIDIA H200 GPU.
+     - Consider optimizing multiple operations with Triton, not just softmax (e.g., RoPE, attention).
+     - You are allowed to use torch.compile().
+
+     Important rules in Triton 3.4.0:
+     - `tl.load` does not have an argument called `dtype`. Never use it like `tl.load(..., dtype=...)`.
+     - Triton dtypes are not callable, so never use them like `tl.float16(1.0)`, `tl.float32(0.0)`.
+     - `tl.arange(start, end)`:
+       - range length (end - start) must be power-of-2
+       - start, end must be of type `tl.constexpr`
+     - `tl.range(start, end, step, num_stages)`:
+       - keep loop index type stable, don't reassign it
+       - start, end, step do not have to be `tl.constexpr` but must stay scalar integer types
+       - num_stages must be `tl.constexpr`
+     - Do not write something like x[0] or offs[0] inside a Triton kernel. Triton tensors are SIMD vectors; scalar indexing like [0] is not generally supported.
+
+     Here's a simple example that correctly follows these rules:
+
+     ```python
+     import torch
+     import triton
+     import triton.language as tl
+
+     @triton.jit
+     def kernel_right(
+         x_ptr, y_ptr, out_ptr,
+         n_elements: tl.constexpr,
+         BLOCK: tl.constexpr,
+         ROW_STEP: tl.constexpr,
+         NUM_STAGES: tl.constexpr,
+     ):
+         pid = tl.program_id(axis=0)
+         offs = pid * BLOCK + tl.arange(0, BLOCK)
+         mask = offs < n_elements
+         x = tl.load(x_ptr + offs, mask=mask, other=0.0)
+         y = tl.load(y_ptr + offs, mask=mask, other=0.0)
+         one_f32 = tl.full([], 1.0, tl.float32)
+         acc = tl.zeros((BLOCK,), dtype=tl.float32)
+         acc = tl.cast(x, tl.float32) + tl.cast(y, tl.float32) + one_f32
+         base = tl.full([], pid * BLOCK, tl.int32)
+         x0 = tl.load(x_ptr + base, mask=(base < n_elements), other=0.0)
+         x0_vec = tl.full((BLOCK,), x0, tl.float32)
+         out_vec = acc + x0_vec
+         n_rows = tl.full([], 4, tl.int32)
+         extra = tl.zeros((BLOCK,), dtype=tl.float32)
+         for r in tl.range(0, n_rows, ROW_STEP, num_stages=NUM_STAGES):
+             shift = r * tl.full([], 1, tl.int32)
+             offs_r = offs + shift
+             xr = tl.load(x_ptr + offs_r, mask=(offs_r < n_elements), other=0.0)
+             extra += tl.cast(xr, tl.float32)
+         out_vec = out_vec + extra
+         tl.store(out_ptr + offs, tl.cast(out_vec, tl.float16), mask=mask)
+     ```
+ evaluator:
+   timeout: 600
+   max_retries: 3
+   cascade_evaluation: true
+   cascade_thresholds: [0.4, 0.3]
+
+ diff_based_generation: true
+ max_solution_length: 60000
+ random_seed: 42
benchmarks/gpu_mode/mla_decode/initial_program.py ADDED
@@ -0,0 +1,245 @@
+ # EVOLVE-BLOCK-START
+ """
+ Initial MLA Decode submission — optimised baseline with Triton softmax and RoPE kernels.
+ """
+
+ import os
+ import math
+ from typing import Tuple
+ import torch
+ import torch.nn.functional as F
+ import triton
+ import triton.language as tl
+ from reference import KVCache, Config
+
+
+ @triton.jit
+ def rope_swap_halves_kernel(
+     x_ptr,
+     cos_ptr, sin_ptr,
+     B: tl.constexpr,
+     T: tl.constexpr,
+     D: tl.constexpr,
+     stride_xb, stride_xt, stride_xd,
+     stride_cos_t, stride_cos_d,
+     stride_sin_t, stride_sin_d,
+     BLOCK_HALF: tl.constexpr,
+ ):
+     pid = tl.program_id(0)
+     bt = pid
+     b = bt // T
+     t = bt - b * T
+
+     half = D // 2
+
+     off = tl.arange(0, BLOCK_HALF)
+     mask = off < half
+
+     x_base = x_ptr + b * stride_xb + t * stride_xt
+     x0_ptr = x_base + off * stride_xd
+     x1_ptr = x_base + (half + off) * stride_xd
+
+     cos_base = cos_ptr + t * stride_cos_t
+     sin_base = sin_ptr + t * stride_sin_t
+
+     c_ptr = cos_base + off * stride_cos_d
+     s_ptr = sin_base + off * stride_sin_d
+
+     x0 = tl.load(x0_ptr, mask=mask, other=0.0).to(tl.float32)
+     x1 = tl.load(x1_ptr, mask=mask, other=0.0).to(tl.float32)
+     c = tl.load(c_ptr, mask=mask, other=0.0).to(tl.float32)
+     s = tl.load(s_ptr, mask=mask, other=0.0).to(tl.float32)
+
+     out0 = x0 * c - x1 * s
+     out1 = x1 * c + x0 * s
+
+     tl.store(x0_ptr, out0.to(tl.bfloat16), mask=mask)
+     tl.store(x1_ptr, out1.to(tl.bfloat16), mask=mask)
+
+
+ def rope_inplace_query(q_rope: torch.Tensor, cos_q: torch.Tensor, sin_q: torch.Tensor):
+     assert q_rope.is_cuda
+     assert q_rope.shape[-1] % 2 == 0
+     bs, nh, d_rope = q_rope.shape
+
+     half = d_rope // 2
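+     # tl.arange requires a power-of-2 length, so round the half-dimension up to the next power of two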
+ BLOCK_HALF = 1 << (half - 1).bit_length()
67
+
68
+ grid = (bs * nh,)
69
+
70
+ rope_swap_halves_kernel[grid](
71
+ q_rope,
72
+ cos_q, sin_q,
73
+ B=bs, T=nh, D=d_rope,
74
+ stride_xb=q_rope.stride(0),
75
+ stride_xt=q_rope.stride(1),
76
+ stride_xd=q_rope.stride(2),
77
+ stride_cos_t=0, stride_cos_d=cos_q.stride(0),
78
+ stride_sin_t=0, stride_sin_d=sin_q.stride(0),
79
+ BLOCK_HALF=BLOCK_HALF,
80
+ num_warps=4,
81
+ )
82
+
83
+
84
+ _rope_cache = {}
85
+
86
+
87
+ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
88
+ half = x.shape[-1] // 2
89
+ return torch.cat((-x[..., half:], x[..., :half]), dim=-1)
90
+
91
+
92
+ def _get_rope_tables(dim: int, max_seq_len: int, device: torch.device):
93
+ key = (dim, max_seq_len, device)
94
+ if key not in _rope_cache:
95
+ half = dim // 2
96
+ theta = (10000.0 ** (-torch.arange(half, dtype=torch.float32, device=device) / half)).to(
97
+ torch.bfloat16
98
+ )
99
+ pos = torch.arange(max_seq_len, dtype=torch.int64, device=device).unsqueeze_(1)
100
+ idx = pos * theta[None, :]
101
+ idx = torch.cat([idx, idx], dim=-1)
102
+ _rope_cache[key] = (idx.cos().to(torch.bfloat16), idx.sin().to(torch.bfloat16))
103
+ return _rope_cache[key]
104
+
105
+
106
+ @triton.jit
107
+ def _softmax_kernel(
108
+ out_ptr, in_ptr,
109
+ stride_out, stride_in,
110
+ n_cols,
111
+ BLOCK_SIZE: tl.constexpr,
112
+ NUM_STAGES: tl.constexpr,
113
+ ):
114
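+     # Numerically stable softmax over one row in three passes:
+     # (1) scan for the row max, (2) store shifted exponentials and accumulate their sum,
+     # (3) re-read the stored exponentials and normalise by the sum.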
+     row = tl.program_id(0)
+     row_off_in = row * stride_in
+     row_off_out = row * stride_out
+
+     max_val = tl.full([BLOCK_SIZE], -float("inf"), tl.float32)
+     col = tl.arange(0, BLOCK_SIZE)
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf'))
+         max_val = tl.maximum(max_val, tl.cast(val, tl.float32))
+     row_max = tl.max(max_val)
+
+     sum_val = tl.full([BLOCK_SIZE], 0.0, tl.float32)
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(in_ptr + row_off_in + cur, mask=mask, other=-float('inf'))
+         exp_val = tl.exp(tl.cast(val, tl.float32) - row_max)
+         tl.store(out_ptr + row_off_out + cur, tl.cast(exp_val, tl.bfloat16), mask=mask)
+         sum_val += exp_val
+     row_sum = tl.sum(sum_val)
+
+     for start in range(0, n_cols, BLOCK_SIZE):
+         cur = start + col
+         mask = cur < n_cols
+         val = tl.load(out_ptr + row_off_out + cur, mask=mask, other=0.0)
+         norm = tl.cast(val, tl.float32) / row_sum
+         tl.store(out_ptr + row_off_out + cur, tl.cast(norm, tl.bfloat16), mask=mask)
+
+
+ def _triton_softmax(x: torch.Tensor) -> torch.Tensor:
+     assert x.is_cuda and x.dtype == torch.bfloat16
+     n_rows, n_cols = x.shape
+
+     if n_cols <= 32:
+         BLOCK_SIZE = 32
+     elif n_cols <= 64:
+         BLOCK_SIZE = 64
+     elif n_cols <= 128:
+         BLOCK_SIZE = 128
+     else:
+         BLOCK_SIZE = 1 << (n_cols - 1).bit_length()
+         BLOCK_SIZE = min(BLOCK_SIZE, 1024)
+
+     out = torch.empty_like(x)
+     grid = (n_rows,)
+     _softmax_kernel[grid](
+         out, x,
+         out.stride(0), x.stride(0),
+         n_cols,
+         BLOCK_SIZE=BLOCK_SIZE,
+         NUM_STAGES=2,
+         num_warps=4,
+     )
+     return out
+
+
+ def custom_kernel(data: Tuple[Config, torch.Tensor, KVCache]) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Optimised forward step of the Multi-head Latent Attention (MLA) module.
+     """
+     config, x, kv_cache = data
+
+     bs = config.batch_size
+     sl = config.seq_len
+     nh = config.n_heads
+     dq = config.q_lora_rank
+     dkv = config.kv_lora_rank
+     d_nope = config.qk_nope_head_dim
+     d_rope = config.qk_rope_head_dim
+     dv = config.v_head_dim
+     msl = config.max_seq_len
+
+     wDQ = config.Q_proj_down_weight
+     wDKV = config.KV_proj_down_weight
+     wUQ = config.Q_proj_up_weight
+     wUKV = config.KV_proj_up_weight
+     wO = config.wo_weight
+
+     q_lora = F.linear(x, wDQ)
+     kv_lora_input = F.linear(x, wDKV)
+
+     kv_lora, kv_len = kv_cache(kv_lora_input)
+     query_pos = kv_len - 1
+
+     q_up = F.linear(q_lora.squeeze(1), wUQ)
+     q_up = q_up.view(bs, nh, d_nope + d_rope)
+     q_nope = q_up[..., :d_nope]
+     q_rope = q_up[..., d_nope:]
+
+     kv_nope_input = kv_lora[..., :dkv]
+     k_rope_input = kv_lora[..., dkv:]
+
+     cos_table, sin_table = _get_rope_tables(d_rope, msl, x.device)
+
+     cos_q = cos_table[query_pos].view(d_rope).contiguous()
+     sin_q = sin_table[query_pos].view(d_rope).contiguous()
+     rope_inplace_query(q_rope, cos_q, sin_q)
+
+     cos_k = cos_table[:kv_len]
+     sin_k = sin_table[:kv_len]
+     k_rope = k_rope_input * cos_k + _rotate_half(k_rope_input) * sin_k
+
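+     # Weight absorption: fold the per-head K up-projection into the query so scores
+     # can be computed directly against the compressed latent cache, avoiding
+     # materialising full per-head keys of length kv_len.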
+     wUKV_view = wUKV.view(nh, d_nope + dv, dkv)
+     wK = wUKV_view[:, :d_nope, :]
+     q_nope_latent = torch.einsum('bhd,hdk->bhk', q_nope, wK)
+
+     kv_nope_T = kv_nope_input.transpose(1, 2)
+     scores_nope = torch.matmul(q_nope_latent, kv_nope_T)
+
+     scores_rope = torch.matmul(q_rope, k_rope.transpose(-2, -1))
+
+     scale = 1.0 / math.sqrt(d_nope + d_rope)
+     scores = (scores_nope + scores_rope) * scale
+
+     scores_flat = scores.reshape(bs * nh, kv_len)
+     attn_flat = _triton_softmax(scores_flat)
+     attn = attn_flat.view(bs, nh, kv_len)
+
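+     # Value-side absorption: aggregate the latent cache with the attention weights
+     # first, then apply the per-head V up-projection to the much smaller result.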
+     M = torch.matmul(attn, kv_nope_input)
+
+     wV = wUKV_view[:, d_nope:, :]
+     wV_T = wV.permute(0, 2, 1)
+     y_head = torch.einsum('bhd,hdk->bhk', M, wV_T)
+
+     y = y_head.reshape(bs, nh * dv)
+     y = y.unsqueeze(1)
+     output = F.linear(y, wO)
+
+     return output, kv_cache.data
+ # EVOLVE-BLOCK-END
benchmarks/gpu_mode/mla_decode/reference.py ADDED
@@ -0,0 +1,520 @@
+ """
2
+ Reference implementation for MLA Decode (Multi-Head Latent Attention) Triton kernel.
3
+ Same test cases, benchmarks, generate_input, ref_kernel, and check_implementation.
4
+ """
+
+ import math
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ # ---------------------------------------------------------------------------
+ # Scoring and benchmark configuration (read by shared_eval.py)
+ # ---------------------------------------------------------------------------
+
+ SCORE_SCALE = 3000.0
+
+ # MLA uses wall-clock timing, 1% rel error, no wall clock timeout, torch.no_grad()
+ BENCH_USE_CUDA_EVENTS = False
+ BENCH_REL_ERROR = 0.01
+ BENCH_WALL_TIMEOUT_NS = None
+ BENCH_NO_GRAD = True
+ BENCH_MAX_REPEATS = 100
+ BENCH_MAX_TIME_NS = 10e9
+ BENCH_WARMUP_STYLE = 'timed_calls'
+
+ # ---------------------------------------------------------------------------
+ # Model classes (needed by both reference and submissions)
+ # ---------------------------------------------------------------------------
+
+
+ class RoPE(nn.Module):
+     def __init__(self, d_model: int):
+         super().__init__()
+         self.d_model = d_model
+         theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+         self.register_buffer("theta", theta)
+
+     def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+         seq_len = x.size(-2)
+         d_model = x.size(-1)
+         assert d_model == self.d_model
+         seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+         idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+         idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+         cos = idx_theta2.cos().to(torch.bfloat16)
+         sin = idx_theta2.sin().to(torch.bfloat16)
+         return x * cos + self.rotate_half(x) * sin
+
+
+ class KVCache(nn.Module):
+     def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+         super().__init__(**kwargs)
+         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+         self.seq_len = 0
+         self.zero()
+
+     def zero(self) -> None:
+         self.data.zero_()
+
+     def get_data(self) -> torch.Tensor:
+         return self.data
+
+     def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+         assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+
+         self.data = self.data.to(c_kv.dtype)
+         self.data[
+             :, self.seq_len: self.seq_len + c_kv.size(1), :
+         ] = c_kv
+         self.seq_len += c_kv.size(1)
+
+         return self.data[:, :self.seq_len], self.seq_len
+
+
+ @dataclass
+ class Config:
+     batch_size: int
+     dim: int
+     n_heads: int
+     q_lora_rank: int
+     kv_lora_rank: int
+     qk_nope_head_dim: int
+     qk_rope_head_dim: int
+     v_head_dim: int
+     seq_len: int
+     max_seq_len: int
+     kv_cache_shape: tuple
+     Q_proj_down_weight: torch.Tensor
+     Q_proj_up_weight: torch.Tensor
+     KV_proj_down_weight: torch.Tensor
+     KV_proj_up_weight: torch.Tensor
+     wo_weight: torch.Tensor
+
+
+ class MLA(nn.Module):
+     def __init__(self, config: Config):
+         super().__init__()
+         self.dim = config.dim
+         self.n_heads = config.n_heads
+         self.q_lora_rank = config.q_lora_rank
+         self.kv_lora_rank = config.kv_lora_rank
+         self.nope_head_dim = config.qk_nope_head_dim
+         self.rope_head_dim = config.qk_rope_head_dim
+         self.v_head_dim = config.v_head_dim
+         self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+         self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.q_rope = RoPE(self.rope_head_dim)
+         self.k_rope = RoPE(self.rope_head_dim)
+         self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+         self.eps = 1e-6
+
+     def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+         batch_size, seq_len, model_dim = x.size()
+
+         q_lora = self.Q_proj_down(x)
+         kv_lora = self.KV_proj_down(x)
+         kv_lora, kv_len = kv_cache(kv_lora)
+         query_pos = kv_len - 1
+
+         q_nope_and_rope = self.Q_proj_up(q_lora).view(
+             batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+         q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+
+         kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+         kv_nope = self.KV_proj_up(kv_nope).view(
+             batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+         k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+
+         q_rope = q_rope.permute(0, 2, 1, 3)
+         q_rope = self.q_rope(q_rope, start_pos=query_pos)
+
+         q_nope = q_nope.permute(0, 2, 1, 3)
+         q = torch.concat([q_nope, q_rope], dim=-1)
+
+         k_rope = k_rope[:, None, :, :]
+         k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+         k_nope = k_nope.permute(0, 2, 1, 3)
+         k = torch.concat([k_nope, k_rope], dim=-1)
+
+         v = v.permute(0, 2, 1, 3)
+         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+         y = torch.matmul(attn, v).view(batch_size, 1, -1)
+         y = self.wo(y)
+
+         return y, kv_cache.get_data()
+
+
+ # ---------------------------------------------------------------------------
+ # Test / benchmark cases — from discover task.yml
+ # ---------------------------------------------------------------------------
+
+ TEST_CASES = [
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 128, "seed": 9247},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 512, "seed": 2197},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 1024, "seed": 9107},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 2048, "seed": 5291},
+ ]
+
+ BENCHMARK_CASES = [
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 4096, "seed": 9817},
+     {"batchsize": 128, "dim": 7168, "dq": 1536, "prefill": 6144, "seed": 5291},
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Input generation
+ # ---------------------------------------------------------------------------
+
+
+ def generate_input(batchsize, dim, dq, prefill, seed):
+     gen = torch.Generator(device='cuda')
+     gen.manual_seed(seed)
+
+     Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+     wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
+
+     config = Config(
+         batch_size=batchsize,
+         dim=dim,
+         q_lora_rank=dq,
+         n_heads=128,
+         kv_lora_rank=512,
+         qk_nope_head_dim=128,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         seq_len=1,
+         max_seq_len=8192,
+         kv_cache_shape=(batchsize, 8192, 512 + 64),
+         Q_proj_down_weight=Q_proj_down_weight,
+         Q_proj_up_weight=Q_proj_up_weight,
+         KV_proj_down_weight=KV_proj_down_weight,
+         KV_proj_up_weight=KV_proj_up_weight,
+         wo_weight=wo_weight,
+     )
+     x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+
+     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
+     pre_filled_cache = torch.randn(
+         (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim),
+         dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache(pre_filled_cache)
+
+     return config, x, kv_cache
+
+
+ # ---------------------------------------------------------------------------
+ # Reference kernel
+ # ---------------------------------------------------------------------------
+
+
+ def ref_kernel(data):
+     config, x, kv_cache = data
+
+     model = MLA(config).to('cuda')
+     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+     model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+     model.wo.weight = nn.Parameter(config.wo_weight)
+
+     output, kv_data = model(x, kv_cache)
+     return output, kv_data
+
+
+ # ---------------------------------------------------------------------------
+ # Correctness checking
+ # ---------------------------------------------------------------------------
+
+
+ @torch.no_grad()
+ def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5):
+     if received.shape != expected.shape:
+         return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+
+     diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+     tolerance = atol + rtol * torch.abs(expected.to(torch.float32))
+     tol_mismatched = diff > tolerance
+     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+     posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+     neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+     mismatched = torch.logical_or(
+         torch.logical_or(tol_mismatched, nan_mismatched),
+         torch.logical_or(posinf_mismatched, neginf_mismatched),
+     )
+
+     mismatched_indices = torch.nonzero(mismatched)
+     num_mismatched = mismatched.count_nonzero().item()
+
+     if num_mismatched >= 1:
+         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+         for index in mismatched_indices[:max_print]:
+             i = tuple(index.tolist())
+             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+         if num_mismatched > max_print:
+             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+         return False, mismatch_details
+
+     return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+ def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3):
+     """Check submission output against reference. Returns (passed: bool, msg: str)."""
+     import gc
+     output_mla, output_kv = submission_output
+
+     # Move submission output to CPU and free GPU memory before running ref kernel
+     output_mla_cpu = output_mla.cpu()
+     output_kv_cpu = output_kv.cpu()
+     del output_mla, output_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     config, x, kv_cache = data
+     with torch.no_grad():
+         expected_mla, expected_kv = ref_kernel((config, x, kv_cache))
+
+     # Move ref output to CPU and free GPU memory before comparison
+     expected_mla_cpu = expected_mla.cpu()
+     expected_kv_cpu = expected_kv.cpu()
+     del expected_mla, expected_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol)
+     good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol)
+
+     if not good_mla:
+         return False, "MLA output mismatch: " + " ".join(reasons_mla)
+     if not good_kv:
+         return False, "KV cache mismatch: " + " ".join(reasons_kv)
+
+     return True, "Match"
+
+
+ # ---------------------------------------------------------------------------
+ # Self-contained reference code for Modal remote execution
+ # ---------------------------------------------------------------------------
+
+ MODAL_REFERENCE_CODE = r'''
+ import math
+ from dataclasses import dataclass
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+
+ class RoPE(nn.Module):
+     def __init__(self, d_model: int):
+         super().__init__()
+         self.d_model = d_model
+         theta = 10000 ** (-torch.arange(0, d_model // 2, dtype=torch.bfloat16) / (d_model // 2))
+         self.register_buffer("theta", theta)
+
+     def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
+         x1, x2 = x.chunk(2, dim=-1)
+         return torch.cat((-x2, x1), dim=-1)
+
+     def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
+         seq_len = x.size(-2)
+         d_model = x.size(-1)
+         assert d_model == self.d_model
+         seq_idx = torch.arange(start_pos, start_pos + seq_len, device=x.device)
+         idx_theta = torch.einsum('s,d->sd', seq_idx, self.theta)
+         idx_theta2 = torch.cat([idx_theta, idx_theta], dim=-1)
+         cos = idx_theta2.cos().to(torch.bfloat16)
+         sin = idx_theta2.sin().to(torch.bfloat16)
+         return x * cos + self.rotate_half(x) * sin
+
+
+ class KVCache(nn.Module):
+     def __init__(self, kv_cache_shape: tuple, **kwargs) -> None:
+         super().__init__(**kwargs)
+         self.register_buffer('data', torch.zeros(kv_cache_shape, dtype=torch.bfloat16))
+         self.seq_len = 0
+         self.zero()
+
+     def zero(self) -> None:
+         self.data.zero_()
+
+     def get_data(self) -> torch.Tensor:
+         return self.data
+
+     def forward(self, c_kv: torch.Tensor) -> torch.Tensor:
+         assert self.seq_len + c_kv.size(1) <= self.data.size(1), "KV Cache Exceeded"
+         self.data = self.data.to(c_kv.dtype)
+         self.data[:, self.seq_len: self.seq_len + c_kv.size(1), :] = c_kv
+         self.seq_len += c_kv.size(1)
+         return self.data[:, :self.seq_len], self.seq_len
+
+
+ @dataclass
+ class Config:
+     batch_size: int
+     dim: int
+     n_heads: int
+     q_lora_rank: int
+     kv_lora_rank: int
+     qk_nope_head_dim: int
+     qk_rope_head_dim: int
+     v_head_dim: int
+     seq_len: int
+     max_seq_len: int
+     kv_cache_shape: tuple
+     Q_proj_down_weight: torch.Tensor
+     Q_proj_up_weight: torch.Tensor
+     KV_proj_down_weight: torch.Tensor
+     KV_proj_up_weight: torch.Tensor
+     wo_weight: torch.Tensor
+
+
+ class MLA(nn.Module):
+     def __init__(self, config: Config):
+         super().__init__()
+         self.dim = config.dim
+         self.n_heads = config.n_heads
+         self.q_lora_rank = config.q_lora_rank
+         self.kv_lora_rank = config.kv_lora_rank
+         self.nope_head_dim = config.qk_nope_head_dim
+         self.rope_head_dim = config.qk_rope_head_dim
+         self.v_head_dim = config.v_head_dim
+         self.Q_proj_down = nn.Linear(self.dim, self.q_lora_rank, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_down = nn.Linear(self.dim, self.kv_lora_rank + self.rope_head_dim, dtype=torch.bfloat16, bias=False)
+         self.Q_proj_up = nn.Linear(self.q_lora_rank, (self.nope_head_dim + self.rope_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.KV_proj_up = nn.Linear(self.kv_lora_rank, (self.nope_head_dim + self.v_head_dim) * self.n_heads, dtype=torch.bfloat16, bias=False)
+         self.q_rope = RoPE(self.rope_head_dim)
+         self.k_rope = RoPE(self.rope_head_dim)
+         self.wo = nn.Linear(self.v_head_dim * self.n_heads, self.dim, dtype=torch.bfloat16, bias=False)
+         self.eps = 1e-6
+
+     def forward(self, x: torch.Tensor, kv_cache: KVCache) -> torch.Tensor:
+         batch_size, seq_len, model_dim = x.size()
+         q_lora = self.Q_proj_down(x)
+         kv_lora = self.KV_proj_down(x)
+         kv_lora, kv_len = kv_cache(kv_lora)
+         query_pos = kv_len - 1
+         q_nope_and_rope = self.Q_proj_up(q_lora).view(
+             batch_size, seq_len, self.n_heads, self.nope_head_dim + self.rope_head_dim)
+         q_nope, q_rope = torch.split(q_nope_and_rope, [self.nope_head_dim, self.rope_head_dim], dim=-1)
+         kv_nope, k_rope = torch.split(kv_lora, [self.kv_lora_rank, self.rope_head_dim], dim=-1)
+         kv_nope = self.KV_proj_up(kv_nope).view(
+             batch_size, kv_len, self.n_heads, self.nope_head_dim + self.v_head_dim)
+         k_nope, v = torch.split(kv_nope, [self.nope_head_dim, self.v_head_dim], dim=-1)
+         q_rope = q_rope.permute(0, 2, 1, 3)
+         q_rope = self.q_rope(q_rope, start_pos=query_pos)
+         q_nope = q_nope.permute(0, 2, 1, 3)
+         q = torch.concat([q_nope, q_rope], dim=-1)
+         k_rope = k_rope[:, None, :, :]
+         k_rope = self.k_rope(k_rope).expand(-1, self.n_heads, -1, -1)
+         k_nope = k_nope.permute(0, 2, 1, 3)
+         k = torch.concat([k_nope, k_rope], dim=-1)
+         v = v.permute(0, 2, 1, 3)
+         scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.rope_head_dim + self.nope_head_dim)
+         attn = F.softmax(scores, dim=-1).to(torch.bfloat16)
+         y = torch.matmul(attn, v).view(batch_size, 1, -1)
+         y = self.wo(y)
+         return y, kv_cache.get_data()
+
+
+ def ref_kernel(data):
+     config, x, kv_cache = data
+     model = MLA(config).to('cuda')
+     model.Q_proj_down.weight = nn.Parameter(config.Q_proj_down_weight)
+     model.Q_proj_up.weight = nn.Parameter(config.Q_proj_up_weight)
+     model.KV_proj_down.weight = nn.Parameter(config.KV_proj_down_weight)
+     model.KV_proj_up.weight = nn.Parameter(config.KV_proj_up_weight)
+     model.wo.weight = nn.Parameter(config.wo_weight)
+     output, kv_data = model(x, kv_cache)
+     return output, kv_data
+
+
+ def generate_input(batchsize, dim, dq, prefill, seed):
+     gen = torch.Generator(device='cuda')
+     gen.manual_seed(seed)
+     Q_proj_down_weight = torch.randn((dq, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     KV_proj_down_weight = torch.randn((512 + 64, dim), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dim)
+     Q_proj_up_weight = torch.randn(((128 + 64) * 128, dq), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(dq)
+     KV_proj_up_weight = torch.randn(((128 + 128) * 128, 512), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(512)
+     wo_weight = torch.randn((dim, 128 * 128), dtype=torch.bfloat16, generator=gen, device='cuda') / math.sqrt(128 * 128)
+     config = Config(
+         batch_size=batchsize, dim=dim, q_lora_rank=dq, n_heads=128,
+         kv_lora_rank=512, qk_nope_head_dim=128, qk_rope_head_dim=64,
+         v_head_dim=128, seq_len=1, max_seq_len=8192,
+         kv_cache_shape=(batchsize, 8192, 512 + 64),
+         Q_proj_down_weight=Q_proj_down_weight, Q_proj_up_weight=Q_proj_up_weight,
+         KV_proj_down_weight=KV_proj_down_weight, KV_proj_up_weight=KV_proj_up_weight,
+         wo_weight=wo_weight,
+     )
+     x = torch.randn((config.batch_size, 1, config.dim), dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache = KVCache((config.batch_size, config.max_seq_len, config.kv_lora_rank + config.qk_rope_head_dim)).to('cuda')
+     pre_filled_cache = torch.randn(
+         (config.batch_size, prefill, config.kv_lora_rank + config.qk_rope_head_dim),
+         dtype=torch.bfloat16, generator=gen, device='cuda')
+     kv_cache(pre_filled_cache)
+     return config, x, kv_cache
+
+
+ @torch.no_grad()
+ def _verbose_allclose(received, expected, rtol=1e-05, atol=1e-08, max_print=5):
+     if received.shape != expected.shape:
+         return False, [f"SIZE MISMATCH. received shape: {received.shape}, expected shape: {expected.shape}"]
+     diff = torch.abs(received.to(torch.float32) - expected.to(torch.float32))
+     tolerance = atol + rtol * torch.abs(expected.to(torch.float32))
+     tol_mismatched = diff > tolerance
+     nan_mismatched = torch.logical_xor(torch.isnan(received), torch.isnan(expected))
+     posinf_mismatched = torch.logical_xor(torch.isposinf(received), torch.isposinf(expected))
+     neginf_mismatched = torch.logical_xor(torch.isneginf(received), torch.isneginf(expected))
+     mismatched = torch.logical_or(
+         torch.logical_or(tol_mismatched, nan_mismatched),
+         torch.logical_or(posinf_mismatched, neginf_mismatched),
+     )
+     mismatched_indices = torch.nonzero(mismatched)
+     num_mismatched = mismatched.count_nonzero().item()
+     if num_mismatched >= 1:
+         mismatch_details = [f"Number of mismatched elements: {num_mismatched}"]
+         for index in mismatched_indices[:max_print]:
+             i = tuple(index.tolist())
+             mismatch_details.append(f"ERROR at {i}: {received[i]} {expected[i]}")
+         if num_mismatched > max_print:
+             mismatch_details.append(f"... and {num_mismatched - max_print} more mismatched elements.")
+         return False, mismatch_details
+     return True, [f"Maximum error: {torch.max(diff)}"]
+
+
+ def check_implementation(data, submission_output, rtol=2e-2, atol=8e-3):
+     import gc
+     output_mla, output_kv = submission_output
+     # Move submission output to CPU and free GPU memory before running ref kernel
+     output_mla_cpu = output_mla.cpu()
+     output_kv_cpu = output_kv.cpu()
+     del output_mla, output_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+     config, x, kv_cache = data
+     with torch.no_grad():
+         expected_mla, expected_kv = ref_kernel((config, x, kv_cache))
+     # Move ref output to CPU and free GPU memory before comparison
+     expected_mla_cpu = expected_mla.cpu()
+     expected_kv_cpu = expected_kv.cpu()
+     del expected_mla, expected_kv
+     gc.collect()
+     torch.cuda.empty_cache()
+     good_mla, reasons_mla = _verbose_allclose(output_mla_cpu, expected_mla_cpu, rtol=rtol, atol=atol)
+     good_kv, reasons_kv = _verbose_allclose(output_kv_cpu, expected_kv_cpu, rtol=rtol, atol=atol)
+     if not good_mla:
+         return False, "MLA output mismatch: " + " ".join(reasons_mla)
+     if not good_kv:
+         return False, "KV cache mismatch: " + " ".join(reasons_kv)
+     return True, "Match"
+ '''
benchmarks/gpu_mode/mla_decode/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ triton
+ torch
benchmarks/gpu_mode/trimul/initial_program.py ADDED
@@ -0,0 +1,84 @@
+ # EVOLVE-BLOCK-START
+ """
+ Initial TriMul submission — PyTorch baseline with dummy Triton kernel.
+ """
+
+ import torch
+ from torch import nn, einsum
+ import triton
+ import triton.language as tl
+
+
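+ # No-op placeholder kernel; the baseline below runs entirely in PyTorch.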
+ @triton.jit
+ def _dummy_kernel(x_ptr, BLOCK_SIZE: tl.constexpr):
+     pid = tl.program_id(0)
+     pass
+
+
+ class TriMul(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         hidden_dim: int,
+     ):
+         super().__init__()
+
+         self.norm = nn.LayerNorm(dim)
+
+         self.left_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.right_proj = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+
+         self.left_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.right_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+         self.out_gate = nn.Linear(dim, hidden_dim, bias=False, dtype=torch.float32)
+
+         self.to_out_norm = nn.LayerNorm(hidden_dim)
+         self.to_out = nn.Linear(hidden_dim, dim, bias=False, dtype=torch.float32)
+
+     def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+         batch_size, seq_len, _, dim = x.shape
+
+         x = self.norm(x)
+         x = x.to(torch.float32)
+
+         left = self.left_proj(x.to(torch.float32))
+         right = self.right_proj(x.to(torch.float32))
+
+         mask = mask.unsqueeze(-1)
+         left = left * mask
+         right = right * mask
+
+         left_gate = self.left_gate(x.to(torch.float32)).sigmoid()
+         right_gate = self.right_gate(x.to(torch.float32)).sigmoid()
+         out_gate = self.out_gate(x.to(torch.float32)).sigmoid()
+
+         left = left * left_gate
+         right = right * right_gate
+
+         out = einsum('... i k d, ... j k d -> ... i j d', left.to(torch.bfloat16), right.to(torch.bfloat16))
+
+         out = out.to(torch.float32)
+         out = self.to_out_norm(out)
+         out = out * out_gate
+         return self.to_out(out)
+
+
+ def custom_kernel(data):
+     input_tensor, mask, weights, config = data
+     trimul = TriMul(config["dim"], config["hidden_dim"]).to(input_tensor.device)
+
+     trimul.norm.weight = nn.Parameter(weights['norm.weight'].to(torch.float32))
+     trimul.left_proj.weight = nn.Parameter(weights['left_proj.weight'].to(torch.float32))
+     trimul.right_proj.weight = nn.Parameter(weights['right_proj.weight'].to(torch.float32))
+     trimul.left_gate.weight = nn.Parameter(weights['left_gate.weight'].to(torch.float32))
+     trimul.right_gate.weight = nn.Parameter(weights['right_gate.weight'].to(torch.float32))
+     trimul.out_gate.weight = nn.Parameter(weights['out_gate.weight'].to(torch.float32))
+     trimul.to_out_norm.weight = nn.Parameter(weights['to_out_norm.weight'].to(torch.float32))
+     trimul.to_out.weight = nn.Parameter(weights['to_out.weight'].to(torch.float32))
+     trimul.norm.bias = nn.Parameter(weights['norm.bias'].to(torch.float32))
+     trimul.to_out_norm.bias = nn.Parameter(weights['to_out_norm.bias'].to(torch.float32))
+
+     output = trimul(input_tensor, mask).to(torch.float32)
+
+     return output
+ # EVOLVE-BLOCK-END
benchmarks/image_gen/README.md ADDED
@@ -0,0 +1,40 @@
+ # Image Generation Benchmark
+
+ This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via LLM, crossover via context images drawn from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives the actual parent and context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed.
+
+ ## Benchmark: Sky Festival
+
+ **Directory:** `sky_festival/`
+
+ The system must generate a floating sky-festival image whose details match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`.
+
+ **Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points. The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image.
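+
+ As a minimal sketch (mirroring `CATEGORY_MAXES` and the normalization in `sky_festival/evaluator.py`), the judged category points are clamped to their maxima, summed to a 0-100 total, and divided by 100 to give `combined_score`:
+
+ ```python
+ CATEGORY_MAXES = {"cloud_shapes": 15, "balloons": 20, "floating_island": 10,
+                   "table_items": 20, "characters": 15,
+                   "decorations_creatures": 10, "rainbow_lighting": 10}
+
+ def combined_score(judged: dict) -> float:
+     # Clamp each category to [0, max], sum, then normalize to [0, 1].
+     total = sum(max(0.0, min(float(judged.get(cat, 0)), mx))
+                 for cat, mx in CATEGORY_MAXES.items())
+     return total / 100.0
+ ```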
+
+ ## Setup
+
+ 1. **Set your API key:**
+
+    ```bash
+    export OPENAI_API_KEY=...
+    ```
+
+    Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API.
+
+ ## Run
+
+ ```bash
+ cd benchmarks/image_gen/sky_festival
+
+ # AdaEvolve
+ uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ # EvoX
+ uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric |
+ | `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` |
benchmarks/image_gen/sky_festival/evaluator.py ADDED
@@ -0,0 +1,220 @@
+ """
+ Sky Festival evaluator — GPT-5 LLM-as-a-judge.
+
+ Scores VLM-generated images against a 100-point rubric using GPT-5 vision.
+ Returns combined_score normalized to [0, 1].
+
+ The framework passes the image path via a sidecar file:
+     <program_path>.image_path -> absolute path to the generated image
+
+ Requirements:
+     pip install openai
+ Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5)
+ """
+
15
+ import base64
16
+ import json
17
+ import logging
18
+ import os
19
+ import re
20
+ from typing import Dict, Union
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5")
25
+
26
+ SYSTEM_PROMPT = """\
27
+ You are an extremely strict image evaluation judge. You score images against a precise rubric.
28
+ You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON.
29
+ Be harsh — most AI-generated images fail these criteria. Award points only when clearly met.
30
+ If you cannot verify a requirement (e.g., too small to see), award 0 for that item."""
31
+
32
+ RUBRIC_PROMPT = """\
33
+ Score this image against the following rubric for a "Floating Sky Festival" scene.
34
+ Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met.
35
+
36
+ ## Category 1: Cloud Counting and Shapes (15 pts)
37
+ - Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0)
38
+ - At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10)
39
+
40
+ ## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts)
41
+ - Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0)
42
+ - Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern)
43
+ - Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count)
44
+ - Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0)
45
+
46
+ ## Category 3: Floating Island and Trees (10 pts)
47
+ - Floating island visible suspended in air: 3 pts
48
+ - Exactly 4 different trees on the island: 4 pts (3 or 5 = 0)
49
+ - Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts
50
+
51
+ ## Category 4: Party Table Items — Counting and Arrangement (20 pts)
52
+ - 3-tier cake with candle present: 3 pts
53
+ - Cake text "100 YEARS" legible on middle tier: 3 pts
54
+ - Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts
55
+ - Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts
56
+ - Stack of exactly 8 golden plates: 3 pts
57
+ - Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts
58
+
59
+ ## Category 5: Characters — Count, Identity, and Details (15 pts)
60
+ - Exactly 6 characters seated at the table (3 per side): 5 pts
61
+ - Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends)
62
+ - Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail)
63
+
64
+ ## Category 6: Decorations and Flying Creatures (10 pts)
65
+ - Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts
66
+ - Exactly 7 paper lanterns in different colors: 3 pts
67
+ - Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures)
68
+
69
+ ## Category 7: Rainbow, Lighting, and Overall Composition (10 pts)
70
+ - Complete semicircular rainbow with 7 color bands in correct order: 4 pts
71
+ - Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts
72
+ - Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts
73
+
74
+ Respond with ONLY this JSON (no other text):
75
+ {
76
+ "cloud_shapes": <0-15>,
77
+ "balloons": <0-20>,
78
+ "floating_island": <0-10>,
79
+ "table_items": <0-20>,
80
+ "characters": <0-15>,
81
+ "decorations_creatures": <0-10>,
82
+ "rainbow_lighting": <0-10>,
83
+ "reasoning": "<brief 2-3 sentence explanation>"
84
+ }"""
85
+
86
+ # Category maximum scores for validation
87
+ CATEGORY_MAXES = {
88
+ "cloud_shapes": 15,
89
+ "balloons": 20,
90
+ "floating_island": 10,
91
+ "table_items": 20,
92
+ "characters": 15,
93
+ "decorations_creatures": 10,
94
+ "rainbow_lighting": 10,
95
+ }
96
+
97
+ _client = None
98
+
99
+
100
+ def _get_client():
101
+ global _client
102
+ if _client is None:
103
+ from openai import OpenAI
104
+ _client = OpenAI()
105
+ return _client
106
+
107
+
108
+ def _encode_image(image_path: str) -> str:
109
+ with open(image_path, "rb") as f:
110
+ return base64.b64encode(f.read()).decode("utf-8")
111
+
112
+
113
+ def _judge_image(image_path: str) -> Dict[str, Union[float, str]]:
114
+ """Call GPT-5 to score the image. Retries once on failure."""
115
+ client = _get_client()
116
+ b64 = _encode_image(image_path)
117
+
118
+ ext = os.path.splitext(image_path)[1].lstrip(".").lower()
119
+ mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png")
120
+ data_url = f"data:{mime};base64,{b64}"
121
+
122
+ messages = [
123
+ {"role": "system", "content": SYSTEM_PROMPT},
124
+ {
125
+ "role": "user",
126
+ "content": [
127
+ {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}},
128
+ {"type": "text", "text": RUBRIC_PROMPT},
129
+ ],
130
+ },
131
+ ]
132
+
133
+ last_error = None
134
+ for attempt in range(2):
135
+ try:
136
+ response = client.chat.completions.create(
137
+ model=JUDGE_MODEL,
138
+ messages=messages,
139
+ max_completion_tokens=16384,
140
+ )
141
+ content = response.choices[0].message.content or ""
142
+ raw = content.strip()
143
+ logger.info(f"Judge raw response (first 300 chars): {raw[:300]}")
144
+
145
+ # Extract JSON from markdown code block if present
146
+ if "```" in raw:
147
+ m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
148
+ if m:
149
+ raw = m.group(1).strip()
150
+
151
+ # Find JSON object in response
152
+ start = raw.find("{")
153
+ end = raw.rfind("}") + 1
154
+ if start >= 0 and end > start:
155
+ raw = raw[start:end]
156
+
157
+ result = json.loads(raw)
158
+
159
+ # Validate and clamp scores
160
+ scores = {}
161
+ for cat, max_val in CATEGORY_MAXES.items():
162
+ val = result.get(cat, 0)
163
+ if not isinstance(val, (int, float)):
164
+ val = 0
165
+ scores[cat] = max(0, min(max_val, float(val)))
166
+
167
+ scores["reasoning"] = str(result.get("reasoning", ""))
168
+ return scores
169
+
170
+ except Exception as e:
171
+ last_error = e
172
+ logger.warning(f"Judge attempt {attempt + 1} failed: {e}")
173
+
174
+ logger.error(f"GPT-5 judge failed after retries: {last_error}")
175
+ return {cat: 0.0 for cat in CATEGORY_MAXES}
176
+
177
+
178
+ def evaluate(program_path: str) -> Dict[str, Union[float, str]]:
179
+ """Score a VLM-generated image using GPT-5 as judge.
180
+
181
+ Args:
182
+ program_path: Path to the text file (VLM reasoning).
183
+ A sidecar file ``<program_path>.image_path`` contains the
184
+ absolute path to the generated image.
185
+
186
+ Returns:
187
+ Dictionary with combined_score (0-1), per-category scores, and image_path.
188
+ """
189
+ # Read image path from sidecar
190
+ sidecar = program_path + ".image_path"
191
+ image_path = None
192
+ if os.path.exists(sidecar):
193
+ with open(sidecar) as f:
194
+ image_path = f.read().strip()
195
+
196
+ if not image_path or not os.path.exists(image_path):
197
+ logger.warning("No image found for scoring")
198
+ return {"combined_score": 0.0, "error": "No image to score"}
199
+
200
+ # Score with GPT-5
201
+ scores = _judge_image(image_path)
202
+
203
+ # Compute total out of 100, normalize to 0-1
204
+ total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES)
205
+ combined = round(total / 100.0, 4)
206
+
207
+ result = {"combined_score": combined, "image_path": image_path}
208
+
209
+ # Add per-category scores (normalized to 0-1 for each category)
210
+ for cat, max_val in CATEGORY_MAXES.items():
211
+ result[cat] = round(scores.get(cat, 0) / max_val, 4)
212
+
213
+ # Also store raw scores
214
+ result["raw_total"] = round(total, 1)
215
+
216
+ reasoning = scores.get("reasoning", "")
217
+ if reasoning:
218
+ result["judge_reasoning"] = reasoning
219
+
220
+ return result
benchmarks/math/README.md ADDED
@@ -0,0 +1,43 @@
+ # Math Benchmarks
+
+ Mathematical optimization and algorithm evolution problems.
+
+ ## Problems
+
+ ### Signal processing & geometry (from SkyDiscover demos)
+
+ - [signal_processing](signal_processing/) — Real-time adaptive filtering for non-stationary time series
+ - [circle_packing](circle_packing/) — Pack 26 circles in a unit square to maximize sum of radii (AlphaEvolve B.12)
+
+ ### AlphaEvolve mathematical problems
+
+ 12 problems from [AlphaEvolve Appendices A and B](https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/alphaevolve-a-gemini-powered-coding-agent-for-designing-advanced-algorithms/AlphaEvolve.pdf). All evaluators are normalized to **maximize** the target metric.
+
+ **Appendix A:**
+ - [matmul](matmul/) — Faster algorithm for matrix multiplication (A)
+
+ **Appendix B:**
+ 1. [first_autocorr_ineq](first_autocorr_ineq/) — Upper bound on autoconvolution constant (B.1)
+ 2. [second_autocorr_ineq](second_autocorr_ineq/) — Lower bound on autoconvolution norm constant (B.2)
+ 3. [third_autocorr_ineq](third_autocorr_ineq/) — Upper bound on absolute autoconvolution constant (B.3)
+ 4. [uncertainty_ineq](uncertainty_ineq/) — Upper bound on Fourier uncertainty constant (B.4)
+ 5. [erdos_min_overlap](erdos_min_overlap/) — Upper bound on Erdos minimum overlap constant (B.5)
+ 6. [sums_diffs_finite_sets](sums_diffs_finite_sets/) — Lower bound on sums/differences of finite sets (B.6)
+ 7. [hexagon_packing](hexagon_packing/) — Pack unit hexagons in a regular hexagon, n=11,12 (B.7)
+ 8. [minimizing_max_min_dist](minimizing_max_min_dist/) — Minimize max/min distance ratio, n=16 d=2 and n=14 d=3 (B.8)
+ 9. [heilbronn_triangle](heilbronn_triangle/) — Heilbronn problem for triangles, n=11 (B.9)
+ 10. [heilbronn_convex](heilbronn_convex/) — Heilbronn problem for convex regions, n=13,14 (B.10)
+ 11. [circle_packing_rect](circle_packing_rect/) — Pack circles in a rectangle of perimeter 4 (B.13)
+
+ ## Run
+
+ ```bash
+ uv run skydiscover-run \
+   benchmarks/math/signal_processing/initial_program.py \
+   benchmarks/math/signal_processing/evaluator.py \
+   -c benchmarks/math/signal_processing/config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ Each problem directory contains `initial_program.py`, `evaluator.py`, and either `config.yaml` or per-search configs. Some multi-variant problems have numbered subdirectories (e.g., `heilbronn_convex/13/`, `hexagon_packing/11/`).
benchmarks/math/circle_packing/README.md ADDED
@@ -0,0 +1,38 @@
+ # Circle Packing
+
+ Pack 26 non-overlapping circles in a unit square to maximize the sum of their radii (AlphaEvolve B.12). Target: 2.635.
+
+ ## Problem
+
+ - Pack exactly 26 circles inside a unit square
+ - No circles may overlap
+ - Each circle must lie entirely within the square
+ - Maximize the sum of all radii (see the feasibility sketch after this list)
+
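+ A minimal sketch of the feasibility check (the evaluator's `validate_packing` enforces the same constraints with small numerical tolerances):
+
+ ```python
+ import numpy as np
+
+ def is_valid(centers: np.ndarray, radii: np.ndarray, tol: float = 1e-6) -> bool:
+     # Containment: each circle fits inside the unit square.
+     inside = ((centers - radii[:, None] >= -tol).all()
+               and (centers + radii[:, None] <= 1 + tol).all())
+     # Non-overlap: pairwise center distance >= sum of radii.
+     d = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=-1)
+     need = radii[:, None] + radii[None, :]
+     overlap = (d < need - tol) & ~np.eye(len(radii), dtype=bool)
+     return inside and not overlap.any()
+ ```
+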
+ ## Run
+
+ ```bash
+ # From repo root
+ uv run skydiscover-run \
+   benchmarks/math/circle_packing/initial_program.py \
+   benchmarks/math/circle_packing/evaluator.py \
+   -c benchmarks/math/circle_packing/config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ A `codebase/reference/` directory is provided with geometric insights (hex grids, optimization patterns, packing strategies) that can be used with agentic mode (`--agentic`).
+
+ ## Scoring
+
+ - **combined_score**: `sum_of_radii / 2.635` (ratio to the AlphaEvolve target)
+ - Evaluator validates no overlaps and boundary constraints
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.py` | Seed: simple ring-based circle arrangement |
+ | `evaluator.py` | Validates constraints, computes sum-of-radii ratio to target |
+ | `config.yaml` | LLM and evaluator settings |
+ | `codebase/reference/` | Geometric reference material for agentic mode |
benchmarks/math/circle_packing/codebase/reference/hex_grid.py ADDED
@@ -0,0 +1,43 @@
+ """
+ Hexagonal grid initialization for circle packing.
+
+ A hexagonal (offset) grid provides a good starting arrangement
+ because it's the densest regular packing pattern. Alternate rows
+ are offset by half the spacing, which reduces wasted space.
+ """
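+ # Usage sketch: hexagonal_grid(26) returns a (26, 2) array of (x, y) centers in [0.1, 0.9]^2.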
+
+ import numpy as np
+
+
+ def hexagonal_grid(n, margin=0.1):
+     """
+     Generate n points on a hexagonal grid inside [margin, 1-margin]^2.
+
+     Args:
+         n: number of points to generate
+         margin: distance from edges to keep clear
+
+     Returns:
+         np.array of shape (n, 2) with (x, y) coordinates
+     """
+     usable = 1.0 - 2 * margin
+     cols = int(np.ceil(np.sqrt(n * 2 / np.sqrt(3))))
+     rows = int(np.ceil(n / cols))
+
+     dx = usable / max(cols - 1, 1)
+     dy = usable / max(rows - 1, 1)
+
+     points = []
+     for row in range(rows):
+         for col in range(cols):
+             if len(points) >= n:
+                 break
+             x = margin + col * dx
+             if row % 2 == 1:
+                 x += dx / 2  # offset for hex pattern
+             y = margin + row * dy
+             x = np.clip(x, margin, 1 - margin)
+             y = np.clip(y, margin, 1 - margin)
+             points.append([x, y])
+
+     return np.array(points[:n])
benchmarks/math/circle_packing/codebase/reference/optimization_patterns.py ADDED
@@ -0,0 +1,94 @@
+ """
+ Common patterns for constrained geometric optimization using scipy.
+
+ This module shows how to use scipy.optimize.minimize with inequality
+ constraints and the SLSQP solver — useful for any problem where you
+ need to maximize/minimize an objective subject to geometric constraints.
+ """
+
+ import numpy as np
+ from scipy.optimize import minimize
+
+
+ def example_constrained_optimization():
+     """
+     Template: pack n objects by optimizing positions + sizes jointly.
+
+     Decision vector: x = [pos_0, pos_1, ..., pos_{n-1}, size_0, ..., size_{n-1}]
+     Objective: maximize sum(sizes) => minimize -sum(sizes)
+     Constraints: non-overlap + boundary containment (all >= 0)
+     """
+     n = 10  # number of objects
+
+     # --- Objective: negative sum of sizes (we minimize, so negate to maximize) ---
+     def objective(x):
+         sizes = x[2 * n:]
+         return -np.sum(sizes)
+
+     # --- Constraints as a single function returning array of values >= 0 ---
+     def constraints_fn(x):
+         positions = x[:2 * n].reshape(n, 2)
+         sizes = x[2 * n:]
+
+         c = []
+         # Pairwise non-overlap: dist(i,j) - size_i - size_j >= 0
+         for i in range(n):
+             for j in range(i + 1, n):
+                 dist = np.linalg.norm(positions[i] - positions[j])
+                 c.append(dist - sizes[i] - sizes[j])
+
+         # Boundary: each object stays inside [0, 1] x [0, 1]
+         for i in range(n):
+             c.append(positions[i, 0] - sizes[i])      # left
+             c.append(1 - positions[i, 0] - sizes[i])  # right
+             c.append(positions[i, 1] - sizes[i])      # bottom
+             c.append(1 - positions[i, 1] - sizes[i])  # top
+
+         return np.array(c)
+
+     # --- Initial guess ---
+     x0_pos = np.random.rand(n, 2) * 0.6 + 0.2  # avoid edges
+     x0_sizes = np.full(n, 0.05)
+     x0 = np.concatenate([x0_pos.flatten(), x0_sizes])
+
+     # --- Bounds ---
+     pos_bounds = [(0, 1)] * (2 * n)
+     size_bounds = [(0.01, 0.25)] * n
+     bounds = pos_bounds + size_bounds
+
+     # --- Solve ---
+     result = minimize(
+         objective,
+         x0,
+         method="SLSQP",
+         bounds=bounds,
+         constraints={"type": "ineq", "fun": constraints_fn},
+         options={"maxiter": 1000, "ftol": 1e-9},
+     )
+
+     opt_positions = result.x[:2 * n].reshape(n, 2)
+     opt_sizes = result.x[2 * n:]
+     return opt_positions, opt_sizes, -result.fun  # return positive sum
+
+
+ def multi_start_optimization(objective, constraint_fn, bounds, n_starts=5):
+     """
+     Run SLSQP from multiple random starts and keep the best.
+
+     This helps escape local optima — the solver is gradient-based
+     and sensitive to the initial guess.
+     """
+     best_result = None
+     for _ in range(n_starts):
+         x0 = np.array([np.random.uniform(lo, hi) for lo, hi in bounds])
+         result = minimize(
+             objective,
+             x0,
+             method="SLSQP",
+             bounds=bounds,
+             constraints={"type": "ineq", "fun": constraint_fn},
+             options={"maxiter": 500, "ftol": 1e-8},
+         )
+         if best_result is None or result.fun < best_result.fun:
+             best_result = result
+     # Return after all starts have run, keeping the lowest objective value.
+     return best_result
benchmarks/math/circle_packing/codebase/reference/packing_strategies.md ADDED
@@ -0,0 +1,45 @@
+ # Circle Packing Strategies for n=26 in a Unit Square
+
+ ## Key Insight
+ Naive geometric placement (rings, grids) gives sum_radii ~ 1.0.
+ Using numerical optimization (scipy.optimize) with proper constraint formulation
+ can push sum_radii above 2.5.
+
+ ## Why Optimization Works Better Than Manual Placement
+
+ Manual placement fixes circle positions, then computes maximum radii.
+ This leaves gaps because positions aren't optimized for the radii they produce.
+
+ **Joint optimization** treats both positions (x,y for each circle) AND radii
+ as decision variables, optimizing them simultaneously. This is the key insight.
+
+ Decision vector: [x0, y0, x1, y1, ..., x25, y25, r0, r1, ..., r25]
+ Total variables: 26*2 + 26 = 78
+
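+ A minimal sketch of packing/unpacking this decision vector (helper names are illustrative):
+
+ ```python
+ import numpy as np
+
+ N = 26
+
+ def pack(centers, radii):
+     # centers: (26, 2), radii: (26,) -> flat 78-vector [x0, y0, x1, y1, ..., r0, ...]
+     return np.concatenate([centers.ravel(), radii])
+
+ def unpack(x):
+     centers = x[:2 * N].reshape(N, 2)
+     radii = x[2 * N:]
+     return centers, radii
+ ```
+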
+ ## Constraint Formulation
+
+ 1. **Non-overlap**: For every pair (i,j): distance(center_i, center_j) >= r_i + r_j
+ 2. **Boundary**: For every circle i: x_i - r_i >= 0, x_i + r_i <= 1, y_i - r_i >= 0, y_i + r_i <= 1
+ 3. **Positive radii**: r_i > 0 for all i (use bounds, not constraints)
+
+ ## Recommended Solver
+
+ scipy.optimize.minimize with method="SLSQP":
+ - Handles inequality constraints natively
+ - Works with bounds on variables
+ - Good for smooth, continuous problems like circle packing
+ - Sensitive to initial guess — use multiple starts or a good heuristic (minimal call sketch below)
+
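+ A minimal call sketch under these recommendations (`objective`, `constraints_fn`, `x0`, and `bounds` as constructed in `optimization_patterns.py`):
+
+ ```python
+ from scipy.optimize import minimize
+
+ result = minimize(
+     objective,                 # returns -sum(radii)
+     x0,                        # 78-dim initial guess (e.g., hex grid + small radii)
+     method="SLSQP",
+     bounds=bounds,             # positions in [0, 1], radii in (0.01, 0.2)
+     constraints={"type": "ineq", "fun": constraints_fn},  # all values must be >= 0
+     options={"maxiter": 1000, "ftol": 1e-9},
+ )
+ ```
+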
+ ## Initial Guess Strategy
+
+ A hexagonal grid initial guess works well:
+ - Place circles on offset rows (hex pattern)
+ - Start with equal small radii (e.g., 0.05)
+ - Let the optimizer adjust both positions and radii
+
+ ## Performance Tips
+
+ - Set maxiter=1000 or higher for 26 circles
+ - Use ftol=1e-8 or smaller for precise solutions
+ - Radii bounds: (0.01, 0.2) is a reasonable range for n=26
+ - The objective is -sum(radii) (minimize negative to maximize)
benchmarks/math/circle_packing/config.yaml ADDED
@@ -0,0 +1,54 @@
+ # Math benchmark: circle_packing
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 16384
+   timeout: 600
+ prompt:
+   system_message: 'You are an expert mathematician specializing in circle packing problems and computational geometry. Your
+     task is to improve a constructor function that directly produces a specific arrangement of 26 circles in a unit square,
+     maximizing the sum of their radii. The AlphaEvolve paper achieved a sum of 2.635 for n=26.
+
+
+     Key geometric insights:
+
+     - Circle packings often follow hexagonal patterns in the densest regions
+
+     - Maximum density for infinite circle packing is pi/(2*sqrt(3)) ≈ 0.9069
+
+     - Edge effects make square container packing harder than infinite packing
+
+     - Circles can be placed in layers or shells when confined to a square
+
+     - Similar radius circles often form regular patterns, while varied radii allow better space utilization
+
+     - Perfect symmetry may not yield the optimal packing due to edge effects
+
+
+     Focus on designing an explicit constructor that places each circle in a specific position, rather than an iterative search
+     algorithm.
+
+     '
+ evaluator:
+   timeout: 360
+   cascade_evaluation: true
+   cascade_thresholds:
+     - 0.3
+     - 0.6
+
+ # Live monitor dashboard
+ monitor:
+   enabled: true
+   port: 8765
+   host: "127.0.0.1"
+
+ # Human feedback
+ human_feedback_enabled: true
benchmarks/math/circle_packing/evaluator.py ADDED
@@ -0,0 +1,338 @@
+ """
+ Evaluator for circle packing example (n=26) with improved timeout handling
+ """
+
+ import numpy as np
+ import time
+ import os
+ import subprocess
+ import tempfile
+ import traceback
+ import sys
+ import pickle
+
+
+ class TimeoutError(Exception):
+     """Raised when the evaluated program exceeds its time budget (intentionally shadows the builtin)."""
+     pass
+
+
+ def timeout_handler(signum, frame):
+     """Handle timeout signal (legacy signal-based path; unused by the subprocess approach below)"""
+     raise TimeoutError("Function execution timed out")
+
+
+ def validate_packing(centers, radii):
+     """
+     Validate that circles don't overlap and are inside the unit square
+
+     Args:
+         centers: np.array of shape (n, 2) with (x, y) coordinates
+         radii: np.array of shape (n) with radius of each circle
+
+     Returns:
+         True if valid, False otherwise
+     """
+     n = centers.shape[0]
+
+     # Check for NaN values
+     if np.isnan(centers).any():
+         print("NaN values detected in circle centers")
+         return False
+
+     if np.isnan(radii).any():
+         print("NaN values detected in circle radii")
+         return False
+
+     # Check if radii are nonnegative and not nan
+     for i in range(n):
+         if radii[i] < 0:
+             print(f"Circle {i} has negative radius {radii[i]}")
+             return False
+         elif np.isnan(radii[i]):
+             print(f"Circle {i} has nan radius")
+             return False
+
+     # Check if circles are inside the unit square
+     for i in range(n):
+         x, y = centers[i]
+         r = radii[i]
+         if x - r < -1e-6 or x + r > 1 + 1e-6 or y - r < -1e-6 or y + r > 1 + 1e-6:
+             print(f"Circle {i} at ({x}, {y}) with radius {r} is outside the unit square")
+             return False
+
+     # Check for overlaps
+     for i in range(n):
+         for j in range(i + 1, n):
+             dist = np.sqrt(np.sum((centers[i] - centers[j]) ** 2))
+             if dist < radii[i] + radii[j] - 1e-6:  # Allow for tiny numerical errors
+                 print(f"Circles {i} and {j} overlap: dist={dist}, r1+r2={radii[i]+radii[j]}")
+                 return False
+
+     return True
+
+
+ def run_with_timeout(program_path, timeout_seconds=20):
+     """
+     Run the program in a separate process with timeout
+     using a simple subprocess approach
+
+     Args:
+         program_path: Path to the program file
+         timeout_seconds: Maximum execution time in seconds
+
+     Returns:
+         centers, radii, sum_radii tuple from the program
+     """
+     # Create a temporary file to execute
+     with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as temp_file:
+         # Write a script that executes the program and saves results
+         script = f"""
+ import sys
+ import numpy as np
+ import os
+ import pickle
+ import traceback
+
+ # Add the directory to sys.path
+ sys.path.insert(0, os.path.dirname('{program_path}'))
+
+ # Debugging info
+ print(f"Running in subprocess, Python version: {{sys.version}}")
+ print(f"Program path: {program_path}")
+
+ try:
+     # Import the program
+     spec = __import__('importlib.util').util.spec_from_file_location("program", '{program_path}')
+     program = __import__('importlib.util').util.module_from_spec(spec)
+     spec.loader.exec_module(program)
+
+     # Run the packing function
+     print("Calling run_packing()...")
+     centers, radii, sum_radii = program.run_packing()
+     print(f"run_packing() returned successfully: sum_radii = {{sum_radii}}")
+
+     # Save results to a file
+     results = {{
+         'centers': centers,
+         'radii': radii,
+         'sum_radii': sum_radii
+     }}
+
+     with open('{temp_file.name}.results', 'wb') as f:
+         pickle.dump(results, f)
+     print(f"Results saved to {temp_file.name}.results")
+
+ except Exception as e:
+     # If an error occurs, save the error instead
+     print(f"Error in subprocess: {{str(e)}}")
+     traceback.print_exc()
+     with open('{temp_file.name}.results', 'wb') as f:
+         pickle.dump({{'error': str(e)}}, f)
+     print(f"Error saved to {temp_file.name}.results")
+ """
+         temp_file.write(script.encode())
+         temp_file_path = temp_file.name
+
+     results_path = f"{temp_file_path}.results"
+
+     try:
+         # Run the script with timeout
+         process = subprocess.Popen(
+             [sys.executable, temp_file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+         )
+
+         try:
+             stdout, stderr = process.communicate(timeout=timeout_seconds)
+             exit_code = process.returncode
+
+             # Always print output for debugging purposes
+             print(f"Subprocess stdout: {stdout.decode()}")
+             if stderr:
+                 print(f"Subprocess stderr: {stderr.decode()}")
+
+             # Still raise an error for non-zero exit codes, but only after printing the output
+             if exit_code != 0:
+                 raise RuntimeError(f"Process exited with code {exit_code}")
+
+             # Load the results
+             if os.path.exists(results_path):
+                 with open(results_path, "rb") as f:
+                     results = pickle.load(f)
+
+                 # Check if an error was returned
+                 if "error" in results:
+                     raise RuntimeError(f"Program execution failed: {results['error']}")
+
+                 return results["centers"], results["radii"], results["sum_radii"]
+             else:
+                 raise RuntimeError("Results file not found")
+
+         except subprocess.TimeoutExpired:
+             # Kill the process if it times out
+             process.kill()
+             process.wait()
+             raise TimeoutError(f"Process timed out after {timeout_seconds} seconds")
+
+     finally:
+         # Clean up temporary files
+         if os.path.exists(temp_file_path):
+             os.unlink(temp_file_path)
+         if os.path.exists(results_path):
+             os.unlink(results_path)
+
+
+ def evaluate(program_path):
+     """
+     Evaluate the program by running it once and checking the sum of radii
+
+     Args:
+         program_path: Path to the program file
+
+     Returns:
+         Dictionary of metrics
+     """
+     # Target value from the paper
+     TARGET_VALUE = 2.635  # AlphaEvolve result for n=26
+
+     try:
+         # For constructor-based approaches, a single evaluation is sufficient
+         # since the result is deterministic
+         start_time = time.time()
+
+         # Use subprocess to run with timeout
+         centers, radii, reported_sum = run_with_timeout(
+             program_path, timeout_seconds=600  # Single timeout
+         )
+
+         end_time = time.time()
+         eval_time = end_time - start_time
+
+         # Ensure centers and radii are numpy arrays
+         if not isinstance(centers, np.ndarray):
+             centers = np.array(centers)
+         if not isinstance(radii, np.ndarray):
+             radii = np.array(radii)
+
+         # Check for NaN values before validation
+         if np.isnan(centers).any() or np.isnan(radii).any():
+             print("NaN values detected in solution")
+             return {
+                 "sum_radii": 0.0,
+                 "target_ratio": 0.0,
+                 "validity": 0.0,
+                 "eval_time": float(time.time() - start_time),
+                 "combined_score": 0.0,
+             }
+
+         # Validate solution
+         valid = validate_packing(centers, radii)
+
+         # Check shape and size
+         shape_valid = centers.shape == (26, 2) and radii.shape == (26,)
+         if not shape_valid:
+             print(
+                 f"Invalid shapes: centers={centers.shape}, radii={radii.shape}, expected (26, 2) and (26,)"
+             )
+             valid = False
+
+         # Calculate sum
+         sum_radii = np.sum(radii) if valid else 0.0
+
+         # Make sure reported_sum matches the calculated sum
+         if abs(sum_radii - reported_sum) > 1e-6:
+             print(f"Warning: Reported sum {reported_sum} doesn't match calculated sum {sum_radii}")
+
+         # Target ratio (how close we are to the target)
+         target_ratio = sum_radii / TARGET_VALUE if valid else 0.0
+
+         # Validity score
+         validity = 1.0 if valid else 0.0
+
+         # Combined score - higher is better
+         combined_score = target_ratio * validity
+
+         print(
+             f"Evaluation: valid={valid}, sum_radii={sum_radii:.6f}, target={TARGET_VALUE}, ratio={target_ratio:.6f}, time={eval_time:.2f}s"
+         )
+
+         return {
+             "sum_radii": float(sum_radii),
+             "target_ratio": float(target_ratio),
+             "validity": float(validity),
+             "eval_time": float(eval_time),
+             "combined_score": float(combined_score),
+         }
+
+     except Exception as e:
+         print(f"Evaluation failed completely: {str(e)}")
+         traceback.print_exc()
+         return {
+             "sum_radii": 0.0,
+             "target_ratio": 0.0,
+             "validity": 0.0,
+             "eval_time": 0.0,
+             "combined_score": 0.0,
+         }
+
+
+ # Stage-based evaluation for cascade evaluation
+ def evaluate_stage1(program_path):
+     """
+     First stage evaluation - quick validation check
+     """
+     try:
+         # Use the simplified subprocess approach
+         try:
+             centers, radii, sum_radii = run_with_timeout(program_path, timeout_seconds=600)
+
+             # Ensure centers and radii are numpy arrays
+             if not isinstance(centers, np.ndarray):
+                 centers = np.array(centers)
+             if not isinstance(radii, np.ndarray):
+                 radii = np.array(radii)
+
+             # Validate solution (shapes and constraints)
+             shape_valid = centers.shape == (26, 2) and radii.shape == (26,)
+             if not shape_valid:
+                 print(f"Invalid shapes: centers={centers.shape}, radii={radii.shape}")
+                 return {"validity": 0.0, "error": "Invalid shapes"}
+
+             valid = validate_packing(centers, radii)
+
+             # Calculate sum
+             actual_sum = np.sum(radii) if valid else 0.0
+
+             # Target from paper
+             target = 2.635
+
+             # Simple combined score for stage 1
+             combined_score = (actual_sum / target) if valid else 0.0
+
+             # Return evaluation metrics
+             return {
+                 "validity": 1.0 if valid else 0.0,
+                 "sum_radii": float(actual_sum),
+                 "target_ratio": float(actual_sum / target if valid else 0.0),
+                 "combined_score": float(combined_score),
+             }
+
+         except TimeoutError as e:
+             print(f"Stage 1 evaluation timed out: {e}")
+             return {"validity": 0.0, "combined_score": 0.0, "error": "Timeout"}
+         except Exception as e:
+             print(f"Stage 1 evaluation failed: {e}")
+             print(traceback.format_exc())
+             return {"validity": 0.0, "combined_score": 0.0, "error": str(e)}
+
+     except Exception as e:
+         print(f"Stage 1 evaluation failed completely: {e}")
+         print(traceback.format_exc())
+         return {"validity": 0.0, "combined_score": 0.0, "error": str(e)}
+
+
+ def evaluate_stage2(program_path):
+     """
+     Second stage evaluation - full evaluation
+     """
+     # Full evaluation as in the main evaluate function
+     return evaluate(program_path)
benchmarks/math/circle_packing/evaluator/Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY evaluator.py .
+ COPY evaluate.sh .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/circle_packing/evaluator/evaluate.sh ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) is accepted but ignored — pure optimization has no data split.
+
+ echo "[$(date '+%H:%M:%S')] eval start: $PROGRAM" >> /tmp/eval.log
+ python /benchmark/evaluator.py "$PROGRAM"