JustinTX committed
Commit 730e01e · verified · 1 parent: bc9b4d5

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. benchmarks/ADRS/eplb/README.md +63 -0
  2. benchmarks/ADRS/eplb/initial_program.py +238 -0
  3. benchmarks/ADRS/llm_sql/evaluator/utils.py +81 -0
  4. benchmarks/ADRS/prism/config.yaml +24 -0
  5. benchmarks/ADRS/prism/evaluator/Dockerfile +13 -0
  6. benchmarks/ADRS/prism/evaluator/evaluator.py +259 -0
  7. benchmarks/ADRS/prism/evaluator/requirements.txt +1 -0
  8. benchmarks/ADRS/prism/evaluator/wrapper.py +98 -0
  9. benchmarks/arc_benchmark/config.yaml +51 -0
  10. benchmarks/arc_benchmark/convert_arc_agi2_data.py +63 -0
  11. benchmarks/arc_benchmark/evaluator/Dockerfile +13 -0
  12. benchmarks/arc_benchmark/evaluator/evaluate.sh +7 -0
  13. benchmarks/arc_benchmark/evaluator/evaluator.py +407 -0
  14. benchmarks/arc_benchmark/evaluator/requirements.txt +1 -0
  15. benchmarks/arc_benchmark/evaluator/wrapper.py +98 -0
  16. benchmarks/arc_benchmark/generate_config.py +101 -0
  17. benchmarks/arc_benchmark/initial_program.py +42 -0
  18. benchmarks/arc_benchmark/post_discovery_eval.py +157 -0
  19. benchmarks/frontier-cs-eval/README.md +72 -0
  20. benchmarks/frontier-cs-eval/analyze_results.py +105 -0
  21. benchmarks/frontier-cs-eval/combine_results.py +66 -0
  22. benchmarks/frontier-cs-eval/config.yaml +57 -0
  23. benchmarks/frontier-cs-eval/evaluator.py +174 -0
  24. benchmarks/frontier-cs-eval/initial_program.cpp +6 -0
  25. benchmarks/frontier-cs-eval/run_all_frontiercs.py +70 -0
  26. benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py +404 -0
  27. benchmarks/image_gen/README.md +40 -0
  28. benchmarks/image_gen/sky_festival/config.yaml +103 -0
  29. benchmarks/image_gen/sky_festival/evaluator.py +220 -0
  30. benchmarks/math/circle_packing_rect/evaluator/evaluator.py +119 -0
  31. benchmarks/math/erdos_min_overlap/config.yaml +41 -0
  32. benchmarks/math/erdos_min_overlap/evaluator/Dockerfile +13 -0
  33. benchmarks/math/erdos_min_overlap/evaluator/requirements.txt +3 -0
  34. benchmarks/math/erdos_min_overlap/initial_program.py +96 -0
  35. benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh +7 -0
  36. benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py +98 -0
  37. benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh +7 -0
  38. benchmarks/math/matmul/evaluator/Dockerfile +13 -0
  39. benchmarks/math/matmul/evaluator/evaluate.sh +7 -0
  40. benchmarks/math/matmul/evaluator/evaluator.py +115 -0
  41. benchmarks/math/matmul/evaluator/requirements.txt +3 -0
  42. benchmarks/math/matmul/evaluator/wrapper.py +98 -0
  43. benchmarks/math/matmul/initial_program.py +199 -0
  44. benchmarks/math/minimizing_max_min_dist/2/config.yaml +29 -0
  45. benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile +13 -0
  46. benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh +7 -0
  47. benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py +78 -0
  48. benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt +2 -0
  49. benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py +98 -0
  50. benchmarks/math/minimizing_max_min_dist/2/initial_program.py +24 -0
benchmarks/ADRS/eplb/README.md ADDED
@@ -0,0 +1,63 @@
+ # Expert Parallelism Load Balancer (EPLB)
+
+ This benchmark uses SkyDiscover to optimize the Expert Parallelism Load Balancer (EPLB) algorithm for Mixture-of-Experts (MoE) models. The goal is to rearrange and replicate experts across GPUs to balance load, while keeping the rearrangement algorithm itself fast.
+
+ ## Setup
+
+ 1. **Install PyTorch** (required by the evaluator):
+
+ ```bash
+ uv pip install torch
+ ```
+
+ 2. **Download the workload file** from [Hugging Face](https://huggingface.co/datasets/abmfy/eplb-openevolve) into this directory:
+
+ ```bash
+ cd benchmarks/ADRS/eplb
+ wget https://huggingface.co/datasets/abmfy/eplb-openevolve/resolve/main/expert-load.json
+ ```
+
+ 3. **Set your API key:**
+
+ ```bash
+ export OPENAI_API_KEY=...
+ ```
+
+ ## Run
+
+ From the repo root:
+
+ ```bash
+ uv run skydiscover-run \
+   benchmarks/ADRS/eplb/initial_program.py \
+   benchmarks/ADRS/eplb/evaluator.py \
+   -c benchmarks/ADRS/eplb/config.yaml \
+   -s [your_algorithm] \
+   -i 100 \
+   -o eplb_output
+ ```
+
+ Or from this directory:
+
+ ```bash
+ uv run skydiscover-run initial_program.py evaluator.py \
+   -c config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ ## Evaluate a saved program
+
+ ```bash
+ python evaluate_best_program.py
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.py` | Baseline `rebalance_experts` function to evolve |
+ | `evaluator.py` | Scores programs on load-balance quality and execution speed |
+ | `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) |
+ | `evaluate_best_program.py` | Standalone script to evaluate a saved best program |
+ | `expert-load.json` | Workload data (must be downloaded — see Setup) |
benchmarks/ADRS/eplb/initial_program.py ADDED
@@ -0,0 +1,238 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """
+ Expert parallelism load balancer (EPLB) for vLLM.
+
+ This module implements the core rearrangement algorithm.
+
+ The rearrangement algorithm is adapted from
+ [DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
+
+ Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
+ of how the EPLB algorithm works.
+ """
+
+ # EVOLVE-BLOCK-START
+
+ import torch
+
+
+ def balanced_packing(weight: torch.Tensor,
+                      num_packs: int) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Pack n weighted objects to m packs, such that each pack contains exactly
+     n/m objects and the weights of all packs are as balanced as possible.
+
+     Parameters:
+         weight: [X, n], the weight of each item
+         num_packs: number of packs
+
+     Returns:
+         pack_index: [X, n], the pack index of each item
+         rank_in_pack: [X, n], the rank of the item in the pack
+     """
+     num_layers, num_groups = weight.shape
+     assert num_groups % num_packs == 0
+     groups_per_pack = num_groups // num_packs
+
+     if groups_per_pack == 1:
+         pack_index = torch.arange(weight.size(-1),
+                                   dtype=torch.int64,
+                                   device=weight.device).expand(weight.shape)
+         rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+         return pack_index, rank_in_pack
+
+     indices = weight.float().sort(-1, descending=True).indices.cpu()
+     pack_index = torch.full_like(weight,
+                                  fill_value=-1,
+                                  dtype=torch.int64,
+                                  device="cpu")
+     rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+     for i in range(num_layers):
+         pack_weights = [0] * num_packs
+         pack_items = [0] * num_packs
+         for group in indices[i]:
+             # Greedy: among packs that still have room, pick the lightest one.
+             pack = min(
+                 (i
+                  for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                 key=pack_weights.__getitem__,
+             )
+             assert pack_items[pack] < groups_per_pack
+             pack_index[i, group] = pack
+             rank_in_pack[i, group] = pack_items[pack]
+             pack_weights[pack] += weight[i, group]
+             pack_items[pack] += 1
+     return pack_index, rank_in_pack
+
+
+ def replicate_experts(
+     weight: torch.Tensor,
+     num_phy: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Replicate `num_log` experts to `num_phy` replicas, such that the maximum
+     load of all replicas is minimized.
+
+     Parameters:
+         weight: [X, num_log]
+         num_phy: total number of experts after replication
+
+     Returns:
+         phy2log: [X, num_phy], logical expert id of each physical expert
+         rank: [X, num_phy], the replica rank
+         logcnt: [X, num_log], number of replicas for each logical expert
+     """
+     n, num_log = weight.shape
+     num_redundant = num_phy - num_log
+     assert num_redundant >= 0
+     device = weight.device
+     phy2log = torch.arange(num_phy, dtype=torch.int64,
+                            device=device).repeat(n, 1)
+     rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+     logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+     arangen = torch.arange(n, dtype=torch.int64, device=device)
+     for i in range(num_log, num_phy):
+         # Each extra slot goes to the expert with the highest per-replica load.
+         redundant_indices = (weight / logcnt).max(dim=-1).indices
+         phy2log[:, i] = redundant_indices
+         rank[:, i] = logcnt[arangen, redundant_indices]
+         logcnt[arangen, redundant_indices] += 1
+     return phy2log, rank, logcnt
+
+
+ def rebalance_experts_hierarchical(
+     weight: torch.Tensor,
+     num_physical_experts: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+ ):
+     """
+     Parameters:
+         weight: [num_moe_layers, num_logical_experts]
+         num_physical_experts: number of physical experts after replication
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network
+             (e.g., NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [num_moe_layers, num_physical_experts]
+         logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+         logical_count: [num_moe_layers, num_logical_experts]
+     """
+     num_layers, num_logical_experts = weight.shape
+     assert num_logical_experts % num_groups == 0
+     group_size = num_logical_experts // num_groups
+     assert num_groups % num_nodes == 0
+     groups_per_node = num_groups // num_nodes
+     assert num_gpus % num_nodes == 0
+     assert num_physical_experts % num_gpus == 0
+     phy_experts_per_gpu = num_physical_experts // num_gpus
+
+     def inverse(perm: torch.Tensor) -> torch.Tensor:
+         inv = torch.empty_like(perm)
+         inv.scatter_(
+             1,
+             perm,
+             torch.arange(perm.size(1), dtype=torch.int64,
+                          device=perm.device).expand(perm.shape),
+         )
+         return inv
+
+     # Step 1: pack groups to nodes
+     tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+     group_pack_index, group_rank_in_pack = balanced_packing(
+         tokens_per_group, num_nodes)
+     log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) *
+                  group_size).unsqueeze(-1) +
+                 torch.arange(group_size,
+                              dtype=torch.int64,
+                              device=group_pack_index.device)).flatten(-2)
+     mlog2log = inverse(log2mlog)
+
+     # Step 2: construct redundant experts within nodes
+     # [num_layers * num_nodes, num_logical_experts // num_nodes]
+     tokens_per_mlog = weight.gather(-1, mlog2log).view(
+         -1, num_logical_experts // num_nodes)
+     phy2mlog, phyrank, mlogcnt = replicate_experts(
+         tokens_per_mlog, num_physical_experts // num_nodes)
+
+     # Step 3: pack physical_experts to GPUs
+     # [num_layers * num_nodes, num_physical_experts // num_nodes]
+     tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+     pack_index, rank_in_pack = balanced_packing(tokens_per_phy,
+                                                 num_gpus // num_nodes)
+     phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+     pphy2phy = inverse(phy2pphy)
+
+     pphy2mlog = phy2mlog.gather(
+         -1, pphy2phy)  # [num_layers * num_nodes, num_log_per_nodes]
+     pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange(
+         0,
+         num_logical_experts,
+         num_logical_experts // num_nodes,
+         device=group_pack_index.device,
+     ).view(1, -1, 1)).flatten(-2)
+     pphy2log = mlog2log.gather(-1, pphy2mlog)
+     pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+     logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+     return pphy2log, pphyrank, logcnt
+
+
+ def rebalance_experts(
+     weight: torch.Tensor,
+     num_replicas: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Entry point for the expert-parallelism load balancer.
+
+     Parameters:
+         weight: [layers, num_logical_experts], the load statistics for all
+             logical experts
+         num_replicas: number of physical experts, must be a multiple of
+             `num_gpus`
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network
+             (e.g., NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [layers, num_replicas], the expert index of
+             each replica
+         logical_to_physical_map: [layers, num_logical_experts, X], the replica
+             indices for each expert
+         expert_count: [layers, num_logical_experts], number of physical
+             replicas for each logical expert
+     """
+     num_layers, num_logical_experts = weight.shape
+     weight = weight.float().cpu()
+     if num_groups % num_nodes == 0:
+         # use the hierarchical load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, num_groups, num_nodes, num_gpus)
+     else:
+         # use the global load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, 1, 1, num_gpus)
+     num_redundant_experts = num_replicas - num_logical_experts
+     maxlogcnt = num_redundant_experts + 1
+     log2phy: torch.Tensor = torch.full(
+         (num_layers, num_logical_experts, maxlogcnt),
+         -1,
+         dtype=torch.int64,
+         device=logcnt.device,
+     )
+     log2phy.view(num_layers, -1).scatter_(
+         -1,
+         phy2log * maxlogcnt + phyrank,
+         torch.arange(num_replicas, dtype=torch.int64,
+                      device=log2phy.device).expand(num_layers, -1),
+     )
+     return phy2log, log2phy, logcnt
+
+
+ # EVOLVE-BLOCK-END
+
+ __all__ = ["rebalance_experts"]
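
> Editor's note: for orientation, a minimal sketch of how `rebalance_experts` can be driven standalone. This is not part of the commit; the load values are illustrative and the arguments are chosen to satisfy the divisibility asserts above.

```python
import torch

from initial_program import rebalance_experts  # run from this directory

# Illustrative load statistics: 2 MoE layers, 8 logical experts each.
weight = torch.randint(1, 100, (2, 8))

# Replicate 8 logical experts into 12 physical slots on 1 node with 4 GPUs,
# using 4 expert groups (8 % 4 == 0 and 12 % 4 == 0, as asserted).
phy2log, log2phy, logcnt = rebalance_experts(
    weight, num_replicas=12, num_groups=4, num_nodes=1, num_gpus=4)

print(phy2log.shape)   # torch.Size([2, 12]) — logical expert id per physical slot
print(logcnt.sum(-1))  # tensor([12, 12]) — every physical slot is assigned
```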
benchmarks/ADRS/llm_sql/evaluator/utils.py ADDED
@@ -0,0 +1,81 @@
+ from concurrent.futures import ThreadPoolExecutor
+ import pandas as pd
+ from typing import Tuple
+
+ class TrieNode:
+     def __init__(self):
+         self.children = {}
+         self.end_of_word = False
+
+
+ class Trie:
+     def __init__(self):
+         self.root = TrieNode()
+
+     def insert(self, word):
+         node = self.root
+         for char in word:
+             if char not in node.children:
+                 node.children[char] = TrieNode()
+             node = node.children[char]
+         node.end_of_word = True
+
+     def longest_common_prefix(self, word):
+         node = self.root
+         common_prefix_length = 0
+         for char in word:
+             if char in node.children:
+                 common_prefix_length += 1  # each trie edge holds one character
+                 node = node.children[char]
+             else:
+                 break
+         return common_prefix_length
+
+ def calculate_length(value):
+     val = 0
+     if isinstance(value, bool):
+         val = 4  # length of 'True' or 'False'
+     elif isinstance(value, (int, float)):
+         val = len(str(value))
+     elif isinstance(value, str):
+         val = len(value)
+     else:
+         val = 0
+     return val**2
+
+ def evaluate_df_prefix_hit_cnt(df: pd.DataFrame) -> Tuple[int, float]:
+     """
+     Evaluate the prefix hit count of a DataFrame
+     """
+
+     def max_overlap(trie, row_string):
+         return min(len(row_string), trie.longest_common_prefix(row_string))
+
+     trie = Trie()
+     total_prefix_hit_count = 0
+     total_string_length = 0
+
+     def process_row(index, row):
+         nonlocal total_string_length
+         row_string = "".join(row.fillna("").astype(str).values)  # No spaces between columns
+         total_string_length += len(row_string)
+         row_prefix_hit_count = max_overlap(trie, row_string)
+         trie.insert(row_string)
+         return row_prefix_hit_count
+
+     # The trie is shared mutable state and each row's hit count depends on the
+     # rows inserted before it, so rows must be processed sequentially.
+     with ThreadPoolExecutor(max_workers=1) as executor:
+         results = executor.map(process_row, df.index, [row for _, row in df.iterrows()])
+
+     total_prefix_hit_count = sum(results)
+     total_prefix_hit_rate = total_prefix_hit_count / total_string_length
+     assert total_prefix_hit_count <= total_string_length
+     print(f"Total string length: {total_string_length}")
+     no_cache_pricing = 2.5 / 5  # per 1M tokens if not cached
+     cache_pricing = 1.25 / 5  # per 1M tokens if cached
+     cached_tokens_pricing = total_prefix_hit_count * cache_pricing / 1e6
+     non_cached_tokens_pricing = (total_string_length - total_prefix_hit_count) * no_cache_pricing / 1e6
+     print(
+         f"Cached tokens pricing = {round(cached_tokens_pricing, 2)}, Non-cached tokens pricing = {round(non_cached_tokens_pricing, 2)}, total pricing = {round(cached_tokens_pricing + non_cached_tokens_pricing, 2)}"
+     )
+     return total_prefix_hit_count, total_prefix_hit_rate * 100
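
> Editor's note: a small, self-contained illustration of the prefix-hit metric (not part of the commit; the DataFrame values are made up). Run from the directory containing `utils.py`:

```python
import pandas as pd

from utils import evaluate_df_prefix_hit_cnt

# Two rows sharing a long common prefix; the second row's prefix should hit
# in the trie built from the first row.
df = pd.DataFrame({
    "name": ["alice", "alice"],
    "dept": ["engineering", "engineering"],
    "note": ["likes rust", "likes go"],
})

hits, hit_rate_pct = evaluate_df_prefix_hit_cnt(df)
# Row strings are "aliceengineeringlikes rust" (26 chars) and
# "aliceengineeringlikes go" (24 chars); their shared prefix is 22 chars,
# so hits == 22 and hit_rate_pct == 44.0 with the sequential executor above.
print(hits, f"{hit_rate_pct:.1f}%")
```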
benchmarks/ADRS/prism/config.yaml ADDED
@@ -0,0 +1,24 @@
+ # Prism (GPU Model Placement) optimization
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 5
+ max_solution_length: 60000
+
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+
+ prompt:
+   system_message: |-
+     You are an expert in model placement on GPUs. Your task is to improve a model placement algorithm by improving the function named compute_model_placement in the initial program, which places models on the available GPUs.
+     The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring the models fit into the GPUs' memory. Note that KVPR is the KV cache pressure of a GPU; it indicates how crowded the GPU is. For a specific GPU, KVPR is computed as sum(model.req_rate/model.slo for model in models) / (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are the models placed on that GPU. The generated program should be as simple as possible, and the code must execute correctly without errors.
+
+ evaluator:
+   timeout: 360
+
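
> Editor's note: a concrete reading of the KVPR formula in the system prompt, for an 80 GB GPU hosting two hypothetical models (values illustrative, matching the evaluator's `GPU_MEM_SIZE`):

```python
GPU_MEM_SIZE = 80  # GB, as in evaluator.py

# Two hypothetical models on one GPU: (model_size GB, req_rate, slo)
models = [(26, 8, 5), (20, 4, 10)]

pressure = sum(req / slo for _, req, slo in models)           # 8/5 + 4/10 = 2.0
free_mem = GPU_MEM_SIZE - sum(size for size, _, _ in models)  # 80 - 46 = 34
kvpr = pressure / free_mem                                    # 2.0 / 34 ≈ 0.0588
print(round(kvpr, 4))
```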
benchmarks/ADRS/prism/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict, bridging them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/ADRS/prism/evaluator/evaluator.py ADDED
@@ -0,0 +1,259 @@
+ import importlib.util
+ import numpy as np
+ import time
+ import concurrent.futures
+ import traceback
+ from dataclasses import dataclass
+
+ GPU_MEM_SIZE = 80  # GB
+ MIN_INT = float('-inf')  # Define MIN_INT as negative infinity
+
+ @dataclass
+ class Model:
+     model_name: str
+     model_size: int
+     req_rate: int
+     slo: int
+     cur_gpu_id: int
+
+
+ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30):
+     """
+     Run a function with a timeout using concurrent.futures.
+     Note: the worker thread is not forcibly killed on timeout; we only stop
+     waiting for its result.
+     """
+     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+         future = executor.submit(func, *args, **kwargs)
+         try:
+             result = future.result(timeout=timeout_seconds)
+             return result
+         except concurrent.futures.TimeoutError:
+             raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
+
+
+ def safe_float(value):
+     """Convert a value to float safely"""
+     try:
+         if np.isnan(value) or np.isinf(value):
+             return 0.0
+         return float(value)
+     except (TypeError, ValueError):
+         return 0.0
+
+ def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool:
+     """
+     Verify whether the placed models fit into GPU memory
+     """
+     # Check if the placement data is valid
+     if placement_data is None:
+         return False
+
+     # Check per-GPU memory usage
+     for gpu_id, models in placement_data.items():
+         if sum(model.model_size for model in models) > GPU_MEM_SIZE:
+             return False
+
+     return True
+
+
+ def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float:
+     """
+     Calculate the maximum KV cache pressure (KVPR) across GPUs
+     """
+     max_kvpr = MIN_INT
+     for gpu_id, models in placement_data.items():
+         total_model_size = sum(model.model_size for model in models)
+         total_weighted_req_rate = sum(model.req_rate / model.slo for model in models)
+         if GPU_MEM_SIZE - total_model_size > 0:
+             kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size)
+         else:
+             kvpr = 1000000
+         max_kvpr = max(max_kvpr, kvpr)
+
+     return max_kvpr
+
+
+ def generate_test_gpu_models(num_tests=50):
+     """
+     Generate multiple test cases with different characteristics
+     """
+     test_cases = []
+     np.random.seed(42)
+
+     for i in range(num_tests):
+         gpu_num = np.random.randint(5, 10)
+         gpu_models = []
+         for j in range(gpu_num * 2):
+             model_size = np.random.randint(10, 30)
+             req_rate = np.random.randint(1, 10)
+             slo = np.random.randint(5, 10)
+             gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j))
+
+         test_cases.append((gpu_num, gpu_models))
+
+     return test_cases
+
+ def evaluate(program_path):
+     """
+     Main evaluation function that tests the placement algorithm
+     on multiple test cases and calculates the composite performance metric.
+     """
+     try:
+         # Load the program
+         spec = importlib.util.spec_from_file_location("program", program_path)
+         program = importlib.util.module_from_spec(spec)
+         spec.loader.exec_module(program)
+
+         # Check if required function exists
+         if not hasattr(program, "compute_model_placement"):
+             return {
+                 "max_kvpr": 0.0,
+                 "success_rate": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing compute_model_placement function",
+             }
+
+         # Generate test GPUs and models
+         test_gpu_models = generate_test_gpu_models()
+
+         # Collect metrics across all tests
+         all_kvpr = []
+         all_metrics = []
+         successful_runs = 0
+
+         for i, (gpu_num, gpu_models) in enumerate(test_gpu_models):
+             try:
+                 # Run the algorithm with timeout
+                 start_time = time.time()
+
+                 # Call the program's main function
+                 result = run_with_timeout(
+                     program.compute_model_placement,
+                     kwargs={
+                         'gpu_num': gpu_num,
+                         'models': gpu_models
+                     },
+                     timeout_seconds=10
+                 )
+
+                 execution_time = time.time() - start_time
+
+                 # Validate result format
+                 if not isinstance(result, dict):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": f"Placement {i}: Expected dict, got {type(result).__name__}",
+                     }
+
+                 # Validate all models are placed
+                 placed_models = []
+                 for gpu_id, assigned_models in result.items():
+                     if not isinstance(assigned_models, list):
+                         return {
+                             "max_kvpr": 0.0,
+                             "success_rate": 0.0,
+                             "combined_score": 0.0,
+                             "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}",
+                         }
+                     placed_models.extend(assigned_models)
+
+                 if len(placed_models) != len(gpu_models):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}",
+                     }
+
+                 # Check for duplicate placements (by object identity)
+                 placed_ids = [id(m) for m in placed_models]
+                 if len(set(placed_ids)) != len(placed_ids):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "Duplicate models detected",
+                     }
+
+                 # Check placed models are the exact input objects
+                 original_ids = {id(m) for m in gpu_models}
+                 if set(placed_ids) != original_ids:
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "Placed models don't match input models (missing or foreign models)",
+                     }
+
+                 # Verify GPU memory constraints
+                 if not verify_gpu_mem_constraint(result):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "GPU memory constraint violated",
+                     }
+
+                 # Calculate metrics for this test case
+                 max_kvpr = calculate_kvcache_pressure(result)
+
+                 # Store metrics
+                 metrics = {
+                     'max_kvpr': safe_float(max_kvpr),
+                     'execution_time': safe_float(execution_time),
+                 }
+
+                 all_kvpr.append(safe_float(max_kvpr))
+                 all_metrics.append(metrics)
+                 successful_runs += 1
+
+             except TimeoutError:
+                 print(f"Placement {i}: Timeout")
+                 continue
+             except Exception as e:
+                 print(f"Placement {i}: Error - {str(e)}")
+                 continue
+
+         # If no successful runs, return minimal scores
+         if successful_runs == 0:
+             return {
+                 "max_kvpr": 0.0,
+                 "success_rate": 0.0,
+                 "combined_score": 0.0,
+                 "error": "All test cases failed"
+             }
+
+         print(all_metrics)
+         # Calculate aggregate metrics
+         avg_kvpr = np.mean(all_kvpr)
+         if avg_kvpr != 0:
+             # Invert so that a lower average pressure yields a higher score
+             avg_kvpr = 1.0 / avg_kvpr
+         avg_execution_time = np.mean([m['execution_time'] for m in all_metrics])
+         success_rate = successful_runs / len(test_gpu_models)
+
+         return {
+             "max_kvpr": safe_float(avg_kvpr),
+             "execution_time": safe_float(avg_execution_time),
+             "success_rate": safe_float(success_rate),
+             "combined_score": safe_float(avg_kvpr) + safe_float(success_rate),
+         }
+
+     except Exception as e:
+         print(f"Evaluation failed: {str(e)}")
+         print(traceback.format_exc())
+         return {
+             "max_kvpr": 0.0,
+             "success_rate": 0.0,
+             "combined_score": 0.0,
+             "error": str(e)
+         }
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is auto-injected at build time from
+     # skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
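
> Editor's note: for reference, a minimal greedy `compute_model_placement` that satisfies this evaluator's contract. A sketch only, not the tuned baseline shipped as `initial_program.py`; it assumes the `Model` dataclass above:

```python
def compute_model_placement(gpu_num, models):
    """Greedy sketch: place each model (largest first) on the GPU whose
    resulting KVPR would be lowest, subject to the 80 GB memory cap."""
    GPU_MEM_SIZE = 80
    placement = {gpu_id: [] for gpu_id in range(gpu_num)}

    def kvpr(ms):
        free = GPU_MEM_SIZE - sum(m.model_size for m in ms)
        if free <= 0:
            return float("inf")
        return sum(m.req_rate / m.slo for m in ms) / free

    for model in sorted(models, key=lambda m: m.model_size, reverse=True):
        # Candidates: GPUs where the model still fits in memory.
        best = min(
            (g for g in placement
             if sum(m.model_size for m in placement[g]) + model.model_size <= GPU_MEM_SIZE),
            key=lambda g: kvpr(placement[g] + [model]),
            default=None,
        )
        if best is None:  # nothing fits; fall back (evaluator will reject this case)
            best = min(placement, key=lambda g: sum(m.model_size for m in placement[g]))
        placement[best].append(model)
    return placement
```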
benchmarks/ADRS/prism/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
+ numpy
benchmarks/ADRS/prism/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
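
> Editor's note: to make the container protocol concrete, here is how a hypothetical old-style evaluator bridged through `run` would behave (the metric and artifact names are illustrative, not from the commit):

```python
from wrapper import run

def evaluate(program_path):
    # Old-style result: numbers become metrics, strings become artifacts.
    return {"combined_score": 0.75, "success_rate": 1.0, "notes": "all cases passed"}

if __name__ == "__main__":
    run(evaluate)
    # Emits one JSON line on stdout, e.g.:
    # {"status": "success", "combined_score": 0.75,
    #  "metrics": {"combined_score": 0.75, "success_rate": 1.0},
    #  "artifacts": {"notes": "all cases passed"}}
```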
benchmarks/arc_benchmark/config.yaml ADDED
@@ -0,0 +1,51 @@
+ # ARC Benchmark base config
+ # This file is used by generate_config.py to inject a task-specific prompt.
+ # Switch models by editing the 'llm' section below.
+
+ # General settings
+ max_iterations: 30
+ checkpoint_interval: 10
+ log_level: "INFO"
+ random_seed: 42
+ diff_based_generation: true
+ max_solution_length: 50000
+
+ # LLM configuration (Option A: GPT-5, default)
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   api_base: "https://api.openai.com/v1"
+   temperature: 0.7
+   # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
+   max_tokens: 32768
+   timeout: 3000
+
+ # Option B: Gemini 3 Pro (comment out Option A and uncomment below)
+ # llm:
+ #   models:
+ #     - name: "gemini-3-pro-preview"
+ #       weight: 1.0
+ #   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+ #   temperature: 0.7
+ #   top_p: 0.95
+ #   max_tokens: 32768
+ #   timeout: 3000
+
+ # Search configuration (default: top-k)
+ search:
+   type: "topk"
+   database:
+     random_seed: 42
+     num_context_programs: 4
+
+ # Prompt configuration
+ # NOTE: generate_config.py overwrites prompt.system_message per task.
+ prompt:
+   system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG"
+
+ # Evaluator configuration
+ evaluator:
+   timeout: 360
+   max_retries: 3
+   cascade_evaluation: false
benchmarks/arc_benchmark/convert_arc_agi2_data.py ADDED
@@ -0,0 +1,63 @@
+ #!/usr/bin/env python3
+ """
+ Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json)
+ into the format expected by this benchmark:
+   - arc-agi_{split}_challenges.json (task_id -> { train, test with inputs only })
+   - arc-agi_{split}_solutions.json  (task_id -> list of test output grids)
+
+ Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation):
+     OUT_DIR=./data python3 convert_arc_agi2_data.py .
+
+ Or with an external ARC-AGI-2 clone:
+     python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2
+     # Writes into that path by default; set OUT_DIR to write elsewhere.
+ """
+ import json
+ import os
+ import sys
+
+
+ def convert_split(repo_root: str, split: str, out_dir: str) -> None:
+     """Convert data/{split}/*.json into challenges + solutions JSON."""
+     split_dir = os.path.join(repo_root, "data", split)
+     if not os.path.isdir(split_dir):
+         print(f"Skip {split}: no directory {split_dir}")
+         return
+
+     challenges = {}
+     solutions = {}
+
+     for name in sorted(os.listdir(split_dir)):
+         if not name.endswith(".json"):
+             continue
+         task_id = name[:-5]  # strip .json
+         path = os.path.join(split_dir, name)
+         with open(path, "r") as f:
+             task = json.load(f)
+         # Challenge: train as-is; test with only "input" (no output)
+         challenges[task_id] = {
+             "train": task["train"],
+             "test": [{"input": p["input"]} for p in task["test"]],
+         }
+         # Solutions: list of test output grids
+         solutions[task_id] = [p["output"] for p in task["test"]]
+
+     challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json")
+     solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json")
+     with open(challenges_path, "w") as f:
+         json.dump(challenges, f)
+     with open(solutions_path, "w") as f:
+         json.dump(solutions, f)
+     print(f"Wrote {challenges_path} ({len(challenges)} tasks)")
+     print(f"Wrote {solutions_path} ({len(solutions)} tasks)")
+
+
+ def main():
+     repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")
+     out_dir = os.getenv("OUT_DIR", repo_root)
+     for split in ("training", "evaluation"):
+         convert_split(repo_root, split, out_dir)
+
+
+ if __name__ == "__main__":
+     main()
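
> Editor's note: the conversion is purely structural. For a toy task (grids made up for illustration) it looks like this:

```python
# A raw ARC-AGI-2 task file (e.g. data/training/toy.json):
task = {
    "train": [{"input": [[1, 0]], "output": [[0, 1]]}],
    "test": [{"input": [[2, 0]], "output": [[0, 2]]}],
}

# After convert_split(), the two output files contain:
challenge = {"train": task["train"],
             "test": [{"input": [[2, 0]]}]}  # test outputs stripped
solution = [[[0, 2]]]                        # list of test output grids
```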
benchmarks/arc_benchmark/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/arc_benchmark/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/arc_benchmark/evaluator/evaluator.py ADDED
@@ -0,0 +1,407 @@
+ import numpy as np
+ from typing import List, Tuple, Dict, Any
+ import json
+ import os
+ import importlib.util
+
+ try:
+     from skydiscover.evaluation.evaluation_result import EvaluationResult
+ except ImportError:
+     from dataclasses import dataclass, field
+     from typing import Union
+
+     @dataclass
+     class EvaluationResult:
+         metrics: Dict[str, float]
+         artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)
+
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+ TASK_NUM = os.getenv("TASK_NUM", "0")
+ DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
+ INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes")
+ USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes")
+
+
+ def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float:
+     """
+     Compute continuous cell-level accuracy between prediction and ground truth.
+     Returns a float in [0, 1]. Handles shape mismatches gracefully.
+     """
+     if pred.shape != gt.shape:
+         # Partial credit for getting the shape partially right
+         shape_score = 0.0
+         if len(pred.shape) == len(gt.shape) == 2:
+             row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0
+             col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0
+             shape_score = (row_match + col_match) * 0.1  # up to 0.2 for correct dimensions
+         return shape_score
+     # Cell-level accuracy
+     total_cells = gt.size
+     if total_cells == 0:
+         return 1.0
+     correct_cells = int(np.sum(pred == gt))
+     return correct_cells / total_cells
+
+
+ def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float:
+     """Return the best cell accuracy across all attempts for one example."""
+     return max(cell_accuracy_single(a, gt) for a in attempts)
+
+
+ def pass_at_2_accuracy_single(
+     attempts: List[np.ndarray],
+     gt: np.ndarray
+ ) -> Tuple[int, Dict[int, Any]]:
+     """
+     Compute pass@2 accuracy for a single ARC test case.
+
+     Args:
+         attempts: List of 2 numpy arrays representing model attempts.
+         gt: Ground-truth output as a 2D numpy array.
+
+     Returns:
+         pass_at_2: int (1 if any attempt is perfectly correct, else 0)
+         diagnostics: dict mapping attempt index -> diagnostic info.
+             If sizes match, includes indices of incorrect cells.
+     """
+     assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation."
+
+     diagnostics = {}
+     passed = False
+
+     for i, pred in enumerate(attempts):
+         attempt_info = {}
+
+         # Size check
+         if pred.shape != gt.shape:
+             attempt_info["size_match"] = False
+             attempt_info["pred_shape"] = list(pred.shape)
+             attempt_info["gt_shape"] = list(gt.shape)
+             attempt_info["incorrect_indices"] = None
+             attempt_info["cell_accuracy"] = 0.0
+             attempt_passed = False
+         else:
+             attempt_info["size_match"] = True
+
+             # Find incorrect cells
+             incorrect_mask = pred != gt
+             incorrect_indices = np.argwhere(incorrect_mask)
+
+             attempt_info["incorrect_indices"] = incorrect_indices.tolist()
+             attempt_info["num_incorrect"] = int(incorrect_mask.sum())
+             attempt_info["num_total"] = int(gt.size)
+             attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size
+
+             # Perfect match
+             if incorrect_mask.sum() == 0:
+                 attempt_passed = True
+             else:
+                 attempt_passed = False
+
+         attempt_info["perfect_match"] = attempt_passed
+         passed = attempt_passed or passed
+
+         diagnostics[i] = attempt_info
+
+     pass_at_2 = 1 if passed else 0
+
+     return pass_at_2, diagnostics
+
+ def pass_at_2_accuracy_multi_test(
+     all_attempts: List[List[np.ndarray]],
+     all_gt: List[np.ndarray]
+ ) -> Tuple[List[int], List[Dict[int, Any]]]:
+     """
+     Compute pass@2 accuracy across multiple ARC test cases.
+
+     Args:
+         all_attempts: List of lists of 2 numpy arrays for each test case.
+         all_gt: List of ground-truth outputs as 2D numpy arrays.
+     """
+     assert len(all_attempts) == len(all_gt), "Mismatched number of test cases."
+
+     all_diagnostics = []
+     all_pass = []
+
+     for attempts, gt in zip(all_attempts, all_gt):
+         pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt)
+         all_pass.append(pass_at_2)
+         all_diagnostics.append(diagnostics)
+
+     return all_pass, all_diagnostics
+
+ def extract_failure_artifacts(diagnostics, pred=None, gt=None):
+     """
+     Extract failure artifacts from diagnostics for a given example.
+     Includes actual vs expected output snippets for better LLM feedback.
+     """
+     artifacts = {}
+     if not diagnostics["size_match"]:
+         artifacts["error_type"] = "SizeMismatch"
+         artifacts["error_message"] = (
+             f"Output shape {diagnostics['pred_shape']} does not match "
+             f"expected shape {diagnostics['gt_shape']}."
+         )
+         artifacts["suggestion"] = (
+             f"Your output has shape {diagnostics['pred_shape']} but the correct output "
+             f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions."
+         )
+     else:
+         num_incorrect = diagnostics['num_incorrect']
+         num_total = diagnostics['num_total']
+         accuracy = diagnostics['cell_accuracy']
+         artifacts["error_type"] = "IncorrectCells"
+         artifacts["error_message"] = (
+             f"{num_incorrect}/{num_total} cells incorrect "
+             f"(cell accuracy: {accuracy:.1%})."
+         )
+         # Show a compact diff of expected vs actual for the first few wrong cells
+         if diagnostics['incorrect_indices'] and pred is not None and gt is not None:
+             wrong = diagnostics['incorrect_indices'][:8]  # first 8 wrong cells
+             diff_lines = []
+             for r, c in wrong:
+                 diff_lines.append(f"  [{r},{c}]: got {int(pred[r, c])}, expected {int(gt[r, c])}")
+             artifacts["cell_diffs"] = "\n".join(diff_lines)
+             if len(diagnostics['incorrect_indices']) > 8:
+                 artifacts["cell_diffs"] += f"\n  ... and {len(diagnostics['incorrect_indices']) - 8} more"
+         artifacts["suggestion"] = (
+             f"Your solution gets {accuracy:.1%} of cells correct. "
+             f"Review the transformation logic for the failing cells."
+         )
+
+     return artifacts
+
+ def evaluate(program_path):
+     """
+     Evaluate the program on ARC task training (and optionally test) examples.
+
+     Returns a combined_score that blends:
+       - pass@2 (binary perfect-match, weighted 0.6)
+       - cell accuracy (continuous partial credit, weighted 0.4)
+     This gives evolution a gradient signal even when no example is solved perfectly.
+     """
+     spec = importlib.util.spec_from_file_location("program_module", program_path)
+     program_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(program_module)
+
+     if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+         print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+         error_artifacts = {
+             "error_type": "MissingFunction",
+             "error_message": "Stage 1: Program is missing the required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+             "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take a 2D numpy array as an argument and return a 2D numpy array."
+         }
+
+         return EvaluationResult(
+             metrics={
+                 "runs_successfully": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+             },
+             artifacts=error_artifacts
+         )
+
+     # Load ARC tasks
+     challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json")
+
+     with open(challenge_path, 'r') as f:
+         tasks = json.load(f)
+
+     task_id = list(tasks.keys())[int(TASK_NUM)]
+     task = tasks[task_id]
+
+     train_inputs = [np.array(inp["input"]) for inp in task['train']]
+     train_gts = [np.array(gt["output"]) for gt in task['train']]
+
+     train_attempts = []
+
+     # Generate attempts for training data
+     for inp in train_inputs:
+         attempt_1 = program_module.transform_grid_attempt_1(inp)
+         if not isinstance(attempt_1, np.ndarray):
+             print("transform_grid_attempt_1 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
+             }
+
+             return EvaluationResult(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_1 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+
+         attempt_2 = program_module.transform_grid_attempt_2(inp)
+         if not isinstance(attempt_2, np.ndarray):
+             print("transform_grid_attempt_2 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
+             }
+
+             return EvaluationResult(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_2 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+         train_attempts.append([attempt_1, attempt_2])
+
+     pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts)
+
+     # Compute both binary pass@2 and continuous cell accuracy
+     train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train)
+     train_cell_acc = sum(
+         best_attempt_cell_accuracy(attempts, gt)
+         for attempts, gt in zip(train_attempts, train_gts)
+     ) / len(train_gts)
+
+     # Blended score: pass@2 (60%) + cell accuracy (40%) gives a gradient signal
+     train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc
+
+     metrics = {
+         "runs_successfully": 1.0,
+         "combined_score": train_score,
+         "train_combined_score": train_score,
+         "train_pass_at_2_score": train_pass_score,
+         "train_cell_accuracy": round(train_cell_acc, 4),
+     }
+     error_artifacts = {}
+     for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)):
+         example_name = f"train_example_{i}"
+         metrics[f"{example_name}_pass_at_2"] = train_pass
+         best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i])
+         metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
+         for attempt in train_diagnostics:
+             attempt_pass = train_diagnostics[attempt]["perfect_match"]
+             metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass
+             if not attempt_pass:
+                 pred = train_attempts[i][attempt]
+                 gt = train_gts[i]
+                 error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts(
+                     train_diagnostics[attempt], pred=pred, gt=gt
+                 )
+
+     # Optional: include test feedback (uses solutions if available)
+     if INCLUDE_TEST:
+         solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json")
+         if os.path.isfile(solution_path):
+             with open(solution_path, 'r') as f:
+                 solutions = json.load(f)
+             solution = solutions.get(task_id)
+             if solution is not None and "test" in task:
+                 if len(task["test"]) != len(solution):
+                     raise ValueError(
+                         f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
+                         f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
+                         f"and arc-agi_{TASK_FILE}_solutions.json were generated together."
+                     )
+                 test_inputs = [np.array(inp["input"]) for inp in task['test']]
+                 test_gts = [np.array(gt) for gt in solution]
+
+                 test_attempts = []
+                 for inp in test_inputs:
+                     attempt_1 = program_module.transform_grid_attempt_1(inp)
+                     if not isinstance(attempt_1, np.ndarray):
+                         print("transform_grid_attempt_1 did not return a numpy array (test)")
+                         return EvaluationResult(
+                             metrics={
+                                 "runs_successfully": 0.0,
+                                 "combined_score": 0.0,
+                                 "error": "transform_grid_attempt_1 did not return a numpy array (test)"
+                             },
+                             artifacts={
+                                 "error_type": "InvalidReturnType",
+                                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).",
+                                 "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array."
+                             }
+                         )
+
+                     attempt_2 = program_module.transform_grid_attempt_2(inp)
+                     if not isinstance(attempt_2, np.ndarray):
+                         print("transform_grid_attempt_2 did not return a numpy array (test)")
+                         return EvaluationResult(
+                             metrics={
+                                 "runs_successfully": 0.0,
+                                 "combined_score": 0.0,
+                                 "error": "transform_grid_attempt_2 did not return a numpy array (test)"
+                             },
+                             artifacts={
+                                 "error_type": "InvalidReturnType",
+                                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).",
+                                 "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array."
+                             }
+                         )
+                     test_attempts.append([attempt_1, attempt_2])
+
+                 pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
+                 test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test)
+                 test_cell_acc = sum(
+                     best_attempt_cell_accuracy(attempts, gt)
+                     for attempts, gt in zip(test_attempts, test_gts)
+                 ) / len(test_gts)
+                 test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc
+
+                 metrics["test_combined_score"] = test_score
+                 metrics["test_pass_at_2_score"] = test_pass_score
+                 metrics["test_cell_accuracy"] = round(test_cell_acc, 4)
+                 metrics["test_included"] = 1
+
+                 for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
+                     example_name = f"test_example_{i}"
+                     metrics[f"{example_name}_pass_at_2"] = test_pass
+                     best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i])
+                     metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
+                     for attempt in test_diagnostics:
+                         metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
+                     if test_pass == 0:
+                         first_failing_idx = next(
+                             (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
+                             0,
+                         )
+                         pred = test_attempts[i][first_failing_idx]
+                         gt = test_gts[i]
+                         error_artifacts[f"{example_name}"] = extract_failure_artifacts(
+                             test_diagnostics[first_failing_idx], pred=pred, gt=gt
+                         )
+
+                 if USE_TEST_IN_SCORE:
+                     metrics["combined_score"] = (train_score + test_score) / 2.0
+                 else:
+                     metrics["test_included"] = 0
+             else:
+                 metrics["test_included"] = 0
+
+     return EvaluationResult(
+         metrics=metrics,
+         artifacts=error_artifacts
+     )
+
+
+ def _evaluate_as_dict(program_path):
+     """Adapter: calls evaluate() and converts EvaluationResult to a plain dict."""
+     result = evaluate(program_path)
+     d = dict(result.metrics)
+     for k, v in result.artifacts.items():
+         d[k] = v
+     return d
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> EvaluationResult to the
+     # container JSON protocol. wrapper.py is copied from
+     # skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(_evaluate_as_dict)
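
> Editor's note: a minimal program shape that this evaluator accepts (identity/transpose placeholders for illustration only; real candidates implement the rule inferred from the training pairs):

```python
import numpy as np

def transform_grid_attempt_1(grid: np.ndarray) -> np.ndarray:
    # First strategy goes here; identity is just a placeholder.
    return grid.copy()

def transform_grid_attempt_2(grid: np.ndarray) -> np.ndarray:
    # Second, genuinely different strategy; transpose as a stand-in.
    return grid.T.copy()
```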
benchmarks/arc_benchmark/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
+ numpy
benchmarks/arc_benchmark/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/arc_benchmark/generate_config.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import os
+ import yaml
+ import json
+
+
+ def load_task_as_prompt(task_json, task_num):
+     with open(task_json, 'r') as f:
+         tasks = json.load(f)
+
+     task_id = list(tasks.keys())[int(task_num)]
+     task = tasks[task_id]
+     train_inputs = [inp["input"] for inp in task['train']]
+     train_outputs = [gt["output"] for gt in task['train']]
+
+     train_pairs = ""
+     for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)):
+         train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n"
+
+     prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles.
+ Find the common pattern that transforms each input grid into its corresponding output grid.
+
+ Your task is to write python functions that implement the MOST GENERAL transformation rule. The rule must:
+ - Apply consistently to ALL training examples
+ - Generalize to unseen inputs (critical for success)
+ - Be based on structural patterns, not memorized examples
+ - Use relative/spatial rules rather than absolute coordinates
+
+ Generalization rules (THIS IS CRITICAL):
+ - Infer the transformation ONLY from the training input-output pairs
+ - If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one
+ - Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules
+ - Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples
+ - Think: "What is the underlying principle?" not "What fits these specific examples?"
+ - Use numpy only (no external libraries)
+
+ Common failure modes to avoid:
+ - Overfitting to specific grid sizes or positions in training examples
+ - Hardcoding colors, coordinates, or counts from training data
+ - Assuming global properties (like separator colors) without verifying across ALL examples
+ - Using absolute positions when relative/structural rules would generalize better
+
+ Solution approach:
+ - Analyze the training examples to identify the CORE transformation principle
+ - Prefer block-wise, object-wise, or pattern-based rules that work locally
+ - If the grid has distinct regions, solve each region independently
+ - Build flexible rules that adapt to different input sizes and structures
+
+ Training examples:
+ {train_pairs}
+
+ Your task: Write 2 different Python functions that implement the general transformation rule.
+ - Each function takes a 2D numpy array as input and returns the transformed 2D numpy array
+ - The two attempts should use genuinely different strategies (e.g., different algorithmic approaches)
+ - Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases
+
+ CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples.
+
+ Remember to only output the modified python functions as your solution."""
+
+     return prompt
+
+ def generate_config(task_num, task_file, dataset_root=None, base_config=None):
+     if dataset_root is None:
+         dataset_root = os.getenv("DATA_ROOT")
+     if not dataset_root:
+         dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+     task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json")
+     prompt = load_task_as_prompt(task_json, task_num)
+
+     if base_config is None:
+         default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")
+         base_config = os.getenv("BASE_CONFIG", default_base)
+     with open(base_config, 'r') as file:
+         config = yaml.safe_load(file)
+
+     config['prompt']['system_message'] = prompt
+     # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config)
+     api_key_env = os.getenv("OPENAI_API_KEY")
+     if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key":
+         config["llm"]["api_key"] = api_key_env.strip()
+     # Override max_iterations from env if set (e.g. by run_discovery.sh)
+     max_iter_env = os.getenv("MAX_ITERATIONS")
+     if max_iter_env is not None and str(max_iter_env).strip() != "":
+         try:
+             config["max_iterations"] = int(max_iter_env)
+         except ValueError:
+             pass
+
+     # Write to a per-task config file so parallel runs don't conflict
+     out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml")
+     with open(out_path, 'w') as file:
+         yaml.dump(config, file)
+     return out_path
+
+ if __name__ == "__main__":
+     TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+     TASK_NUM = os.getenv("TASK_NUM", "0")
+
+     path = generate_config(TASK_NUM, TASK_FILE)
+     print(path)
+
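A quick usage sketch for the generator above (assumes the ARC data files exist under `data/`; the task number and iteration count are illustrative):

```python
# Build a per-task config for ARC training task 3 and print where it was written.
import os
from generate_config import generate_config

os.environ["MAX_ITERATIONS"] = "20"  # optional override picked up by generate_config
cfg_path = generate_config(task_num=3, task_file="training")
print(cfg_path)  # e.g. ./config_task_3.yaml
```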
benchmarks/arc_benchmark/initial_program.py ADDED
@@ -0,0 +1,42 @@
+ # EVOLVE-BLOCK-START
+
+ import numpy as np
+
+ def transform_grid_attempt_1(grid):
+     """
+     Example transformation:
+     - Validate input (2D, integer values 0-9).
+     - Rotate the grid 90 degrees clockwise.
+     - Increment every cell by 1 modulo 10 (keeps values 0-9).
+     Returns a new numpy int array.
+     """
+     arr = _validate_grid(grid)
+     out = np.rot90(arr, k=-1)  # 90 degrees clockwise
+     out = (out + 1) % 10
+     return out.astype(np.int32)
+
+ def transform_grid_attempt_2(grid):
+     """
+     Example transformation:
+     - Validate input (2D, integer values 0-9).
+     - Upsample each cell to a 2x2 block (doubling both dimensions).
+     - Invert colors by mapping v -> 9 - v (keeps values 0-9).
+     Returns a new numpy int array.
+     """
+     arr = _validate_grid(grid)
+     out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1)
+     out = 9 - out
+     return out.astype(np.int32)
+
+ # EVOLVE-BLOCK-END
+
+ def _validate_grid(grid):
+     arr = np.asarray(grid)
+     if arr.ndim != 2:
+         raise ValueError("Input must be a 2D array.")
+     # cast to integer type for value checks
+     if not np.issubdtype(arr.dtype, np.integer):
+         arr = arr.astype(int)
+     if arr.size and (arr.min() < 0 or arr.max() > 9):
+         raise ValueError("Array values must be integers in the range 0-9.")
+     return arr
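A quick sanity check of the two seed transforms (a sketch, assuming numpy is installed):

```python
import numpy as np

grid = np.array([[0, 1],
                 [2, 3]])
print(transform_grid_attempt_1(grid))  # [[3 1] [4 2]]: rotated clockwise, then +1 mod 10
print(transform_grid_attempt_2(grid))  # 4x4 grid with every value inverted to 9 - v
```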
benchmarks/arc_benchmark/post_discovery_eval.py ADDED
@@ -0,0 +1,157 @@
+ import importlib.util
+ import os
+ import json
+ import numpy as np
+ from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
+
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+ TASK_NUM = os.getenv("TASK_NUM", "0")
+ OUTS_DIR = os.getenv("OUTS_DIR", "")
+ # Optional: path to a checkpoint dir (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that best_program.py on test set
+ PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")
+
+
+ def _program_path():
+     """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
+     if PROGRAM_DIR:
+         return os.path.join(PROGRAM_DIR, "best_program.py")
+     return os.path.join(OUTS_DIR, "best", "best_program.py")
+
+
+ def _result_path():
+     """Where to write post_evolution_evaluation_result.json."""
+     if PROGRAM_DIR:
+         return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
+     return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")
+
+
+ def load_program_module():
+     """Dynamically load the best_program.py module from the specified directory."""
+     path = _program_path()
+     if not os.path.isfile(path):
+         raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
+     spec = importlib.util.spec_from_file_location("program_module", path)
+     program_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(program_module)
+
+     return program_module
+
+ def evaluate():
+     """Evaluate the program module located in the specified directory."""
+     program_module = load_program_module()
+     if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+         print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+         error_artifacts = {
+             "error_type": "MissingFunction",
+             "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+             "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take a 2D numpy array as an argument and return a 2D numpy array."
+         }
+
+         return dict(
+             metrics={
+                 "runs_successfully": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+             },
+             artifacts=error_artifacts
+         )
+     # Load ARC tasks
+     data_root = os.getenv("DATA_ROOT")
+     if not data_root:
+         data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+     challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
+     solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")
+
+     with open(challenge_path, 'r') as f:
+         tasks = json.load(f)
+     with open(solution_path, 'r') as f:
+         solutions = json.load(f)
+
+     task_id = list(tasks.keys())[int(TASK_NUM)]
+     solution = solutions[task_id]
+     task = tasks[task_id]
+
+     # Sanity check: test inputs and solutions must align (same task, same order)
+     if len(task["test"]) != len(solution):
+         raise ValueError(
+             f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
+             f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
+             f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
+         )
+
+     test_inputs = [np.array(inp["input"]) for inp in task['test']]
+     test_gts = [np.array(gt) for gt in solution]
+
+     test_attempts = []
+     for inp in test_inputs:
+         attempt_1 = program_module.transform_grid_attempt_1(inp)
+         if not isinstance(attempt_1, np.ndarray):
+             print("transform_grid_attempt_1 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
+             }
+
+             return dict(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_1 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+
+         attempt_2 = program_module.transform_grid_attempt_2(inp)
+         if not isinstance(attempt_2, np.ndarray):
+             print("transform_grid_attempt_2 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
+             }
+
+             return dict(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_2 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+         test_attempts.append([attempt_1, attempt_2])
+
+     pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
+     metrics = {
+         "runs_successfully": 1.0,
+         "combined_score": sum(pass_at_2_test) / len(pass_at_2_test),
+     }
+     error_artifacts = {}
+     for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
+         example_name = f"test_example_{i}"
+         metrics[f"{example_name}_pass_at_2"] = test_pass
+         for attempt in test_diagnostics:
+             metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
+         if test_pass == 0:
+             # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
+             first_failing = next(
+                 (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
+                 test_diagnostics[0],
+             )
+             error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)
+
+     return dict(
+         metrics=metrics,
+         artifacts=error_artifacts
+     )
+
+ if __name__ == "__main__":
+     evaluation_result = evaluate()
+     result_path = _result_path()
+     os.makedirs(os.path.dirname(result_path), exist_ok=True)
+     with open(result_path, 'w') as f:
+         json.dump(evaluation_result, f, indent=4)
+     print(f"Test-set evaluation written to {result_path}")
benchmarks/frontier-cs-eval/README.md ADDED
@@ -0,0 +1,72 @@
+ # Frontier-CS Benchmark
+
+ Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover.
+
+ ## Setup
+
+ ```bash
+ # 1. Clone Frontier-CS
+ cd benchmarks/frontier-cs-eval
+ git clone https://github.com/FrontierCS/Frontier-CS.git
+
+ # 2. Start the judge server (requires Docker)
+ cd Frontier-CS/algorithmic
+ docker compose up -d
+
+ # 3. Install dependencies (from project root)
+ cd ../../..
+ uv sync --extra frontier-cs
+
+ # 4. Set your API key
+ export OPENAI_API_KEY=...
+ ```
+
+ ## Run
+
+ Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve`
+
+ Single problem:
+ ```bash
+ cd benchmarks/frontier-cs-eval
+ FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \
+     -c config.yaml -s [search_algorithm] -i 50
+ ```
+
+ All problems in parallel:
+ ```bash
+ uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6
+ ```
+
+ ## Evaluate best programs (post-discovery)
+
+ ```bash
+ uv run python run_best_programs_frontiercs.py
+ ```
+
+ ## Analyze results
+
+ ```bash
+ uv run python combine_results.py   # merge training/testing scores into CSV
+ uv run python analyze_results.py   # generate plots and statistics
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.cpp` | Seed C++ program |
+ | `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge |
+ | `config.yaml` | Config with system prompt template |
+ | `run_all_frontiercs.py` | Parallelizes evolution across all problems |
+ | `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution |
+ | `combine_results.py` | Combines training/testing scores into CSV |
+ | `analyze_results.py` | Generates score analysis plots and statistics |
+
+ ## Environment variables
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `OPENAI_API_KEY` | (required) | API key |
+ | `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve |
+ | `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs |
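To spread evaluation load across several judge servers, set `JUDGE_URLS` to a comma-separated list; `evaluator.py` then picks one at random per call. A sketch of that selection logic (the second URL is illustrative):

```python
import os, random

os.environ["JUDGE_URLS"] = "http://localhost:8081,http://localhost:8082"
urls = [u.strip() for u in os.environ["JUDGE_URLS"].split(",") if u.strip()]
print(random.choice(urls))  # one judge chosen per evaluation call
```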
benchmarks/frontier-cs-eval/analyze_results.py ADDED
@@ -0,0 +1,105 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+
+ # Define paths
+ _script_dir = str(Path(__file__).resolve().parent)
+ input_csv = str(Path(_script_dir) / "combined_results.csv")
+ output_dir = _script_dir
+
+ # Read the CSV file
+ df = pd.read_csv(input_csv)
+
+ # Calculate average of training and testing scores
+ df['average_score'] = (df['training_score'] + df['testing_score']) / 2
+
+ # Remove rows where either score is None (NaN)
+ df_complete = df.dropna(subset=['training_score', 'testing_score'])
+
+ print("\n=== Analysis Results ===")
+ print(f"Total problems: {len(df)}")
+ print(f"Problems with complete data: {len(df_complete)}")
+ print("\nTraining Scores:")
+ print(f"  Mean: {df_complete['training_score'].mean():.4f}")
+ print(f"  Median: {df_complete['training_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['training_score'].std():.4f}")
+ print(f"  Min: {df_complete['training_score'].min():.4f}")
+ print(f"  Max: {df_complete['training_score'].max():.4f}")
+
+ print("\nTesting Scores:")
+ print(f"  Mean: {df_complete['testing_score'].mean():.4f}")
+ print(f"  Median: {df_complete['testing_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['testing_score'].std():.4f}")
+ print(f"  Min: {df_complete['testing_score'].min():.4f}")
+ print(f"  Max: {df_complete['testing_score'].max():.4f}")
+
+ print("\nAverage Scores:")
+ print(f"  Mean: {df_complete['average_score'].mean():.4f}")
+ print(f"  Median: {df_complete['average_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['average_score'].std():.4f}")
+
+ # Save the updated CSV with averages
+ output_csv = Path(output_dir) / "combined_results_with_averages.csv"
+ df.to_csv(output_csv, index=False)
+ print(f"\nUpdated CSV with averages saved to {output_csv}")
+
+ # Create visualizations
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+
+ # 1. Scatter plot: Training vs Testing scores
+ ax = axes[0, 0]
+ ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50)
+ # Add diagonal line for reference (where training == testing)
+ lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()),
+        max(df_complete['training_score'].max(), df_complete['testing_score'].max())]
+ ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing')
+ ax.set_xlabel('Training Score')
+ ax.set_ylabel('Testing Score')
+ ax.set_title('Training vs Testing Scores')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ # 2. Distribution comparison - histograms
+ ax = axes[0, 1]
+ ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black')
+ ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black')
+ ax.set_xlabel('Score')
+ ax.set_ylabel('Frequency')
+ ax.set_title('Distribution of Training vs Testing Scores')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ # 3. Box plot comparison
+ ax = axes[1, 0]
+ box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']]
+ bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average'])
+ ax.set_ylabel('Score')
+ ax.set_title('Score Comparison (Box Plot)')
+ ax.grid(True, alpha=0.3, axis='y')
+
+ # 4. Difference plot: Training - Testing
+ ax = axes[1, 1]
+ difference = df_complete['training_score'] - df_complete['testing_score']
+ ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50)
+ ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference')
+ ax.set_xlabel('Problem ID')
+ ax.set_ylabel('Training Score - Testing Score')
+ ax.set_title('Score Difference (Training - Testing)')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ plot_path = Path(output_dir) / "results_analysis.png"
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
+ print(f"Plot saved to {plot_path}")
+
+ # Additional statistics about differences
+ print("\nScore Differences (Training - Testing):")
+ print(f"  Mean Difference: {difference.mean():.4f}")
+ print(f"  Median Difference: {difference.median():.4f}")
+ print(f"  Std Dev: {difference.std():.4f}")
+ print(f"  Problems where training > testing: {(difference > 0).sum()}")
+ print(f"  Problems where testing > training: {(difference < 0).sum()}")
+
+ plt.show()
benchmarks/frontier-cs-eval/combine_results.py ADDED
@@ -0,0 +1,66 @@
+ import json
+ import csv
+ import os
+ from pathlib import Path
+
+ # Define paths
+ _script_dir = Path(__file__).resolve().parent
+ _repo_root = _script_dir.parent.parent
+ training_dir = str(_repo_root / "outputs" / "frontier_cs")
+ testing_dir = str(_script_dir / "evaluation_results")
+ output_csv = str(_script_dir / "combined_results.csv")
+
+ # Collect all problems
+ results = []
+
+ # Get all problem directories from training data
+ training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")])
+
+ print(f"Found {len(training_problems)} training problems")
+
+ for problem_dir in training_problems:
+     problem_id = problem_dir.replace("problem_", "")
+
+     # Get training score from best_program_info.json
+     training_score = None
+     training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json")
+
+     if os.path.exists(training_info_path):
+         try:
+             with open(training_info_path, 'r') as f:
+                 training_data = json.load(f)
+             training_score = training_data.get("metrics", {}).get("combined_score")
+         except Exception as e:
+             print(f"Error reading training data for problem {problem_id}: {e}")
+
+     # Get testing score from evaluation_results json
+     testing_score = None
+     testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json")
+
+     if os.path.exists(testing_json_path):
+         try:
+             with open(testing_json_path, 'r') as f:
+                 testing_data = json.load(f)
+             testing_score = testing_data.get("combined_score")
+         except Exception as e:
+             print(f"Error reading testing data for problem {problem_id}: {e}")
+
+     results.append({
+         "problem_id": problem_id,
+         "training_score": training_score,
+         "testing_score": testing_score
+     })
+
+ # Write to CSV
+ with open(output_csv, 'w', newline='') as csvfile:
+     fieldnames = ["problem_id", "training_score", "testing_score"]
+     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+     writer.writeheader()
+     writer.writerows(results)
+
+ print(f"\nResults written to {output_csv}")
+ print(f"Total problems: {len(results)}")
+ print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}")
+ print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}")
+ print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}")
benchmarks/frontier-cs-eval/config.yaml ADDED
@@ -0,0 +1,57 @@
+ # Frontier-CS Benchmark
+ # Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s <strategy> -i 50
+
+ max_iterations: 100
+ checkpoint_interval: 10
+ log_level: INFO
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   api_base: https://api.openai.com/v1
+   temperature: 0.7
+   # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
+   max_tokens: 32000
+   timeout: 600
+   # To use Gemini: override with --model gemini-3-pro-preview
+
+ prompt:
+   system_message: |
+     You are an expert competitive programmer specializing in algorithmic optimization.
+
+     PROBLEM STATEMENT:
+     {problem_statement}
+
+     CONSTRAINTS:
+     {problem_constraints}
+
+     OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better).
+     Your solution must be valid C++ code that compiles and runs correctly.
+
+     KEY STRATEGIES:
+     - Analyze the problem structure carefully before coding
+     - Consider time and space complexity constraints
+     - Use efficient data structures (vectors, maps, sets, priority queues)
+     - Implement clean, well-structured code
+     - Handle edge cases properly
+     - Optimize hot loops and critical sections
+
+     COMMON TECHNIQUES:
+     - Dynamic programming for optimization problems
+     - Greedy algorithms with proper ordering
+     - Graph algorithms (BFS, DFS, shortest paths)
+     - Binary search for monotonic functions
+     - Divide and conquer approaches
+     - Heuristic search (simulated annealing, genetic algorithms, local search)
+
+     OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout.
+
+ evaluator:
+   timeout: 300
+   max_retries: 3
+   cascade_evaluation: false
+
+ diff_based_generation: true
+ max_solution_length: 50000
+ random_seed: 42
benchmarks/frontier-cs-eval/evaluator.py ADDED
@@ -0,0 +1,174 @@
+ """
+ Evaluator for Frontier-CS algorithmic problems.
+
+ This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
+ against Frontier-CS benchmark problems using the local judge server.
+ """
+
+ import traceback
+ from pathlib import Path
+ import logging
+ import sys
+ import os
+ import random
+
+ logger = logging.getLogger(__name__)
+
+ # Support multiple judge servers for load balancing
+ DEFAULT_JUDGE_URL = "http://localhost:8081"
+ JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
+ JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()]
+
+ def get_judge_url() -> str:
+     """Get a judge URL using random selection for load balancing."""
+     return random.choice(JUDGE_URLS)
+
+ # Add Frontier-CS to path
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ try:
+     from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
+     from frontier_cs.runner.base import EvaluationStatus
+ except ImportError as e:
+     logger.error(f"Failed to import Frontier-CS: {e}")
+     logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
+     raise
+
+ def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
+     """
+     Evaluate a C++ solution for a Frontier-CS algorithmic problem.
+
+     Args:
+         program_path: Path to the C++ solution file
+         problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.)
+             If None, will be read from FRONTIER_CS_PROBLEM env var or config
+
+     Returns:
+         dict with evaluation results:
+             - combined_score: The score from the judge (higher is better)
+             - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
+             - status: Evaluation status string
+             - message: Any error or status messages
+             - problem_id: The problem ID
+             - program_path: Path to the evaluated program
+             - score_unbounded: Unbounded score if available
+             - metadata: Additional evaluation metadata
+     """
+     # Get problem_id from parameter, environment, or kwargs
+     if problem_id is None:
+         problem_id = os.environ.get('FRONTIER_CS_PROBLEM')
+         if problem_id is None:
+             problem_id = kwargs.get('frontier_cs_problem', '0')
+
+     logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")
+
+     try:
+         # Initialize evaluator with judge server (load balanced if multiple configured)
+         judge_url = get_judge_url()
+         logger.info(f"Using judge server: {judge_url}")
+         evaluator = FrontierCSEvaluator(
+             backend="docker",
+             judge_url=judge_url,
+             register_cleanup=False,
+         )
+
+         # Read the solution code
+         solution_path = Path(program_path)
+         if not solution_path.exists():
+             error_msg = f"Solution file not found: {program_path}"
+             logger.error(error_msg)
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "error",
+                 "message": error_msg,
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+             }
+
+         # Extract code and remove any EVOLVE-BLOCK markers
+         code = solution_path.read_text().replace(
+             "// EVOLVE-BLOCK-START", ""
+         ).replace(
+             "// EVOLVE-BLOCK-END", ""
+         ).strip()
+
+         logger.info(f"Code extracted from {program_path}")
+
+         # Evaluate the solution
+         result = evaluator.evaluate(
+             track="algorithmic",
+             problem_id=problem_id,
+             code=code,
+             backend="docker",
+         )
+
+         logger.info(f"Evaluation completed with status: {result.status}")
+
+         # Process result
+         if result.status == EvaluationStatus.SUCCESS:
+             print(result)
+             score = result.score
+             # Use unbounded score for optimization (allows >100 if beating reference)
+             score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score
+             print(f"score={score}, score_unbounded={score_unbounded}")
+
+             # Extract only essential metadata (exclude large test case outputs)
+             essential_metadata = {}
+             if result.metadata:
+                 essential_metadata = {
+                     "status": result.metadata.get("status"),
+                     "passed": result.metadata.get("passed"),
+                     "result": result.metadata.get("result"),
+                     "score": result.metadata.get("score"),
+                     "scoreUnbounded": result.metadata.get("scoreUnbounded"),
+                 }
+
+             return {
+                 "combined_score": float(score),  # Ensure it's a float
+                 "score_unbounded": score_unbounded,
+                 "runs_successfully": 1.0,
+                 "status": "success",
+                 "message": result.message or "Evaluation successful",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+                 "duration_seconds": result.duration_seconds,
+                 "metadata": essential_metadata,
+             }
+         elif result.status == EvaluationStatus.TIMEOUT:
+             logger.warning(f"Evaluation timed out: {result.message}")
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "timeout",
+                 "message": result.message or "Evaluation timed out",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+             }
+         else:  # ERROR status
+             logger.error(f"Evaluation error: {result.message}")
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "error",
+                 "message": result.message or "Evaluation failed",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+                 "logs": result.logs,
+             }
+
+     except Exception as e:
+         logger.error(f"Evaluation failed completely: {str(e)}")
+         logger.error(traceback.format_exc())
+         return {
+             "combined_score": 0.0,
+             "runs_successfully": 0.0,
+             "status": "error",
+             "message": str(e),
+             "problem_id": problem_id,
+             "program_path": program_path,
+             "error": str(e),
+         }
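A direct-call sketch for the evaluator above (assumes the Docker judge server from the README is running on localhost:8081):

```python
if __name__ == "__main__":
    # Evaluate the seed program against problem 0 and print the judge's verdict.
    result = evaluate("initial_program.cpp", problem_id="0")
    print(result["status"], result["combined_score"])
```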
benchmarks/frontier-cs-eval/initial_program.cpp ADDED
@@ -0,0 +1,6 @@
+ #include <bits/stdc++.h>
+ using namespace std;
+ int main(){
+     std::cout << "Hello, World!" << std::endl;
+     return 0;
+ }
benchmarks/frontier-cs-eval/run_all_frontiercs.py ADDED
@@ -0,0 +1,70 @@
+ import argparse
+ import os
+ import sys
+ import subprocess
+ from pathlib import Path
+ from concurrent.futures import ProcessPoolExecutor
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ SCRIPT_DIR = Path(__file__).resolve().parent
+
+ frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
+
+
+ def run_single_problem(args):
+     p_id, search, iterations, env = args
+     print(f"\n[START] Problem ID: {p_id}")
+     command = [
+         "uv", "run", "skydiscover-run",
+         "initial_program.cpp", "evaluator.py",
+         "-c", "config.yaml",
+         "-s", search,
+         "-i", str(iterations),
+         "-o", f"outputs/frontier_cs/problem_{p_id}",
+     ]
+     env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)}
+     try:
+         subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR))
+         return f"✅ Problem {p_id} completed."
+     except subprocess.CalledProcessError as e:
+         return f"❌ Problem {p_id} failed: {e}"
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems")
+     parser.add_argument("--search", "-s", default="adaevolve",
+                         help="Search algorithm (default: adaevolve)")
+     parser.add_argument("--iterations", "-i", type=int, default=50,
+                         help="Iterations per problem (default: 50)")
+     parser.add_argument("--workers", "-w", type=int, default=6,
+                         help="Parallel workers (default: 6)")
+     args = parser.parse_args()
+
+     runner = AlgorithmicLocalRunner()
+     problems_data = runner.list_problems()
+     problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int)
+
+     print(f"Running {len(problem_ids)} problems with {args.workers} workers "
+           f"(search={args.search}, iterations={args.iterations})...")
+
+     env = os.environ.copy()
+     task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids]
+
+     with ProcessPoolExecutor(max_workers=args.workers) as executor:
+         results = list(executor.map(run_single_problem, task_args))
+
+     print("\n" + "=" * 30)
+     print("ALL RUNS COMPLETE")
+     print("=" * 30)
+     for result in results:
+         print(result)
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py ADDED
@@ -0,0 +1,404 @@
+ import os
+ import sys
+ import json
+ import logging
+ import threading
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Add Frontier-CS to path
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ try:
+     from frontier_cs.evaluator import FrontierCSEvaluator
+     from frontier_cs.runner.base import EvaluationStatus
+ except ImportError as e:
+     logger.error(f"Failed to import Frontier-CS: {e}")
+     logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
+     sys.exit(1)
+
+
+ class BestProgramEvaluator:
+     """Evaluates all best_program.cpp files in the outputs directory."""
+
+     def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8):
+         """
+         Initialize the evaluator.
+
+         Args:
+             outputs_dir: Path to the outputs directory containing problem folders
+             judge_url: URL of the judge server
+             num_workers: Number of parallel workers for evaluation
+         """
+         self.outputs_dir = Path(outputs_dir)
+         self.judge_url = judge_url
+         self.num_workers = num_workers
+
+         # Use thread-local storage for evaluator instances (avoid race condition)
+         self._evaluator_local = threading.local()
+
+         self.results = []
+
+         # Create results directory in the script's directory
+         self.results_dir = Path(__file__).resolve().parent / "evaluation_results"
+         self.results_dir.mkdir(exist_ok=True)
+         logger.info(f"Results will be saved to {self.results_dir}")
+         logger.info(f"Configured {self.num_workers} workers with thread-local evaluators (the current run loop evaluates sequentially)")
+
+     def _get_evaluator(self) -> 'FrontierCSEvaluator':
+         """
+         Get the evaluator for the current thread.
+         Creates a new instance if this thread hasn't created one yet.
+         This avoids race conditions from sharing a single evaluator across threads.
+         """
+         if not hasattr(self._evaluator_local, 'evaluator'):
+             self._evaluator_local.evaluator = FrontierCSEvaluator(
+                 backend="docker",
+                 judge_url=self.judge_url,
+             )
+             logger.debug(f"Created new evaluator for thread {threading.current_thread().name}")
+         return self._evaluator_local.evaluator
+
+     def find_best_programs(self) -> Dict[str, Path]:
+         """
+         Find all best_program.cpp files in the outputs directory.
+
+         Returns:
+             Dict mapping problem_id to best_program.cpp path
+         """
+         best_programs = {}
+
+         # Look for frontier_cs subdirectory
+         frontier_cs_dir = self.outputs_dir / "frontier_cs"
+         if not frontier_cs_dir.exists():
+             logger.error(f"frontier_cs directory not found at {frontier_cs_dir}")
+             return best_programs
+
+         # Iterate through problem directories
+         for problem_dir in sorted(frontier_cs_dir.iterdir()):
+             if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"):
+                 continue
+
+             # Extract problem ID
+             problem_id = problem_dir.name.replace("problem_", "")
+
+             # Look for best_program.cpp
+             best_program_path = problem_dir / "best" / "best_program.cpp"
+             if best_program_path.exists():
+                 best_programs[problem_id] = best_program_path
+                 logger.info(f"Found best_program.cpp for problem {problem_id}")
+             else:
+                 logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}")
+
+         return best_programs
+
+     def evaluate_program(self, problem_id: str, program_path: Path) -> Dict:
+         """
+         Evaluate a single best_program.cpp file.
+
+         Args:
+             problem_id: The Frontier-CS problem ID
+             program_path: Path to the best_program.cpp file
+
+         Returns:
+             Dictionary with evaluation results
+         """
+         logger.info(f"Evaluating problem {problem_id}: {program_path}")
+
+         try:
+             # Read the solution code
+             if not program_path.exists():
+                 error_msg = f"Solution file not found: {program_path}"
+                 logger.error(error_msg)
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "error",
+                     "message": error_msg,
+                 }
+
+             # Read the code
+             code = program_path.read_text().replace(
+                 "// EVOLVE-BLOCK-START", ""
+             ).replace(
+                 "// EVOLVE-BLOCK-END", ""
+             ).strip()
+
+             logger.info(f"Code extracted from {program_path}, length: {len(code)} characters")
+
+             # Evaluate the solution (use thread-local evaluator)
+             evaluator = self._get_evaluator()
+             result = evaluator.evaluate(
+                 track="algorithmic",
+                 problem_id=problem_id,
+                 code=code,
+                 backend="docker",
+             )
+
+             logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}")
+
+             # Log the result object and its properties
+             logger.info(f"Judger output for problem {problem_id}:")
+             logger.info(f"  Status: {result.status}")
+             logger.info(f"  Message: {result.message}")
+             if hasattr(result, 'score'):
+                 logger.info(f"  Score: {result.score}")
+             if hasattr(result, 'duration_seconds'):
+                 logger.info(f"  Duration: {result.duration_seconds}s")
+             if hasattr(result, 'metadata'):
+                 logger.info(f"  Metadata: {result.metadata}")
+             logger.info(f"  Full result object: {result}")
+
+             # Process result
+             if result.status == EvaluationStatus.SUCCESS:
+                 score = result.score
+                 logger.info(f"Problem {problem_id}: Score = {score}")
+
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": float(score),
+                     "runs_successfully": 1.0,
+                     "status": "success",
+                     "message": result.message or "Evaluation successful",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                     "metadata": result.metadata if hasattr(result, 'metadata') else None,
+                 }
+             elif result.status == EvaluationStatus.TIMEOUT:
+                 logger.warning(f"Problem {problem_id}: Evaluation timed out")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "timeout",
+                     "message": f"Evaluation timed out: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             elif result.status == EvaluationStatus.COMPILATION_ERROR:
+                 logger.warning(f"Problem {problem_id}: Compilation error")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "compilation_error",
+                     "message": f"Compilation error: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             else:
+                 logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": str(result.status),
+                     "message": f"Evaluation failed: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+
+         except Exception as e:
+             logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}")
+             logger.error(f"Exception type: {type(e).__name__}")
+             import traceback
+             logger.error(traceback.format_exc())
+
+             return {
+                 "problem_id": problem_id,
+                 "program_path": str(program_path),
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "exception",
+                 "message": str(e),
+             }
+
+     def run_all_evaluations(self) -> List[Dict]:
+         """
+         Run evaluations for all best_program.cpp files sequentially (one at a time).
+
+         Returns:
+             List of evaluation results
+         """
+         logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}")
+
+         best_programs = self.find_best_programs()
+         logger.info(f"Found {len(best_programs)} best_program.cpp files")
+
+         if not best_programs:
+             logger.warning("No best_program.cpp files found!")
+             return []
+
+         # Sort problems by ID for consistent ordering
+         sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0]))
+
+         # Evaluate each program sequentially (no parallelization)
+         results = []
+         total = len(sorted_problems)
+         for idx, (problem_id, program_path) in enumerate(sorted_problems, 1):
+             logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})")
+             try:
+                 result = self.evaluate_program(problem_id, program_path)
+
+                 # CRITICAL: Ensure problem_id matches
+                 if result.get("problem_id") != problem_id:
+                     logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}")
+                     result["problem_id"] = problem_id  # Force correct problem_id
+
+                 results.append(result)
+                 self.results.append(result)
+
+                 logger.info(f"[SAVE] Saving problem {problem_id} result to file")
+                 # Save result immediately after evaluation
+                 self.save_problem_result(result)
+
+             except Exception as e:
+                 logger.error(f"Exception evaluating problem {problem_id}: {str(e)}")
+                 import traceback
+                 logger.error(traceback.format_exc())
+
+                 error_result = {
+                     "problem_id": problem_id,
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "exception",
+                     "message": str(e),
+                 }
+                 results.append(error_result)
+                 self.results.append(error_result)
+                 self.save_problem_result(error_result)
+
+         return results
+
+     def save_results(self, output_file: str = "evaluation_results.json"):
+         """
+         Save evaluation results to a JSON file.
+
+         Args:
+             output_file: Path to save the results
+         """
+         output_path = Path(output_file)
+         with open(output_path, 'w') as f:
+             json.dump(self.results, f, indent=2)
+         logger.info(f"Results saved to {output_path}")
+
+     def save_problem_result(self, result: Dict):
+         """
+         Save individual problem result to a separate file.
+
+         Args:
+             result: The evaluation result for a single problem
+         """
+         problem_id = result.get("problem_id", "unknown")
+         result_file = self.results_dir / f"problem_{problem_id}.json"
+
+         with open(result_file, 'w') as f:
+             json.dump(result, f, indent=2)
+         logger.info(f"Problem {problem_id} result saved to {result_file}")
+
+     def print_summary(self):
+         """Print a summary of the evaluation results."""
+         if not self.results:
+             logger.info("No results to summarize")
+             return
+
+         logger.info("\n" + "="*80)
+         logger.info("EVALUATION SUMMARY")
+         logger.info("="*80)
+
+         successful = [r for r in self.results if r.get("status") == "success"]
+         timeout = [r for r in self.results if r.get("status") == "timeout"]
+         compilation_error = [r for r in self.results if r.get("status") == "compilation_error"]
+         other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]]
+
+         logger.info(f"Total problems evaluated: {len(self.results)}")
+         logger.info(f"Successful: {len(successful)}")
+         logger.info(f"Timeouts: {len(timeout)}")
+         logger.info(f"Compilation errors: {len(compilation_error)}")
+         logger.info(f"Other errors: {len(other_error)}")
+
+         if successful:
+             scores = [r["combined_score"] for r in successful]
+             logger.info("\nSuccessful evaluation scores:")
+             logger.info(f"  Average score: {sum(scores) / len(scores):.2f}")
+             logger.info(f"  Min score: {min(scores):.2f}")
+             logger.info(f"  Max score: {max(scores):.2f}")
+
+             logger.info("\nTop 5 problems by score:")
+             top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5]
+             for i, result in enumerate(top_5, 1):
+                 logger.info(f"  {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}")
+
+         logger.info("="*80 + "\n")
+
+
+ def main():
+     """Main entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Evaluate all best_program.cpp files in the outputs directory"
+     )
+
+     # Default outputs directory is two levels up from this script
+     default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs"
+
+     parser.add_argument(
+         "--outputs-dir",
+         type=str,
+         default=str(default_outputs_dir),
+         help="Path to the outputs directory (default: ../../outputs from script location)"
+     )
+     parser.add_argument(
+         "--judge-url",
+         type=str,
+         default="http://localhost:8081",
+         help="URL of the judge server (default: http://localhost:8081)"
+     )
+     parser.add_argument(
+         "--output-file",
+         type=str,
+         default="evaluation_results.json",
+         help="Path to save the evaluation results (default: evaluation_results.json)"
+     )
+     parser.add_argument(
+         "--workers",
+         type=int,
+         default=8,
+         help="Number of parallel workers for evaluation (default: 8)"
+     )
+
+     args = parser.parse_args()
+
+     # Run evaluations
+     evaluator = BestProgramEvaluator(
+         outputs_dir=args.outputs_dir,
+         judge_url=args.judge_url,
+         num_workers=args.workers
+     )
+
+     results = evaluator.run_all_evaluations()
+     evaluator.save_results(args.output_file)
+     evaluator.print_summary()
+
+     logger.info(f"Evaluation complete. Results saved to {args.output_file}")
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/image_gen/README.md ADDED
@@ -0,0 +1,40 @@
+ # Image Generation Benchmark
+
+ This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via the LLM, crossover via context images drawn from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives the actual parent and context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed.
+
+ ## Benchmark: Sky Festival
+
+ **Directory:** `sky_festival/`
+
+ The system must generate a floating sky-festival image where many details must match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`.
+
+ **Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points. The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image.
+
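For reference, the per-category point budget described above sums to 100; a small sketch of the decomposition (category names abbreviated):

```python
rubric = {
    "cloud_shapes": 15, "balloons": 20, "floating_island": 10,
    "table_items": 20, "characters": 15, "decorations_creatures": 10,
    "rainbow_lighting": 10,
}
assert sum(rubric.values()) == 100  # total points awarded by the GPT-5 judge
```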
+ ## Setup
+
+ 1. **Set your API key:**
+
+    ```bash
+    export OPENAI_API_KEY=...
+    ```
+
+    Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API.
+
+ ## Run
+
+ ```bash
+ cd benchmarks/image_gen/sky_festival
+
+ # AdaEvolve
+ uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ # EvoX
+ uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric |
+ | `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` |
benchmarks/image_gen/sky_festival/config.yaml ADDED
@@ -0,0 +1,103 @@
+ # Sky Festival Benchmark
+ #
+ # Usage:
+ #   cd benchmarks/image_gen/sky_festival
+ #   skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ language: "image"
+ diff_based_generation: false
+ max_iterations: 100
+ checkpoint_interval: 1
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   temperature: 0.9
+   max_tokens: 16384
+   timeout: 300
+
+ evaluator:
+   timeout: 300
+
+ prompt:
+   system_message: |
+     You are an expert visual artist and image generation AI specializing in
+     complex compositional scenes with precise object counting, spatial
+     arrangement, and rich detail.
+
+     You can see the current images from the database along with their scores
+     across 7 categories: cloud shapes, hot air balloons, floating island,
+     table items, characters, decorations/creatures, and rainbow/lighting.
+
+     Your goal is to generate a NEW, improved image that scores higher on
+     the rubric. Pay special attention to:
+     - EXACT counts: 9 shaped clouds, 5 balloons, 4 trees, 6 cupcakes, 8 plates, 5 gifts, 6 characters, 11 bunting flags, 7 lanterns, 7 rainbow bands
+     - Correct passengers in each balloon (2 children, 1 woman, 3 cats, 1 violinist, empty)
+     - Legible text: "HAPPY 100TH SKY FESTIVAL" on banner, "100 YEARS" on cake
+     - Specific character details: robot buttons, grandmother's LEFT hand thumbs-up, dog's striped hat
+     - Correct spatial ordering: trees left-to-right, gift pyramid, cupcake grid
+     - Warm golden lighting from upper left, consistent shadows
+
+     Also provide brief text reasoning about your approach and what you changed.
+
+     # Target Image Description
+     A joyful, sunlit floating sky festival on a perfect summer day, viewed from a slightly elevated angle.
+
+     THE SKY AND BACKGROUND:
+     The sky is a brilliant gradient from warm gold at the horizon to deep cerulean blue at the top. There are exactly 9 fluffy white clouds scattered across the sky. Each cloud has a distinct shape: cloud 1 looks like a rabbit, cloud 2 looks like a teacup, cloud 3 looks like a musical note, cloud 4 looks like a crescent moon, cloud 5 looks like a whale, cloud 6 looks like a bicycle, cloud 7 looks like a crown, cloud 8 looks like a butterfly, cloud 9 looks like the number 7. The clouds are arranged in a gentle arc from left to right across the upper third of the image.
+
+     THE HOT AIR BALLOONS:
+     There are exactly 5 hot air balloons floating at different heights. Each balloon has a unique color and pattern:
+     - Balloon 1 (leftmost, highest): Red with white horizontal stripes. Its basket carries exactly 2 waving children.
+     - Balloon 2 (second from left, medium height): Sunshine yellow with orange polka dots. Its basket carries exactly 1 old woman holding a telescope.
+     - Balloon 3 (center, lowest): Rainbow gradient (red-orange-yellow-green-blue-purple from top to bottom). Its basket carries exactly 3 cats — one orange tabby, one black, one white — all wearing tiny party hats.
+     - Balloon 4 (second from right, medium height): Deep purple with gold stars printed on it. Its basket carries exactly 1 man playing a violin.
+     - Balloon 5 (rightmost, highest): Emerald green with a large white peace sign on the front. Its basket is empty but has a banner hanging from it that reads exactly: "HAPPY 100TH SKY FESTIVAL"
+
+     THE FLOATING ISLAND:
+     Below the balloons, there is a lush green floating island suspended in mid-air. The island is roughly circular and has grass, wildflowers, and 4 trees on it. The trees are different species: one oak with a thick trunk, one cherry blossom in full pink bloom, one palm tree leaning slightly right, and one pine tree (tallest of the four). The trees are spaced evenly along the island from left to right in that exact order: oak, cherry blossom, palm, pine.
+
+     THE PARTY TABLE:
+     On the center of the floating island sits a long rectangular wooden table covered with a checkered red-and-white tablecloth. On the table, from left to right:
+     - A 3-tier birthday cake with white frosting. The bottom tier has blue frosting roses, the middle tier has pink frosting roses, the top tier has a single golden candle that is lit with a bright flame. Written on the middle tier in purple icing: "100 YEARS"
+     - Exactly 6 cupcakes arranged in 2 rows of 3. Each cupcake has a different colored frosting: red, orange, yellow, green, blue, purple (in that order, left to right, top row first).
+     - A glass pitcher of lemonade, three-quarters full, with exactly 3 lemon slices floating in it and 2 ice cubes visible.
+     - A stack of exactly 8 golden plates.
+     - Exactly 5 colorful gift boxes stacked in a pyramid: 3 on the bottom row (red, blue, green from left to right), 2 on top (yellow, purple from left to right). Each gift box has a white ribbon bow on top.
+
+     THE CHARACTERS AROUND THE TABLE:
+     Seated around the table are exactly 6 characters, 3 on each long side facing each other:
+     - Left side (facing right), from left to right: A smiling girl with pigtails wearing a blue dress, a jolly round penguin wearing a red bowtie, and a tall giraffe whose long neck extends above the frame but whose smiling face peeks down from above.
+     - Right side (facing left), from left to right: A friendly robot with a square head and glowing green eyes, a grandmother in a floral apron giving a thumbs-up with her LEFT hand, and a golden retriever dog sitting upright on a chair wearing a cone-shaped party hat with blue and white stripes.
+
+     THE BUNTING AND DECORATIONS:
+     Strung between the cherry blossom tree and the pine tree is a triangular bunting banner with exactly 11 small triangular flags. The flags alternate in color: red, yellow, blue, red, yellow, blue, red, yellow, blue, red, yellow. Below the bunting, there are exactly 7 paper lanterns hanging at different heights. The lanterns are spherical and glow warmly in these colors from left to right: orange, pink, gold, white, lavender, mint green, coral.
+
+     THE ANIMALS IN THE SKY:
+     Flying around the balloons are exactly 4 birds and 2 butterflies. The birds are: 1 blue jay, 1 cardinal (red), 1 canary (yellow), and 1 hummingbird (iridescent green). The 2 butterflies are: one monarch (orange and black) and one morpho (brilliant blue). The blue jay and the cardinal are flying together near Balloon 2. The canary is perched on top of Balloon 4. The hummingbird hovers near the cherry blossom tree. The monarch butterfly is near the bunting. The morpho butterfly is near Balloon 5.
+
+     THE FLOATING MUSICAL NOTES:
+     Drifting upward from the violin player in Balloon 4, there are exactly 5 golden musical notes of different sizes, getting smaller as they rise higher. They follow a gentle curved path upward and to the right.
+
+     THE RAINBOW:
+     Behind everything, a complete semicircular rainbow arcs from the lower left to the lower right of the scene. It has the correct 7 color bands in order from outside to inside: red, orange, yellow, green, blue, indigo, violet.
+
+     LIGHTING AND ATMOSPHERE:
+     The scene is lit by warm, golden afternoon sunlight coming from the upper left. All shadows fall to the lower right. The overall mood is magical, celebratory, and full of wonder. There is a soft, warm glow around the floating island. The light catches the glass lemonade pitcher creating a small sparkle. The golden candle flame on the cake emits a tiny warm glow.
+
+     IMPORTANT DETAILS:
+     - The girl with pigtails has exactly 5 fingers visible on each hand.
91
+ - The robot has exactly 3 buttons on its chest: a red circle, a green square, and a blue triangle, arranged vertically.
92
+ - The grandmother's floral apron has exactly sunflowers on it, not roses or daisies.
93
+ - Every character at the table who has a mouth is smiling.
94
+ - The penguin's red bowtie has white polka dots on it.
95
+
96
+ monitor:
97
+ enabled: true
98
+ port: 8765
99
+ summary_model: "gpt-5"
100
+ summary_interval: 5
101
+
102
+ hil_enabled: true
103
+ hil_mode: "append"
benchmarks/image_gen/sky_festival/evaluator.py ADDED
@@ -0,0 +1,220 @@
+ """
+ Sky Festival evaluator — GPT-5 LLM-as-a-judge.
+
+ Scores VLM-generated images against a 100-point rubric using GPT-5 vision.
+ Returns combined_score normalized to [0, 1].
+
+ The framework passes the image path via a sidecar file:
+     <program_path>.image_path -> absolute path to the generated image
+
+ Requirements:
+     pip install openai
+     Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5)
+ """
+
+ import base64
+ import json
+ import logging
+ import os
+ import re
+ from typing import Dict, Union
+
+ logger = logging.getLogger(__name__)
+
+ JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5")
+
+ SYSTEM_PROMPT = """\
+ You are an extremely strict image evaluation judge. You score images against a precise rubric.
+ You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON.
+ Be harsh — most AI-generated images fail these criteria. Award points only when clearly met.
+ If you cannot verify a requirement (e.g., too small to see), award 0 for that item."""
+
+ RUBRIC_PROMPT = """\
+ Score this image against the following rubric for a "Floating Sky Festival" scene.
+ Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met.
+
+ ## Category 1: Cloud Counting and Shapes (15 pts)
+ - Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0)
+ - At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10)
+
+ ## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts)
+ - Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0)
+ - Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern)
+ - Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count)
+ - Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0)
+
+ ## Category 3: Floating Island and Trees (10 pts)
+ - Floating island visible suspended in air: 3 pts
+ - Exactly 4 different trees on the island: 4 pts (3 or 5 = 0)
+ - Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts
+
+ ## Category 4: Party Table Items — Counting and Arrangement (20 pts)
+ - 3-tier cake with candle present: 3 pts
+ - Cake text "100 YEARS" legible on middle tier: 3 pts
+ - Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts
+ - Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts
+ - Stack of exactly 8 golden plates: 3 pts
+ - Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts
+
+ ## Category 5: Characters — Count, Identity, and Details (15 pts)
+ - Exactly 6 characters seated at the table (3 per side): 5 pts
+ - Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends)
+ - Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail)
+
+ ## Category 6: Decorations and Flying Creatures (10 pts)
+ - Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts
+ - Exactly 7 paper lanterns in different colors: 3 pts
+ - Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures)
+
+ ## Category 7: Rainbow, Lighting, and Overall Composition (10 pts)
+ - Complete semicircular rainbow with 7 color bands in correct order: 4 pts
+ - Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts
+ - Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts
+
+ Respond with ONLY this JSON (no other text):
+ {
+   "cloud_shapes": <0-15>,
+   "balloons": <0-20>,
+   "floating_island": <0-10>,
+   "table_items": <0-20>,
+   "characters": <0-15>,
+   "decorations_creatures": <0-10>,
+   "rainbow_lighting": <0-10>,
+   "reasoning": "<brief 2-3 sentence explanation>"
+ }"""
+
+ # Category maximum scores for validation
+ CATEGORY_MAXES = {
+     "cloud_shapes": 15,
+     "balloons": 20,
+     "floating_island": 10,
+     "table_items": 20,
+     "characters": 15,
+     "decorations_creatures": 10,
+     "rainbow_lighting": 10,
+ }
+
+ _client = None
+
+
+ def _get_client():
+     global _client
+     if _client is None:
+         from openai import OpenAI
+         _client = OpenAI()
+     return _client
+
+
+ def _encode_image(image_path: str) -> str:
+     with open(image_path, "rb") as f:
+         return base64.b64encode(f.read()).decode("utf-8")
+
+
+ def _judge_image(image_path: str) -> Dict[str, Union[float, str]]:
+     """Call GPT-5 to score the image. Retries once on failure."""
+     client = _get_client()
+     b64 = _encode_image(image_path)
+
+     ext = os.path.splitext(image_path)[1].lstrip(".").lower()
+     mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png")
+     data_url = f"data:{mime};base64,{b64}"
+
+     messages = [
+         {"role": "system", "content": SYSTEM_PROMPT},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}},
+                 {"type": "text", "text": RUBRIC_PROMPT},
+             ],
+         },
+     ]
+
+     last_error = None
+     for attempt in range(2):
+         try:
+             response = client.chat.completions.create(
+                 model=JUDGE_MODEL,
+                 messages=messages,
+                 max_completion_tokens=16384,
+             )
+             content = response.choices[0].message.content or ""
+             raw = content.strip()
+             logger.info(f"Judge raw response (first 300 chars): {raw[:300]}")
+
+             # Extract JSON from markdown code block if present
+             if "```" in raw:
+                 m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
+                 if m:
+                     raw = m.group(1).strip()
+
+             # Find JSON object in response
+             start = raw.find("{")
+             end = raw.rfind("}") + 1
+             if start >= 0 and end > start:
+                 raw = raw[start:end]
+
+             result = json.loads(raw)
+
+             # Validate and clamp scores
+             scores = {}
+             for cat, max_val in CATEGORY_MAXES.items():
+                 val = result.get(cat, 0)
+                 if not isinstance(val, (int, float)):
+                     val = 0
+                 scores[cat] = max(0, min(max_val, float(val)))
+
+             scores["reasoning"] = str(result.get("reasoning", ""))
+             return scores
+
+         except Exception as e:
+             last_error = e
+             logger.warning(f"Judge attempt {attempt + 1} failed: {e}")
+
+     logger.error(f"GPT-5 judge failed after retries: {last_error}")
+     return {cat: 0.0 for cat in CATEGORY_MAXES}
+
+
+ def evaluate(program_path: str) -> Dict[str, Union[float, str]]:
+     """Score a VLM-generated image using GPT-5 as judge.
+
+     Args:
+         program_path: Path to the text file (VLM reasoning).
+             A sidecar file ``<program_path>.image_path`` contains the
+             absolute path to the generated image.
+
+     Returns:
+         Dictionary with combined_score (0-1), per-category scores, and image_path.
+     """
+     # Read image path from sidecar
+     sidecar = program_path + ".image_path"
+     image_path = None
+     if os.path.exists(sidecar):
+         with open(sidecar) as f:
+             image_path = f.read().strip()
+
+     if not image_path or not os.path.exists(image_path):
+         logger.warning("No image found for scoring")
+         return {"combined_score": 0.0, "error": "No image to score"}
+
+     # Score with GPT-5
+     scores = _judge_image(image_path)
+
+     # Compute total out of 100, normalize to 0-1
+     total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES)
+     combined = round(total / 100.0, 4)
+
+     result = {"combined_score": combined, "image_path": image_path}
+
+     # Add per-category scores (normalized to 0-1 for each category)
+     for cat, max_val in CATEGORY_MAXES.items():
+         result[cat] = round(scores.get(cat, 0) / max_val, 4)
+
+     # Also store raw scores
+     result["raw_total"] = round(total, 1)
+
+     reasoning = scores.get("reasoning", "")
+     if reasoning:
+         result["judge_reasoning"] = reasoning
+
+     return result
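
A minimal harness-side sketch of the sidecar protocol described in the module docstring; the file names and reasoning text here are hypothetical, not part of the benchmark:

    # Hypothetical driver: write the sidecar next to the VLM's reasoning file,
    # then call evaluate(). Assumes the evaluator.py above is importable.
    import os, tempfile
    from evaluator import evaluate

    reasoning_path = os.path.join(tempfile.mkdtemp(), "program.txt")
    with open(reasoning_path, "w") as f:
        f.write("Raised cloud count to 9 and fixed the banner text.")
    with open(reasoning_path + ".image_path", "w") as f:
        f.write("/abs/path/to/generated.png")  # sidecar -> generated image

    # Returns {"combined_score": 0.0, "error": "No image to score"} if the PNG is absent.
    print(evaluate(reasoning_path))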
benchmarks/math/circle_packing_rect/evaluator/evaluator.py ADDED
@@ -0,0 +1,119 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the circle packing problem on a rectangle
+ # of perimeter 4.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import time
+ import numpy as np
+ import sys
+ import os
+ from importlib import __import__
+
+ BENCHMARK = 2.3658321334167627
+ NUM_CIRCLES = 21
+ TOL = 1e-6
+
+
+ def minimum_circumscribing_rectangle(circles: np.ndarray):
+     """Returns the width and height of the minimum circumscribing rectangle.
+
+     Args:
+         circles: A numpy array of shape (num_circles, 3), where each row is of the
+             form (x, y, radius), specifying a circle.
+
+     Returns:
+         A tuple (width, height) of the minimum circumscribing rectangle.
+     """
+     min_x = np.min(circles[:, 0] - circles[:, 2])
+     max_x = np.max(circles[:, 0] + circles[:, 2])
+     min_y = np.min(circles[:, 1] - circles[:, 2])
+     max_y = np.max(circles[:, 1] + circles[:, 2])
+     return max_x - min_x, max_y - min_y
+
+
+ def validate_packing_radii(radii: np.ndarray) -> None:
+     n = len(radii)
+     for i in range(n):
+         if radii[i] < 0:
+             raise ValueError(f"Circle {i} has negative radius {radii[i]}")
+         elif np.isnan(radii[i]):
+             raise ValueError(f"Circle {i} has nan radius")
+
+
+ def validate_packing_overlap_wtol(circles: np.ndarray, tol: float = 1e-6) -> None:
+     n = len(circles)
+     for i in range(n):
+         for j in range(i + 1, n):
+             dist = np.sqrt(np.sum((circles[i, :2] - circles[j, :2]) ** 2))
+             if dist < circles[i, 2] + circles[j, 2] - tol:
+                 raise ValueError(
+                     f"Circles {i} and {j} overlap: dist={dist}, r1+r2={circles[i,2]+circles[j,2]}"
+                 )
+
+
+ def validate_packing_inside_rect_wtol(circles: np.ndarray, tol: float = 1e-6) -> None:
+     width, height = minimum_circumscribing_rectangle(circles)
+     if width + height > (2 + tol):
+         raise ValueError("Circles are not contained inside a rectangle of perimeter 4.")
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         circles = None
+         eval_time = 0
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+
+             start_time = time.time()
+             circles = program.circle_packing21()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         if not isinstance(circles, np.ndarray):
+             circles = np.array(circles)
+
+         if circles.shape != (NUM_CIRCLES, 3):
+             raise ValueError(
+                 f"Invalid shapes: circles = {circles.shape}, expected {(NUM_CIRCLES,3)}"
+             )
+
+         validate_packing_radii(circles[:, -1])
+         validate_packing_overlap_wtol(circles, TOL)
+         validate_packing_inside_rect_wtol(circles, TOL)
+
+         radii_sum = np.sum(circles[:, -1])
+
+         return {
+             "radii_sum": float(radii_sum),
+             "combined_score": float(radii_sum / BENCHMARK),
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
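
For orientation, a sketch of a valid (deliberately weak) candidate that passes all three validators; the grid construction is illustrative, not part of the benchmark. A 5x5 grid of radius-0.1 circles fits in the unit square (perimeter 4), and keeping the first 21 gives radii_sum = 2.1, i.e. combined_score ≈ 2.1 / 2.3658 ≈ 0.89:

    import numpy as np

    def circle_packing21() -> np.ndarray:
        # Centers at 0.1, 0.3, 0.5, 0.7, 0.9: adjacent circles touch exactly,
        # which the overlap check allows within its 1e-6 tolerance.
        centers = [(0.1 + 0.2 * i, 0.1 + 0.2 * j) for j in range(5) for i in range(5)]
        return np.array([[x, y, 0.1] for x, y in centers[:21]])  # (21, 3) rows of (x, y, r)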
benchmarks/math/erdos_min_overlap/config.yaml ADDED
@@ -0,0 +1,41 @@
+ # Math benchmark: erdos_min_overlap
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+ prompt:
+   system_message: |
+     SETTING:
+     You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery.
+     Your task is to evolve and optimize a Python script to find a better **upper bound** for the Erdős minimum overlap problem constant C5.
+
+     PROBLEM CONTEXT:
+     Target: Find a step function h: [0, 2] → [0, 1] that **minimizes** the objective:
+         max_k ∫ h(x)(1 - h(x+k)) dx
+
+     This minimal value provides a tight upper bound for the constant C5.
+
+     Current best known upper bound: C5 ≤ 0.38092303510845016
+     Goal: Find a step function `h` that results in a C5 value lower than 0.38092303510845016.
+
+     CONSTRAINTS:
+     1. The function `h` must have values in the range [0, 1].
+     2. The integral of h(x) over [0, 2] must be exactly 1.
+
+     PERFORMANCE METRICS:
+     - c5_bound: The bound found by the program.
+     - combined_score: 0.38092303510845016 / c5_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record).
+     - n_points: number of points used in the discretization.
+     - eval_time: evaluation time of the program.
+ evaluator:
+   timeout: 600
+   max_retries: 3
benchmarks/math/erdos_min_overlap/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/erdos_min_overlap/evaluator/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ numpy
+ jax
+ optax
benchmarks/math/erdos_min_overlap/initial_program.py ADDED
@@ -0,0 +1,96 @@
+ # EVOLVE-BLOCK-START
+ import jax
+ import jax.numpy as jnp
+ import optax
+ import numpy as np
+ from dataclasses import dataclass
+ import tqdm
+
+
+ @dataclass
+ class Hyperparameters:
+     num_intervals: int = 200
+     learning_rate: float = 0.005
+     num_steps: int = 20000
+     penalty_strength: float = 1000000.0
+
+
+ class ErdosOptimizer:
+     """
+     Finds a step function h that minimizes the maximum overlap integral.
+     """
+
+     def __init__(self, hypers: Hyperparameters):
+         self.hypers = hypers
+         self.domain_width = 2.0
+         self.dx = self.domain_width / self.hypers.num_intervals
+
+     def _objective_fn(self, latent_h_values: jnp.ndarray) -> jnp.ndarray:
+         """
+         The loss function includes the objective and a penalty for the constraint.
+         """
+         # Enforce h(x) in [0, 1] via sigmoid (hard constraint)
+         h = jax.nn.sigmoid(latent_h_values)
+
+         # Calculate the primary objective (max correlation)
+         j = 1.0 - h
+         N = self.hypers.num_intervals
+         h_padded = jnp.pad(h, (0, N))
+         j_padded = jnp.pad(j, (0, N))
+         corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded))
+         correlation = jnp.fft.ifft(corr_fft).real
+         scaled_correlation = correlation * self.dx
+         objective_loss = jnp.max(scaled_correlation)
+
+         # Calculate the penalty for the integral constraint
+         integral_h = jnp.sum(h) * self.dx
+         constraint_loss = (integral_h - 1.0) ** 2
+
+         # Combine the objective with the penalty
+         total_loss = objective_loss + self.hypers.penalty_strength * constraint_loss
+         return total_loss
+
+     def run_optimization(self):
+         optimizer = optax.adam(self.hypers.learning_rate)
+
+         key = jax.random.PRNGKey(42)
+         latent_h_values = jax.random.normal(key, (self.hypers.num_intervals,))
+
+         opt_state = optimizer.init(latent_h_values)
+
+         @jax.jit
+         def train_step(latent_h_values, opt_state):
+             loss, grads = jax.value_and_grad(self._objective_fn)(latent_h_values)
+             updates, opt_state = optimizer.update(grads, opt_state)
+             latent_h_values = optax.apply_updates(latent_h_values, updates)
+             return latent_h_values, opt_state, loss
+
+         print(f"Optimizing a step function with {self.hypers.num_intervals} intervals...")
+         for step in tqdm.tqdm(range(self.hypers.num_steps), desc="Optimizing"):
+             latent_h_values, opt_state, loss = train_step(latent_h_values, opt_state)
+
+         # Final h is just the sigmoid of the latent values
+         final_h = jax.nn.sigmoid(latent_h_values)
+
+         # Re-calculate final objective loss without the penalty for the report
+         j = 1.0 - final_h
+         N = self.hypers.num_intervals
+         h_padded = jnp.pad(final_h, (0, N))
+         j_padded = jnp.pad(j, (0, N))
+         corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded))
+         correlation = jnp.fft.ifft(corr_fft).real
+         c5_bound = jnp.max(correlation * self.dx)
+
+         print(f"Optimization complete. Final C5 upper bound: {c5_bound:.8f}")
+         return np.array(final_h), float(c5_bound)
+
+
+ def run():
+     hypers = Hyperparameters()
+     optimizer = ErdosOptimizer(hypers)
+     final_h_values, c5_bound = optimizer.run_optimization()
+
+     return final_h_values, c5_bound, hypers.num_intervals
+
+
+ # EVOLVE-BLOCK-END
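
A sanity-check sketch (mine, not part of the program) for the zero-padded FFT trick above: the correlation it returns should agree with a direct O(N^2) evaluation of the discretized overlap sum at every shift s:

    import numpy as np

    N, dx = 64, 2.0 / 64
    h = np.random.default_rng(0).uniform(0.0, 1.0, N)
    h_pad, j_pad = np.pad(h, (0, N)), np.pad(1.0 - h, (0, N))
    corr_fft = np.fft.ifft(np.fft.fft(h_pad) * np.conj(np.fft.fft(j_pad))).real * dx

    # Direct sum: corr[s] = dx * sum_m h_pad[m] * j_pad[(m - s) mod 2N]
    M = 2 * N
    corr_direct = np.array(
        [dx * sum(h_pad[m] * j_pad[(m - s) % M] for m in range(M)) for s in range(M)]
    )
    assert np.allclose(corr_fft, corr_direct)
    print("max overlap:", corr_fft.max())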
benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
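
To make the contract concrete, a small self-contained demo of the bridge; the program path is a dummy and the metric values are illustrative:

    import sys
    from wrapper import run

    def fake_evaluate(program_path):
        return {"combined_score": 0.9, "eval_time": 1.2, "notes": "converged"}

    sys.argv = ["evaluator.py", "/tmp/program.py"]
    run(fake_evaluate)
    # stdout (one JSON line): {"status": "success", "combined_score": 0.9,
    #   "metrics": {"combined_score": 0.9, "eval_time": 1.2},
    #   "artifacts": {"notes": "converged"}}

Note that values of unsupported types (e.g. None) are silently dropped from the output.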
benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/matmul/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/matmul/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/matmul/evaluator/evaluator.py ADDED
@@ -0,0 +1,115 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the matrix multiplication problem with tensor size
+ # of <2,4,5>
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import sys
+ import os
+ from importlib import __import__
+ import time
+ import numpy as np
+
+ BENCHMARK = 32
+
+
+ def verify_tensor_decomposition(
+     decomposition: tuple[np.ndarray, np.ndarray, np.ndarray], n: int, m: int, p: int, rank: int
+ ):
+     """Verifies the correctness of the tensor decomposition."""
+
+     # Add robustness for cases where the optimizer might fail
+     if not all(isinstance(arr, np.ndarray) for arr in decomposition) or not decomposition:
+         raise ValueError("Decomposition must be a tuple of NumPy arrays.")
+     if any(arr.size == 0 for arr in decomposition):
+         print("Warning: One or more decomposition arrays are empty. Verification skipped.")
+         return
+
+     # Check that each factor matrix has the correct shape.
+     factor_matrix_1, factor_matrix_2, factor_matrix_3 = decomposition
+     if factor_matrix_1.shape != (n * m, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 1 is {(n * m, rank)}. Actual shape is {factor_matrix_1.shape}."
+         )
+     if factor_matrix_2.shape != (m * p, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 2 is {(m * p, rank)}. Actual shape is {factor_matrix_2.shape}."
+         )
+     if factor_matrix_3.shape != (n * p, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 3 is {(n * p, rank)}. Actual shape is {factor_matrix_3.shape}."
+         )
+
+     # Form the matrix multiplication tensor <n, m, p>.
+     matmul_tensor = np.zeros((n * m, m * p, n * p), dtype=np.float32)
+     for i in range(n):
+         for j in range(m):
+             for k in range(p):
+                 # Use the standard k*n+i indexing for the third dimension
+                 matmul_tensor[i * m + j, j * p + k, k * n + i] = 1
+
+     # Check that the tensor is correctly constructed.
+     constructed_tensor = np.einsum("ir,jr,kr -> ijk", *decomposition)
+
+     # Exact check
+     if not np.array_equal(constructed_tensor, matmul_tensor):
+         # If the exact check fails, report the floating-point difference for diagnostics.
+         diff = np.max(np.abs(constructed_tensor - matmul_tensor))
+         raise ValueError(
+             f"Tensor constructed by decomposition does not exactly match the target tensor. Maximum difference is {diff:.6e}."
+         )
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+             start_time = time.time()
+             decomposition, n, m, p, loss, rank = program.run()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         verify_tensor_decomposition(decomposition, n, m, p, rank)
+
+         success_threshold = 1e-6
+         if loss > success_threshold:
+             print(
+                 f"\nWarning: Final loss {loss:.2e} is above the success threshold of {success_threshold:.2e}."
+             )
+
+         inverse_rank = BENCHMARK / rank
+
+         return {
+             "combined_score": inverse_rank,
+             "loss": loss,
+             "rank": rank,
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
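
As a reference point (my sketch, not part of the evaluator): the naive algorithm gives an exact rank n*m*p = 40 decomposition of <2,4,5>, so with BENCHMARK = 32 it scores 32/40 = 0.8, and any verified rank below 40 is a genuine improvement:

    import numpy as np

    n, m, p = 2, 4, 5
    rank = n * m * p
    U, V, W = np.zeros((n * m, rank)), np.zeros((m * p, rank)), np.zeros((n * p, rank))
    for r, (i, j, k) in enumerate(np.ndindex(n, m, p)):
        # One rank-1 term per scalar product a[i,j] * b[j,k] contributing to c[i,k].
        U[i * m + j, r] = V[j * p + k, r] = W[k * n + i, r] = 1.0

    T = np.zeros((n * m, m * p, n * p))
    for i, j, k in np.ndindex(n, m, p):
        T[i * m + j, j * p + k, k * n + i] = 1.0
    assert np.array_equal(np.einsum("ir,jr,kr->ijk", U, V, W), T)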
benchmarks/math/matmul/evaluator/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ numpy
+ jax
+ optax
benchmarks/math/matmul/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/math/matmul/initial_program.py ADDED
@@ -0,0 +1,199 @@
+ # Disable progress bar for cleaner output logs
+ import os
+
+ os.environ["TQDM_DISABLE"] = "1"
+
+ # Fixed parameters
+ n, m, p = 2, 4, 5
+
+ # EVOLVE-BLOCK-START
+ import numpy as np
+ import jax
+ import jax.numpy as jnp
+ import optax
+ from dataclasses import dataclass
+ import tqdm
+
+
+ # --- Straight-Through Estimator for Rounding ---
+ @jax.custom_vjp
+ def round_to_half_ste(x):
+     """Forward pass: snaps values to the nearest half-integer."""
+     return jnp.round(x * 2) / 2
+
+
+ def round_ste_fwd(x):
+     """Standard forward pass and identity for backward pass."""
+     return round_to_half_ste(x), None
+
+
+ def round_ste_bwd(res, g):
+     """Backward pass: Identity function, passes gradient straight through."""
+     return (g,)
+
+
+ round_to_half_ste.defvjp(round_ste_fwd, round_ste_bwd)
+ # --- End of STE definition ---
+
+
+ # --- Loss Functions ---
+ def weighted_l2_loss(reconstructed: jnp.ndarray, target: jnp.ndarray) -> jnp.ndarray:
+     error = reconstructed - target
+     weights = jnp.where(target != 0, 100.0, 1.0)
+     return jnp.mean(weights * (error**2))
+
+
+ def l2_loss_real(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
+     return jnp.mean((x - y) ** 2)
+
+
+ # --- Hyperparameters ---
+ @dataclass
+ class Hyperparameters:
+     rank: int = 55
+     # Phase 1: Continuous Search
+     num_restarts: int = 10
+     phase1_steps: int = 80000
+     phase1_lr: float = 0.01
+     init_scale: float = 0.1
+     l1_strength: float = 1e-6
+     clamp_range: float = 4.0
+     # Phase 2: Discrete Fine-tuning
+     phase2_steps: int = 20000
+     phase2_lr: float = 1e-4  # A much smaller learning rate for fine-tuning
+
+
+ # --- Optimizer Classes ---
+ class ContinuousOptimizer:
+     """Finds a high-quality approximate continuous solution."""
+
+     def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters):
+         self.target_tensor = target_tensor
+         self.hypers = hypers
+         self.opt = optax.adam(hypers.phase1_lr)
+
+     def _get_constrained_decomposition(self, latent_decomposition: tuple) -> tuple:
+         """Applies a scaled tanh to map latent parameters to the desired range."""
+         return jax.tree_util.tree_map(
+             lambda x: self.hypers.clamp_range * jnp.tanh(x), latent_decomposition
+         )
+
+     def _loss_fn(self, latent_decomposition: tuple) -> jnp.ndarray:
+         constrained = self._get_constrained_decomposition(latent_decomposition)
+         reconstructed = jnp.einsum("ir,jr,kr->ijk", *constrained)
+         recon_loss = weighted_l2_loss(reconstructed, self.target_tensor)
+         l1_penalty = sum(jnp.mean(jnp.abs(arr)) for arr in constrained)
+         return recon_loss + self.hypers.l1_strength * l1_penalty
+
+
+ class DiscreteOptimizer:
+     """Refines a continuous solution into an exact discrete one using an STE."""
+
+     def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters):
+         self.target_tensor = target_tensor
+         self.hypers = hypers
+         self.opt = optax.adam(hypers.phase2_lr)
+
+     def _loss_fn(self, continuous_decomposition: tuple) -> jnp.ndarray:
+         # Snap the continuous parameters to the discrete grid
+         discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_decomposition)
+         # Compute the loss using only these exact half-integer values
+         reconstructed = jnp.einsum("ir,jr,kr->ijk", *discrete_decomposition)
+         return l2_loss_real(reconstructed, self.target_tensor)
+
+
+ # --- JIT-compatible Train Step ---
+ def train_step(params, opt_state, optimizer, loss_fn):
+     loss, grads = jax.value_and_grad(loss_fn)(params)
+     updates, opt_state = optimizer.update(grads, opt_state, params)
+     params = optax.apply_updates(params, updates)
+     return params, opt_state, loss
+
+
+ def get_matrix_multiplication_tensor(n, m, p):
+     T = jnp.zeros((n * m, m * p, n * p))
+     for i, j, k in np.ndindex(n, m, p):
+         T = T.at[i * m + j, j * p + k, k * n + i].set(1)
+     return T
+
+
+ def run():
+     hypers = Hyperparameters()
+     target_tensor = get_matrix_multiplication_tensor(n, m, p)
+     main_key = jax.random.PRNGKey(42)
+
+     # --- PHASE 1: CONTINUOUS EXPLORATION ---
+     print(f"\n{'='*20} PHASE 1: Continuous Exploration {'='*20}")
+     best_loss_phase1 = float("inf")
+     best_latent_decomp = None
+
+     continuous_optimizer = ContinuousOptimizer(target_tensor, hypers)
+
+     # JIT the train_step for the continuous phase
+     jit_train_step_continuous = jax.jit(train_step, static_argnums=(2, 3))
+
+     for i in range(hypers.num_restarts):
+         print(f"\n--- Restart {i+1}/{hypers.num_restarts} ---")
+         main_key, restart_key = jax.random.split(main_key)
+         init_fn = jax.nn.initializers.normal(stddev=hypers.init_scale)
+         latent_decomp = (
+             init_fn(restart_key, (n * m, hypers.rank)),
+             init_fn(restart_key, (m * p, hypers.rank)),
+             init_fn(restart_key, (n * p, hypers.rank)),
+         )
+         opt_state = continuous_optimizer.opt.init(latent_decomp)
+
+         for _ in tqdm.tqdm(range(hypers.phase1_steps), desc="Continuous Search"):
+             latent_decomp, opt_state, loss = jit_train_step_continuous(
+                 latent_decomp,
+                 opt_state,
+                 continuous_optimizer.opt,
+                 continuous_optimizer._loss_fn,
+             )
+
+         final_loss = l2_loss_real(
+             target_tensor,
+             jnp.einsum(
+                 "ir,jr,kr->ijk",
+                 *continuous_optimizer._get_constrained_decomposition(latent_decomp),
+             ),
+         )
+         print(f"End of Trial | Final continuous loss: {final_loss:.8f}")
+
+         if final_loss < best_loss_phase1:
+             best_loss_phase1 = final_loss
+             best_latent_decomp = latent_decomp
+
+     # --- PHASE 2: DISCRETE FINE-TUNING ---
+     print(f"\n{'='*20} PHASE 2: Discrete Fine-tuning (STE) {'='*20}")
+     print(f"Starting with best continuous solution (loss: {best_loss_phase1:.8f})")
+
+     continuous_params = continuous_optimizer._get_constrained_decomposition(best_latent_decomp)
+
+     discrete_optimizer = DiscreteOptimizer(target_tensor, hypers)
+     opt_state = discrete_optimizer.opt.init(continuous_params)
+
+     # JIT the train_step for the discrete phase
+     jit_train_step_discrete = jax.jit(train_step, static_argnums=(2, 3))
+
+     for step in tqdm.tqdm(range(hypers.phase2_steps), desc="Discrete Fine-tuning"):
+         continuous_params, opt_state, loss = jit_train_step_discrete(
+             continuous_params, opt_state, discrete_optimizer.opt, discrete_optimizer._loss_fn
+         )
+         if (step + 1) % 2000 == 0:
+             print(f"Step {step+1} | Discrete Loss: {loss:.8f}")
+         if loss < 1e-7:
+             print("\nFound a perfect solution!")
+             break
+
+     final_discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_params)
+     final_loss = l2_loss_real(
+         target_tensor, jnp.einsum("ir,jr,kr->ijk", *final_discrete_decomposition)
+     )
+     print(f"Search complete. Final discrete loss: {final_loss:.8f}")
+
+     final_decomposition_np = jax.tree_util.tree_map(np.array, final_discrete_decomposition)
+     return final_decomposition_np, n, m, p, float(final_loss), hypers.rank
+
+
+ # EVOLVE-BLOCK-END
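
A quick illustrative check of the STE behavior, assuming the definitions above are in scope: the forward pass snaps to half-integers while the gradient is passed through unchanged:

    import jax
    import jax.numpy as jnp

    x = jnp.array([0.2, 0.6, 1.3])
    print(round_to_half_ste(x))                                # [0.  0.5 1.5]
    print(jax.grad(lambda v: round_to_half_ste(v).sum())(x))   # [1. 1. 1.]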
benchmarks/math/minimizing_max_min_dist/2/config.yaml ADDED
@@ -0,0 +1,29 @@
+ # Math benchmark: minimizing_max_min_dist/2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+ prompt:
+   system_message: |
+     SETTING:
+     You are an expert computational geometer and optimization specialist focusing on point dispersion problems.
+     Your task is to evolve a constructor function that generates an optimal arrangement of exactly 16 points in 2D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.
+
+     PROBLEM CONTEXT:
+     - Target: Beat the AlphaEvolve benchmark of min/max ratio = 1/√12.889266112 ≈ 0.2786
+     - Constraint: Points must be placed in 2D Euclidean space (typically normalized to the unit square [0,1] × [0,1])
+     - Mathematical formulation: For points Pi = (xi, yi), i = 1,...,16:
+       * Distance matrix: dij = √[(xi-xj)² + (yi-yj)²] for all i≠j
+       * Minimum distance: dmin = min{dij : i≠j}
+       * Maximum distance: dmax = max{dij : i≠j}
+       * Objective: maximize dmin/dmax subject to spatial constraints
+
+     PERFORMANCE METRICS:
+     1. **min_max_ratio**: (dmin/dmax)² as reported by the evaluator (PRIMARY OBJECTIVE - maximize)
+     2. **combined_score**: min_max_ratio / (1/12.889266112) (a value > 1 means the AlphaEvolve benchmark is beaten)
+     3. **eval_time**: Execution time in seconds (balance accuracy vs. efficiency)
+
+     TECHNICAL REQUIREMENTS:
+     - **Reproducibility**: Fixed random seeds for all stochastic components
+ evaluator:
+   timeout: 360
+   max_retries: 3
benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py ADDED
@@ -0,0 +1,78 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the problem of minimizing the ratio of maximum
+ # to minimum distance in dimension 2 with 16 points.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import sys
+ import os
+ from importlib import __import__
+ import time
+ import numpy as np
+ from scipy.spatial.distance import pdist
+
+ NUM_POINTS = 16
+ DIMENSION = 2
+ BENCHMARK = 1 / 12.889266112
+
+ # Scoring: (dmin/dmax)^2.
+ # Key reformulation: maximize auxiliary variable t
+ # subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j).
+ # This is a constrained NLP with O(n^2) pairwise inequality constraints.
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+             start_time = time.time()
+             points = program.min_max_dist_dim2_16()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         if not isinstance(points, np.ndarray):
+             points = np.array(points)
+
+         if points.shape != (NUM_POINTS, DIMENSION):
+             raise ValueError(
+                 f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}"
+             )
+
+         pairwise_distances = pdist(points)
+         min_distance = np.min(pairwise_distances)
+         max_distance = np.max(pairwise_distances)
+
+         ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0
+         return {
+             "min_max_ratio": float(ratio_squared),
+             "combined_score": float(ratio_squared / BENCHMARK),
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ numpy
+ scipy
benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/math/minimizing_max_min_dist/2/initial_program.py ADDED
@@ -0,0 +1,24 @@
+ # EVOLVE-BLOCK-START
+ import numpy as np
+
+
+ def min_max_dist_dim2_16() -> np.ndarray:
+     """
+     Creates 16 points in 2 dimensions in order to maximize the ratio of minimum to maximum distance.
+
+     Returns
+         points: np.ndarray of shape (16,2) containing the (x,y) coordinates of the 16 points.
+     """
+
+     n = 16
+     d = 2
+
+     # Place points randomly
+     np.random.seed(42)
+     points = np.random.randn(n, d)
+
+     return points
+
+
+ # EVOLVE-BLOCK-END
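
A stronger hand-written baseline for comparison (my sketch, not part of the benchmark): a 4x4 unit grid gives dmin/dmax = 1/(3*sqrt(2)), so the evaluator reports min_max_ratio = 1/18 ≈ 0.0556 and combined_score ≈ 0.0556 × 12.889 ≈ 0.72, still short of the benchmark but well above the random initializer above:

    import numpy as np

    def grid_16() -> np.ndarray:
        # 16 points at integer coordinates (0..3, 0..3): dmin = 1, dmax = 3*sqrt(2).
        xs, ys = np.meshgrid(np.arange(4.0), np.arange(4.0))
        return np.stack([xs.ravel(), ys.ravel()], axis=1)  # shape (16, 2)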