red1bluelost
committed on
Commit
·
6bd9122
1
Parent(s):
9997d13
Adds initial evaluation of just runtime completion tests.
Browse files- README.md +0 -2
- evaluate_genericify_cpp.py +150 -33
- execute.py +130 -0
README.md
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
---
|
2 |
title: evaluate_genericify_cpp
|
3 |
-
datasets:
|
4 |
-
-
|
5 |
tags:
|
6 |
- evaluate
|
7 |
- metric
|
|
|
1 |
---
|
2 |
title: evaluate_genericify_cpp
|
|
|
|
|
3 |
tags:
|
4 |
- evaluate
|
5 |
- metric
|
evaluate_genericify_cpp.py
CHANGED
@@ -1,21 +1,15 @@
|
|
1 |
-
#
|
2 |
-
#
|
3 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
-
# you may not use this file except in compliance with the License.
|
5 |
-
# You may obtain a copy of the License at
|
6 |
-
#
|
7 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
-
#
|
9 |
-
# Unless required by applicable law or agreed to in writing, software
|
10 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
-
# See the License for the specific language governing permissions and
|
13 |
-
# limitations under the License.
|
14 |
"""TODO: Add a description here."""
|
|
|
|
|
15 |
|
16 |
-
import
|
17 |
import datasets
|
|
|
|
|
|
|
18 |
|
|
|
19 |
|
20 |
# TODO: Add BibTeX citation
|
21 |
_CITATION = """\
|
@@ -53,43 +47,166 @@ Examples:
|
|
53 |
{'accuracy': 1.0}
|
54 |
"""
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
# TODO: Define external resources urls if needed
|
57 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
58 |
|
59 |
|
60 |
-
@evaluate.utils.file_utils.add_start_docstrings(
|
61 |
-
|
|
|
|
|
62 |
"""TODO: Short description of my evaluation module."""
|
63 |
|
64 |
def _info(self):
|
65 |
# TODO: Specifies the evaluate.EvaluationModuleInfo object
|
66 |
return evaluate.MetricInfo(
|
67 |
# This is the description that will appear on the modules page.
|
68 |
-
module_type="metric",
|
69 |
description=_DESCRIPTION,
|
70 |
citation=_CITATION,
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
-
features=datasets.Features(
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
# Homepage of the module for documentation
|
78 |
homepage="http://module.homepage",
|
79 |
# Additional links to the codebase or references
|
80 |
codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
|
81 |
-
reference_urls=["http://path.to.reference.url/new_module"]
|
82 |
)
|
83 |
|
84 |
-
def
|
85 |
-
"""Optional: download external resources useful to compute the scores"""
|
86 |
-
# TODO: Download external resources if needed
|
87 |
-
pass
|
88 |
-
|
89 |
-
def _compute(self, predictions, references):
|
90 |
"""Returns the scores"""
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Heavily adapted from `Muennighoff/code_eval_octopack`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
"""TODO: Add a description here."""
|
3 |
+
import collections
|
4 |
+
import os
|
5 |
|
6 |
+
import concurrent.futures
|
7 |
import datasets
|
8 |
+
import evaluate
|
9 |
+
import itertools
|
10 |
+
import numpy as np
|
11 |
|
12 |
+
from .execute import check_correctness
|
13 |
|
14 |
# TODO: Add BibTeX citation
|
15 |
_CITATION = """\
|
|
|
47 |
{'accuracy': 1.0}
|
48 |
"""
|
49 |
|
50 |
+
_WARNING = """
|
51 |
+
################################################################################
|
52 |
+
!!!WARNING!!!
|
53 |
+
################################################################################
|
54 |
+
The "code_eval" metric executes untrusted model-generated code in Python.
|
55 |
+
Although it is highly unlikely that model-generated code will do something
|
56 |
+
overtly malicious in response to this test suite, model-generated code may act
|
57 |
+
destructively due to a lack of model capability or alignment.
|
58 |
+
Users are strongly encouraged to sandbox this evaluation suite so that it
|
59 |
+
does not perform destructive actions on their host or network. For more
|
60 |
+
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
|
61 |
+
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
|
62 |
+
Once you have read this disclaimer and taken appropriate precautions,
|
63 |
+
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
|
64 |
+
with:
|
65 |
+
>>> import os
|
66 |
+
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
|
67 |
+
################################################################################\
|
68 |
+
"""
|
69 |
+
|
70 |
+
_CLANG_WARNING = """
|
71 |
+
Please provide the environment variable 'GENERICIFY_CLANG' with the path of the
|
72 |
+
clang++ compiler. Version 15+ is required. Within Python you can to this
|
73 |
+
with:
|
74 |
+
>>> import os
|
75 |
+
>>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"
|
76 |
+
"""
|
77 |
+
|
78 |
# TODO: Define external resources urls if needed
|
79 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
80 |
|
81 |
|
82 |
+
@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION
)
class EvaluateGenericifyCpp(evaluate.Metric):
    """Pass@k metric over C++ candidates supplied in three flavours.

    Every candidate carries a ``base``, a ``sfinae`` and a ``concepts`` source
    string. Each candidate is handed to ``check_correctness`` together with its
    reference, and the three flavours are scored independently.
    """

    def _info(self):
        """Return the metric description and the schema of its inputs."""
        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(
                        datasets.Features(
                            {
                                "base": datasets.Value("string"),
                                "sfinae": datasets.Value("string"),
                                "concepts": datasets.Value("string"),
                            }
                        )
                    ),
                    "references": datasets.Features(
                        {
                            "tests": datasets.Value("string"),
                            "invalids": datasets.Value("string"),
                        }
                    ),
                }
            ),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(
        self, *, predictions, references, k=(1, 10, 100), num_workers=4
    ):
        """Run every candidate and return pass@k scores plus the raw results.

        Args:
            predictions: One iterable of candidates per task; each candidate is
                indexed by ``check_correctness`` with the keys ``"base"``,
                ``"sfinae"`` and ``"concepts"``.
            references: One reference per task, paired with ``predictions`` by
                position.
            k: The ``k`` values to report. A value is only reported for a
                flavour when every task has at least ``k`` candidates.
            num_workers: Size of the thread pool that dispatches the runs.

        Returns:
            A ``(pass_at_k, results)`` tuple. ``pass_at_k`` maps names such as
            ``"base_run_passed@1"`` to the mean estimate over all tasks.
            ``results`` maps each task index to its list of
            ``(completion_id, result_dict)`` pairs, ordered by completion id.

        Raises:
            ValueError: If ``HF_ALLOW_CODE_EVAL`` is not ``"1"`` or
                ``GENERICIFY_CLANG`` is unset or empty.
            NotImplementedError: When running on Windows.
        """
        if os.getenv("HF_ALLOW_CODE_EVAL", default=0) != "1":
            raise ValueError(_WARNING)

        # An empty value is as unusable as a missing one, so reject both.
        if not os.getenv("GENERICIFY_CLANG"):
            raise ValueError(_CLANG_WARNING)

        if os.name == "nt":
            raise NotImplementedError(
                "This metric is currently not supported on Windows."
            )

        # Materialise once so a one-shot iterable can be reused per flavour.
        ks = tuple(k)

        with concurrent.futures.ThreadPoolExecutor(
            max_workers=num_workers
        ) as executor:
            futures = []
            results = collections.defaultdict(list)

            for task_id, (candidates, reference) in enumerate(
                zip(predictions, references)
            ):
                # The completion id is the candidate's position within its task.
                for completion_id, candidate in enumerate(candidates):
                    futures.append(
                        executor.submit(
                            check_correctness,
                            candidate,
                            reference,
                            task_id,
                            completion_id,
                        )
                    )

            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                results[result["task_id"]].append(
                    (result["completion_id"], result)
                )

        totals = collections.defaultdict(list)
        corrects = collections.defaultdict(list)
        for task_results in results.values():
            # Sort on the id only, so result dicts are never compared.
            task_results.sort(key=lambda pair: pair[0])
            for pt in (
                "base_run_passed",
                "sfinae_run_passed",
                "concepts_run_passed",
            ):
                passed = [outcome[pt] for _, outcome in task_results]
                totals[pt].append(len(passed))
                corrects[pt].append(sum(passed))

        totals = {name: np.array(counts) for name, counts in totals.items()}
        corrects = {
            name: np.array(counts) for name, counts in corrects.items()
        }

        pass_at_k = {
            f"{key}@{k_value}": estimate_pass_at_k(
                totals[key],
                corrects[key],
                k_value,
            ).mean()
            for key in totals
            for k_value in ks
            if (totals[key] >= k_value).all()
        }

        return pass_at_k, results
|
193 |
+
|
194 |
+
|
195 |
+
def estimate_pass_at_k(num_samples, num_correct, k) -> np.ndarray:
    """Estimate pass@k for each problem and return the values as an array.

    Args:
        num_samples: Either one integer applied to every problem, or a
            sequence with the number of samples drawn per problem.
        num_correct: Sequence with the number of passing samples per problem.
        k: The ``k`` in pass@k.

    Returns:
        A NumPy array with one estimate per problem.

    Raises:
        ValueError: If ``num_samples`` is a sequence whose length differs from
            that of ``num_correct``.
    """

    def estimator(n: int, c: int) -> float:
        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
        # Fewer than k failures: every size-k draw contains a passing sample.
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    # NumPy integer scalars are accepted alongside plain ints.
    if isinstance(num_samples, (int, np.integer)):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        # Raise explicitly: an assert vanishes under -O and zip would then
        # silently truncate to the shorter input.
        if len(num_samples) != len(num_correct):
            raise ValueError(
                "num_samples and num_correct must have the same length"
            )
        num_samples_it = iter(num_samples)

    return np.array(
        [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)]
    )
|
execute.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import contextlib
|
2 |
+
import multiprocessing
|
3 |
+
import os
|
4 |
+
import subprocess
|
5 |
+
import tempfile
|
6 |
+
|
7 |
+
|
8 |
+
def check_correctness(candidate, reference, task_id, completion_id):
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    The ``base``, ``sfinae`` and ``concepts`` entries of ``candidate`` are each
    run, in that order, against ``reference["tests"]``.

    :param candidate: mapping with the keys ``"base"``, ``"sfinae"`` and
        ``"concepts"``, each holding source text.
    :param reference: mapping whose ``"tests"`` entry is appended to each
        candidate source before compiling.
    :param task_id: identifier of the task, echoed back in the result.
    :param completion_id: an optional completion ID so we can match
        the results later even if execution finishes asynchronously.
    :return: dict with ``task_id``, ``completion_id`` and, per flavour, a
        ``<flavour>_run_passed`` boolean and a ``<flavour>_run_result`` string.
    """
    outcomes = {}
    # The context manager shuts the manager's server process down on exit;
    # without it one such process is left behind per call.
    with multiprocessing.Manager() as manager:
        for flavour in ("base", "sfinae", "concepts"):
            shared = manager.list()
            process_case(
                unsafe_execute_cpp,
                candidate[flavour],
                reference["tests"],
                shared,
            )
            # Copy the value out while the manager is still alive.
            outcomes[flavour] = shared[0]

    report = dict(task_id=task_id, completion_id=completion_id)
    for flavour, outcome in outcomes.items():
        report[f"{flavour}_run_passed"] = outcome == "passed"
        report[f"{flavour}_run_result"] = outcome
    return report
|
50 |
+
|
51 |
+
|
52 |
+
def process_case(target, candidate, reference, result, timeout=60):
    """Run ``target`` in a child process and make sure ``result`` is filled.

    :param target: callable invoked in the child as
        ``target(candidate, reference, result, timeout)``.
    :param candidate: forwarded to ``target`` unchanged.
    :param reference: forwarded to ``target`` unchanged.
    :param result: list-like object the child is expected to append its
        outcome to; ``"timed out"`` is appended here if it is still empty
        once the child has finished or been killed.
    :param timeout: seconds handed to ``target``; the child itself is given
        five extra seconds before it is killed.
    """
    worker = multiprocessing.Process(
        target=target,
        args=(candidate, reference, result, timeout),
    )

    worker.start()
    worker.join(timeout=timeout + 5)
    if worker.is_alive():
        worker.kill()
        # Reap the killed child so it does not linger as a zombie.
        worker.join()

    if not result:
        result.append("timed out")
|
67 |
+
|
68 |
+
|
69 |
+
def unsafe_execute_cpp(candidate, reference, result, timeout):
    """Compile and run ``candidate + reference`` and record the outcome.

    The source is written to ``test.cpp`` inside a temporary working
    directory, compiled with the compiler named by the ``GENERICIFY_CLANG``
    environment variable, and the resulting ``./a.out`` is executed.

    :param candidate: C++ source text placed after the standard include.
    :param reference: C++ source text appended after ``candidate``.
    :param result: list-like object that receives exactly one entry:
        ``"passed"``, ``"timed out"``, ``"failed: compilation error: ..."`` or
        ``"failed: ..."``.
    :param timeout: seconds allowed for the compile step and, separately, for
        the run step.
    """
    with create_tempdir():
        code = "#include <bits/stdc++.h>\n" + candidate + reference
        # Close the file before the compiler reads it.
        with open("test.cpp", "w") as source_file:
            source_file.write(code)

        cpp_compiler = os.getenv("GENERICIFY_CLANG")
        try:
            compilation_result = subprocess.run(
                [cpp_compiler, "-std=c++20", "test.cpp"],
                timeout=timeout,
                capture_output=True,
            )
        except subprocess.TimeoutExpired:
            # Record a slow compile explicitly instead of crashing the worker.
            result.append("timed out")
            return

        if compilation_result.returncode != 0:
            # Prefer stderr and fall back to stdout; undecodable bytes are
            # replaced so odd compiler output cannot raise here.
            err = (
                compilation_result.stderr or compilation_result.stdout
            ).decode(errors="replace")
            result.append(f"failed: compilation error: {err}")
            return

        try:
            exec_result = subprocess.run(
                ["./a.out"], timeout=timeout, capture_output=True
            )
        except subprocess.TimeoutExpired:
            result.append("timed out")
            return

        if exec_result.returncode == 0:
            result.append("passed")
        else:
            err = (exec_result.stderr or exec_result.stdout).decode(
                errors="replace"
            )
            result.append(f"failed: {err}")
|
109 |
+
|
110 |
+
|
111 |
+
@contextlib.contextmanager
def create_tempdir():
    """Yield a new temporary directory while it is the working directory.

    The directory is created with ``tempfile.TemporaryDirectory`` and entered
    through ``chdir``; its path is the yielded value.
    """
    with tempfile.TemporaryDirectory() as scratch, chdir(scratch):
        yield scratch
|
116 |
+
|
117 |
+
|
118 |
+
@contextlib.contextmanager
def chdir(root):
    """Temporarily switch the working directory to ``root``.

    Passing ``"."`` is a no-op. Otherwise the previous working directory is
    restored on exit, including when the body raises.

    :param root: directory to enter for the duration of the ``with`` block.
    """
    if root == ".":
        yield
        return
    previous = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # Exceptions propagate on their own; only the restore is needed.
        os.chdir(previous)
|