red1bluelost committed
Commit 6bd9122 · 1 Parent(s): 9997d13

Adds initial evaluation of just runtime completion tests.

Files changed (3)
  1. README.md +0 -2
  2. evaluate_genericify_cpp.py +150 -33
  3. execute.py +130 -0
README.md CHANGED
@@ -1,7 +1,5 @@
  ---
  title: evaluate_genericify_cpp
- datasets:
- -
  tags:
  - evaluate
  - metric
evaluate_genericify_cpp.py CHANGED
@@ -1,21 +1,15 @@
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
+ # Heavily adapted from `Muennighoff/code_eval_octopack`
  """TODO: Add a description here."""
+ import collections
+ import os

- import evaluate
+ import concurrent.futures
  import datasets
+ import evaluate
+ import itertools
+ import numpy as np

+ from .execute import check_correctness

  # TODO: Add BibTeX citation
  _CITATION = """\
@@ -53,43 +47,166 @@ Examples:
  {'accuracy': 1.0}
  """

+ _WARNING = """
+ ################################################################################
+ !!!WARNING!!!
+ ################################################################################
+ The "code_eval" metric executes untrusted model-generated code in Python.
+ Although it is highly unlikely that model-generated code will do something
+ overtly malicious in response to this test suite, model-generated code may act
+ destructively due to a lack of model capability or alignment.
+ Users are strongly encouraged to sandbox this evaluation suite so that it
+ does not perform destructive actions on their host or network. For more
+ information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+ Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+ Once you have read this disclaimer and taken appropriate precautions,
+ set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
+ with:
+ >>> import os
+ >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+ ################################################################################\
+ """
+
+ _CLANG_WARNING = """
+ Please provide the environment variable 'GENERICIFY_CLANG' with the path of the
+ clang++ compiler. Version 15+ is required. Within Python you can do this
+ with:
+ >>> import os
+ >>> os.environ["GENERICIFY_CLANG"] = "/path/to/clang++"
+ """
+
  # TODO: Define external resources urls if needed
  BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


- @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
- class evaluate_genericify_cpp(evaluate.Metric):
+ @evaluate.utils.file_utils.add_start_docstrings(
+     _DESCRIPTION, _KWARGS_DESCRIPTION
+ )
+ class EvaluateGenericifyCpp(evaluate.Metric):
      """TODO: Short description of my evaluation module."""

      def _info(self):
          # TODO: Specifies the evaluate.EvaluationModuleInfo object
          return evaluate.MetricInfo(
              # This is the description that will appear on the modules page.
-             module_type="metric",
              description=_DESCRIPTION,
              citation=_CITATION,
              inputs_description=_KWARGS_DESCRIPTION,
              # This defines the format of each prediction and reference
-             features=datasets.Features({
-                 'predictions': datasets.Value('int64'),
-                 'references': datasets.Value('int64'),
-             }),
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Sequence(
+                         datasets.Features(
+                             {
+                                 "base": datasets.Value("string"),
+                                 "sfinae": datasets.Value("string"),
+                                 "concepts": datasets.Value("string"),
+                             }
+                         )
+                     ),
+                     "references": datasets.Features(
+                         {
+                             "tests": datasets.Value("string"),
+                             "invalids": datasets.Value("string"),
+                         }
+                     ),
+                 }
+             ),
              # Homepage of the module for documentation
              homepage="http://module.homepage",
              # Additional links to the codebase or references
              codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-             reference_urls=["http://path.to.reference.url/new_module"]
+             reference_urls=["http://path.to.reference.url/new_module"],
          )

-     def _download_and_prepare(self, dl_manager):
-         """Optional: download external resources useful to compute the scores"""
-         # TODO: Download external resources if needed
-         pass
-
-     def _compute(self, predictions, references):
+     def _compute(self, *, predictions, references, k=[1, 10, 100]):
          """Returns the scores"""
-         # TODO: Compute the different scores of the module
-         accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-         return {
-             "accuracy": accuracy,
-         }
+         num_workers = 4
+
+         if os.getenv("HF_ALLOW_CODE_EVAL", default=0) != "1":
+             raise ValueError(_WARNING)
+
+         if os.getenv("GENERICIFY_CLANG", default=0) == 0:
+             raise ValueError(_CLANG_WARNING)
+
+         if os.name == "nt":
+             raise NotImplementedError(
+                 "This metric is currently not supported on Windows."
+             )
+
+         with concurrent.futures.ThreadPoolExecutor(
+             max_workers=num_workers
+         ) as executor:
+             futures = []
+             completion_id = collections.Counter()
+             results = collections.defaultdict(list)
+
+             for task_id, (candidates, reference) in enumerate(
+                 zip(predictions, references)
+             ):
+                 for candidate in candidates:
+                     args = (
+                         candidate,
+                         reference,
+                         task_id,
+                         completion_id[task_id],
+                     )
+                     future = executor.submit(check_correctness, *args)
+                     futures.append(future)
+                     completion_id[task_id] += 1
+
+             for future in concurrent.futures.as_completed(futures):
+                 result = future.result()
+                 results[result["task_id"]].append(
+                     (result["completion_id"], result)
+                 )
+
+         totals = collections.defaultdict(list)
+         corrects = collections.defaultdict(list)
+         for result in results.values():
+             result.sort()
+             for pt in [
+                 "base_run_passed",
+                 "sfinae_run_passed",
+                 "concepts_run_passed",
+             ]:
+                 passed = [r[1][pt] for r in result]
+                 totals[pt].append(len(passed))
+                 corrects[pt].append(sum(passed))
+
+         totals = {k: np.array(v) for k, v in totals.items()}
+         corrects = {k: np.array(v) for k, v in corrects.items()}
+
+         ks = k
+         pass_at_k = {
+             f"{key}@{k}": estimate_pass_at_k(
+                 totals[key],
+                 corrects[key],
+                 k,
+             ).mean()
+             for key in totals.keys()
+             for k in ks
+             if (totals[key] >= k).all()
+         }
+
+         return pass_at_k, results
+
+
+ def estimate_pass_at_k(num_samples, num_correct, k) -> np.array:
+     """Estimates pass@k of each problem and returns them in an array."""
+
+     def estimator(n: int, c: int) -> float:
+         """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+         if n - c < k:
+             return 1.0
+         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+     if isinstance(num_samples, int):
+         num_samples_it = itertools.repeat(num_samples, len(num_correct))
+     else:
+         assert len(num_samples) == len(num_correct)
+         num_samples_it = iter(num_samples)
+
+     return np.array(
+         [estimator(int(n), int(c)) for n, c in zip(num_samples_it, num_correct)]
+     )
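
A minimal usage sketch for the new metric, assuming it is loaded from the Hub with `evaluate.load`; the repository id, compiler path, candidate snippets, and test string below are illustrative placeholders rather than values from this commit. Only the feature layout declared in `_info` and the environment variables checked in `_compute` are relied on.

import os

import evaluate

# Required by _compute; set only after reading the _WARNING text above.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
# Assumed path to a clang++ 15+ binary; adjust for your system.
os.environ["GENERICIFY_CLANG"] = "/usr/bin/clang++"

# Placeholder repository id for loading this module from the Hub.
metric = evaluate.load("red1bluelost/evaluate_genericify_cpp")

# One task with one candidate; each candidate carries the three variants
# ("base", "sfinae", "concepts") declared in the features. Snippets are
# illustrative only.
predictions = [
    [
        {
            "base": "int add(int a, int b) { return a + b; }\n",
            "sfinae": "template <typename T> T add(T a, T b) { return a + b; }\n",
            "concepts": "template <std::integral T> T add(T a, T b) { return a + b; }\n",
        }
    ]
]
references = [
    {
        "tests": "int main() { return add(1, 2) == 3 ? 0 : 1; }\n",
        "invalids": "",
    }
]

# Mirrors code_eval: _compute returns (pass_at_k, results).
pass_at_k, results = metric.compute(
    predictions=predictions, references=references, k=[1]
)
print(pass_at_k)  # e.g. {"base_run_passed@1": 1.0, ...}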
execute.py ADDED
@@ -0,0 +1,130 @@
+ import contextlib
+ import multiprocessing
+ import os
+ import subprocess
+ import tempfile
+
+
+ def check_correctness(candidate, reference, task_id, completion_id):
+     """
+     Evaluates the functional correctness of a completion by running the test
+     suite provided in the problem.
+
+     :param completion_id: an optional completion ID so we can match
+         the results later even if execution finishes asynchronously.
+     """
+
+     manager = multiprocessing.Manager()
+     base_run_result = manager.list()
+     process_case(
+         unsafe_execute_cpp,
+         candidate["base"],
+         reference["tests"],
+         base_run_result,
+     )
+     sfinae_run_result = manager.list()
+     process_case(
+         unsafe_execute_cpp,
+         candidate["sfinae"],
+         reference["tests"],
+         sfinae_run_result,
+     )
+     concepts_run_result = manager.list()
+     process_case(
+         unsafe_execute_cpp,
+         candidate["concepts"],
+         reference["tests"],
+         concepts_run_result,
+     )
+
+     return dict(
+         task_id=task_id,
+         completion_id=completion_id,
+         base_run_passed=base_run_result[0] == "passed",
+         base_run_result=base_run_result[0],
+         sfinae_run_passed=sfinae_run_result[0] == "passed",
+         sfinae_run_result=sfinae_run_result[0],
+         concepts_run_passed=concepts_run_result[0] == "passed",
+         concepts_run_result=concepts_run_result[0],
+     )
+
+
+ def process_case(target, candidate, reference, result):
+     timeout = 60
+
+     p = multiprocessing.Process(
+         target=target,
+         args=(candidate, reference, result, timeout),
+     )
+
+     p.start()
+     p.join(timeout=timeout + 5)
+     if p.is_alive():
+         p.kill()
+
+     if not result:
+         result.append("timed out")
+
+
+ def unsafe_execute_cpp(candidate, reference, result, timeout):
+     with create_tempdir():
+         code = "#include <bits/stdc++.h>\n" + candidate + reference
+         open(f"test.cpp", "w").write(code)
+
+         cpp_compiler = os.getenv("GENERICIFY_CLANG")
+         compilation_result = subprocess.run(
+             [cpp_compiler, "-std=c++20", "test.cpp"],
+             timeout=timeout,
+             capture_output=True,
+         )
+         if compilation_result.returncode != 0:
+             if compilation_result.stderr:
+                 err = compilation_result.stderr.decode()
+             else:
+                 err = compilation_result.stdout.decode()
+             result.append(f"failed: compilation error: {err}")
+         else:
+             try:
+                 exec_result = subprocess.run(
+                     ["./a.out"], timeout=timeout, capture_output=True
+                 )
+
+                 if exec_result.returncode == 0:
+                     result.append("passed")
+                 else:
+                     if exec_result.stderr:
+                         try:
+                             err = exec_result.stderr.decode()
+                         except:
+                             err = exec_result.stderr
+                     else:
+                         try:
+                             err = exec_result.stdout.decode()
+                         except:
+                             err = exec_result.stdout
+                     result.append(f"failed: {err}")
+
+             except subprocess.TimeoutExpired as e:
+                 result.append("timed out")
+
+
+ @contextlib.contextmanager
+ def create_tempdir():
+     with tempfile.TemporaryDirectory() as dirname:
+         with chdir(dirname):
+             yield dirname
+
+
+ @contextlib.contextmanager
+ def chdir(root):
+     if root == ".":
+         yield
+         return
+     cwd = os.getcwd()
+     os.chdir(root)
+     try:
+         yield
+     except BaseException as exc:
+         raise exc
+     finally:
+         os.chdir(cwd)
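
A small sketch of driving check_correctness directly, outside the metric, assuming the script is run from the repository directory with GENERICIFY_CLANG pointing at a clang++ 15+ binary; the compiler path and the candidate and reference snippets are placeholders, not values from this commit.

import os

# Local import; the metric itself uses the relative `from .execute import check_correctness`.
from execute import check_correctness

if __name__ == "__main__":
    # Assumed compiler path; must be clang++ 15 or newer.
    os.environ["GENERICIFY_CLANG"] = "/usr/bin/clang++"

    candidate = {
        "base": "int twice(int x) { return 2 * x; }\n",
        "sfinae": "template <typename T> T twice(T x) { return 2 * x; }\n",
        "concepts": "template <std::integral T> T twice(T x) { return 2 * x; }\n",
    }
    reference = {
        "tests": "int main() { return twice(21) == 42 ? 0 : 1; }\n",
        "invalids": "",  # declared in the features but unused by these runtime checks
    }

    result = check_correctness(candidate, reference, task_id=0, completion_id=0)
    print(result["base_run_result"])      # "passed", "failed: ...", or "timed out"
    print(result["concepts_run_passed"])  # True or False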