giulio98 committed
Commit 38ed8d0
1 Parent(s): a1b354b

first commit

Files changed (2)
  1. code_eval_outputs.py +163 -50
  2. execute.py +252 -0
code_eval_outputs.py CHANGED
@@ -11,85 +11,198 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
+"""The CodeEval metric estimates the pass@k metric for code synthesis, comparing the output produced by the
+generated code against an expected output. It is based on the evaluation harness for the HumanEval problem
+solving dataset described in the paper "Evaluating Large Language Models Trained on Code"
+(https://arxiv.org/abs/2107.03374)."""

-import evaluate
+import itertools
+import os
+from collections import Counter, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import datasets
+import numpy as np
+
+import evaluate
+
+from .execute import check_correctness


-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
+@misc{chen2021evaluating,
+      title={Evaluating Large Language Models Trained on Code},
+      author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan \
+and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards \
+and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray \
+and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf \
+and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray \
+and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser \
+and Mohammad Bavarian and Clemens Winter and Philippe Tillet \
+and Felipe Petroski Such and Dave Cummings and Matthias Plappert \
+and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss \
+and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak \
+and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain \
+and William Saunders and Christopher Hesse and Andrew N. Carr \
+and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa \
+and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati \
+and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei \
+and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
+      year={2021},
+      eprint={2107.03374},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
 }
 """

-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
+This metric is based on the evaluation harness for the HumanEval problem solving dataset
+described in the paper "Evaluating Large Language Models Trained on Code"
+(https://arxiv.org/abs/2107.03374).
 """


-# TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    predictions: list of candidates to evaluate. Each candidate should be a list
+        of strings with several code candidates to solve the problem.
+    references: a list with a test for each prediction. Each test should evaluate the
+        correctness of a code candidate.
+    output: expected output of the program.
+    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
+    num_workers: number of workers used to evaluate the candidate programs (Default: 4).
+    timeout: maximum time, in seconds, allowed for each candidate program to run (Default: 3.0).
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
+    pass_at_k: dict with pass rates for each k
+    results: dict with granular results of each unittest
 Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-    >>> print(results)
-    {'accuracy': 1.0}
+    >>> code_eval_outputs = evaluate.load("code_eval_outputs")
+    >>> test_cases = ["print(add(2,3))"]
+    >>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]]
+    >>> output = "5\n"
+    >>> pass_at_k, results = code_eval_outputs.compute(references=test_cases, predictions=candidates, output=output, k=[1, 2])
+    >>> print(pass_at_k)
+    {'pass@1': 0.5, 'pass@2': 1.0}
+"""
+
+
+_WARNING = """
+################################################################################
+                                  !!!WARNING!!!
+################################################################################
+The "code_eval_outputs" metric executes untrusted model-generated code in Python.
+Although it is highly unlikely that model-generated code will do something
+overtly malicious in response to this test suite, model-generated code may act
+destructively due to a lack of model capability or alignment.
+Users are strongly encouraged to sandbox this evaluation suite so that it
+does not perform destructive actions on their host or network. For more
+information on how OpenAI sandboxes its code, see the paper "Evaluating Large
+Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
+Once you have read this disclaimer and taken appropriate precautions,
+set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
+with:
+>>> import os
+>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+################################################################################\
 """

-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+_LICENSE = """The MIT License
+Copyright (c) OpenAI (https://openai.com)
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE."""


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class code_eval_outputs(evaluate.Metric):
-    """TODO: Short description of my evaluation module."""
-
+class CodeEval(evaluate.Metric):
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MetricInfo(
-            # This is the description that will appear on the modules page.
-            module_type="metric",
+            # This is the description that will appear on the metrics page.
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Sequence(datasets.Value("string")),
+                    "references": datasets.Value("string"),
+                }
+            ),
+            homepage="https://github.com/openai/human-eval",
+            codebase_urls=["https://github.com/openai/human-eval"],
+            reference_urls=["https://github.com/openai/human-eval"],
+            license=_LICENSE,
         )

-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-
-    def _compute(self, predictions, references):
+    def _compute(self, predictions, references, output, k=[1, 10, 100], num_workers=4, timeout=3.0):
         """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }
+
+        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
+            raise ValueError(_WARNING)
+
+        if os.name == "nt":
+            raise NotImplementedError("This metric is currently not supported on Windows.")
+
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = []
+            completion_id = Counter()
+            n_samples = 0
+            results = defaultdict(list)
+
+            for task_id, (candidates, test_case) in enumerate(zip(predictions, references)):
+                for candidate in candidates:
+                    test_program = candidate + "\n" + test_case
+                    args = (test_program, output, timeout, task_id, completion_id[task_id])
+                    future = executor.submit(check_correctness, *args)
+                    futures.append(future)
+                    completion_id[task_id] += 1
+                    n_samples += 1
+
+            for future in as_completed(futures):
+                result = future.result()
+                results[result["task_id"]].append((result["completion_id"], result))
+
+        total, correct = [], []
+        for result in results.values():
+            result.sort()
+            passed = [r[1]["passed"] for r in result]
+            total.append(len(passed))
+            correct.append(sum(passed))
+        total = np.array(total)
+        correct = np.array(correct)
+
+        ks = k
+        pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() for k in ks if (total >= k).all()}
+
+        return pass_at_k, results
+
+
+def estimate_pass_at_k(num_samples, num_correct, k):
+    """Estimates pass@k of each problem and returns them in an array."""
+
+    def estimator(n: int, c: int, k: int) -> float:
+        """Calculates 1 - comb(n - c, k) / comb(n, k)."""
+        if n - c < k:
+            return 1.0
+        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
+
+    if isinstance(num_samples, int):
+        num_samples_it = itertools.repeat(num_samples, len(num_correct))
+    else:
+        assert len(num_samples) == len(num_correct)
+        num_samples_it = iter(num_samples)
+
+    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
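
For reference, the pass@k values returned by _compute come from the unbiased estimator used in the Codex paper: for n sampled candidates of which c are correct, pass@k = 1 - C(n - c, k) / C(n, k). The short sketch below (plain NumPy, mirroring estimate_pass_at_k above; the helper name pass_at_k is only for illustration) reproduces the numbers in the doctest, where each task has n = 2 candidates and c = 1 correct one:

import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    # If fewer than k samples are wrong, every size-k draw contains a correct sample.
    if n - c < k:
        return 1.0
    # Product form of 1 - comb(n - c, k) / comb(n, k), avoiding large binomial coefficients.
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


print(pass_at_k(2, 1, 1))  # 0.5 -> pass@1 in the doctest above
print(pass_at_k(2, 1, 2))  # 1.0 -> pass@2 in the doctest above
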
execute.py ADDED
@@ -0,0 +1,252 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code is adapted from OpenAI's release
+# https://github.com/openai/human-eval/blob/master/human_eval/execution.py
+
+import contextlib
+import faulthandler
+import io
+import multiprocessing
+import os
+import platform
+import signal
+import tempfile
+import sys
+from io import StringIO
+import contextlib
+
+
+def check_correctness(check_program, output, timeout, task_id, completion_id):
+    """
+    Evaluates the functional correctness of a completion by running the test
+    suite provided in the problem.
+    :param completion_id: an optional completion ID so we can match
+        the results later even if execution finishes asynchronously.
+    """
+    manager = multiprocessing.Manager()
+    result = manager.list()
+
+    p = multiprocessing.Process(target=unsafe_execute, args=(check_program, output, result, timeout))
+    p.start()
+    p.join(timeout=timeout + 1)
+    if p.is_alive():
+        p.kill()
+
+    if not result:
+        result.append("timed out")
+
+    return dict(
+        task_id=task_id,
+        passed=result[0] == "passed",
+        result=result[0],
+        completion_id=completion_id,
+    )
+
+
+
+
+def unsafe_execute(check_program, output, result, timeout):
+
+    with create_tempdir():
+
+        # These system calls are needed when cleaning up tempdir.
+        import os
+        import shutil
+
+        rmtree = shutil.rmtree
+        rmdir = os.rmdir
+        chdir = os.chdir
+
+        # Disable functionalities that can make destructive changes to the test.
+        reliability_guard()
+
+        # Run program.
+        try:
+            exec_globals = {}
+            with swallow_io():
+                with time_limit(timeout):
+                    with stdoutIO() as s:
+                        exec(check_program, exec_globals)
+                    if s.getvalue() == output:
+                        result.append("passed")
+                    else:
+                        result.append("not passed")
+        except TimeoutException:
+            result.append("timed out")
+        except BaseException as e:
+            result.append(f"failed: {e}")
+
+        # Needed for cleaning up.
+        shutil.rmtree = rmtree
+        os.rmdir = rmdir
+        os.chdir = chdir
+
+@contextlib.contextmanager
+def stdoutIO(stdout=None):
+    old = sys.stdout
+    if stdout is None:
+        stdout = StringIO()
+    sys.stdout = stdout
+    yield stdout
+    sys.stdout = old
+
+
+@contextlib.contextmanager
+def time_limit(seconds):
+    def signal_handler(signum, frame):
+        raise TimeoutException("Timed out!")
+
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    signal.signal(signal.SIGALRM, signal_handler)
+    try:
+        yield
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+
+
+@contextlib.contextmanager
+def swallow_io():
+    stream = WriteOnlyStringIO()
+    with contextlib.redirect_stdout(stream):
+        with contextlib.redirect_stderr(stream):
+            with redirect_stdin(stream):
+                yield
+
+
+@contextlib.contextmanager
+def create_tempdir():
+    with tempfile.TemporaryDirectory() as dirname:
+        with chdir(dirname):
+            yield dirname
+
+
+class TimeoutException(Exception):
+    pass
+
+
+class WriteOnlyStringIO(io.StringIO):
+    """StringIO that throws an exception when it's read from"""
+
+    def read(self, *args, **kwargs):
+        raise OSError
+
+    def readline(self, *args, **kwargs):
+        raise OSError
+
+    def readlines(self, *args, **kwargs):
+        raise OSError
+
+    def readable(self, *args, **kwargs):
+        """Returns True if the IO object can be read."""
+        return False
+
+
+class redirect_stdin(contextlib._RedirectStream):  # type: ignore
+    _stream = "stdin"
+
+
+@contextlib.contextmanager
+def chdir(root):
+    if root == ".":
+        yield
+        return
+    cwd = os.getcwd()
+    os.chdir(root)
+    try:
+        yield
+    except BaseException as exc:
+        raise exc
+    finally:
+        os.chdir(cwd)
+
+
+def reliability_guard(maximum_memory_bytes=None):
+    """
+    This disables various destructive functions and prevents the generated code
+    from interfering with the test (e.g. fork bomb, killing other processes,
+    removing filesystem files, etc.)
+    WARNING
+    This function is NOT a security sandbox. Untrusted code, including model-
+    generated code, should not be blindly executed outside of one. See the
+    Codex paper for more information about OpenAI's code sandbox, and proceed
+    with caution.
+    """
+
+    if maximum_memory_bytes is not None:
+        import resource
+
+        resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
+        resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
+        if not platform.uname().system == "Darwin":
+            resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
+
+    faulthandler.disable()
+
+    import builtins
+
+    builtins.exit = None
+    builtins.quit = None
+
+    import os
+
+    os.environ["OMP_NUM_THREADS"] = "1"
+
+    os.kill = None
+    os.system = None
+    os.putenv = None
+    os.remove = None
+    os.removedirs = None
+    os.rmdir = None
+    os.fchdir = None
+    os.setuid = None
+    os.fork = None
+    os.forkpty = None
+    os.killpg = None
+    os.rename = None
+    os.renames = None
+    os.truncate = None
+    os.replace = None
+    os.unlink = None
+    os.fchmod = None
+    os.fchown = None
+    os.chmod = None
+    os.chown = None
+    os.chroot = None
+    os.fchdir = None
+    os.lchflags = None
+    os.lchmod = None
+    os.lchown = None
+    os.getcwd = None
+    os.chdir = None
+
+    import shutil
+
+    shutil.rmtree = None
+    shutil.move = None
+    shutil.chown = None
+
+    import subprocess
+
+    subprocess.Popen = None  # type: ignore
+
+    __builtins__["help"] = None
+
+    import sys
+
+    sys.modules["ipdb"] = None
+    sys.modules["joblib"] = None
+    sys.modules["resource"] = None
+    sys.modules["psutil"] = None
+    sys.modules["tkinter"] = None
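
To sanity-check execute.py on its own, the sketch below drives check_correctness for a single candidate the same way _compute does: the candidate and its test are concatenated into one program, executed in a separate process, and the captured stdout is compared against the expected output string (note the trailing newline added by print). This is a minimal sketch, assuming execute.py is importable from the working directory; the candidate and test strings are made up for illustration.

from execute import check_correctness  # assumes execute.py is on the import path


if __name__ == "__main__":
    candidate = "def add(a, b): return a + b"  # hypothetical model completion
    test_case = "print(add(2,3))"              # hypothetical test from the references
    test_program = candidate + "\n" + test_case

    # Expected stdout of the combined program, including print's trailing newline.
    result = check_correctness(test_program, output="5\n", timeout=3.0, task_id=0, completion_id=0)
    print(result)  # e.g. {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 0}
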