gabeorlanski committed on
Commit
419ab80
·
unverified ·
1 Parent(s): 9610edf
Files changed (3) hide show
  1. .vscode/settings.json +6 -0
  2. README.md +35 -8
  3. bc_eval.py +29 -47
.vscode/settings.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "ms-python.black-formatter"
4
+ },
5
+ "python.formatting.provider": "none"
6
+ }
README.md CHANGED
@@ -42,7 +42,7 @@ for row in ds:
42
  question_infos.append(row['question_info'])
43
  # Replace this with however you generate and postprocess predictions.
44
  predictions.append(model.generate(row['signature_with_docstring']))
45
- metric = evaluate.load("bc_eval")
46
  metrics, results = metric.compute(
47
  predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
48
  )
@@ -94,7 +94,7 @@ import os
94
  os.environ["HF_ALLOW_CODE_EVAL"] = "1"
95
  ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
96
  example = ds[0]
97
- metric = evaluate.load("bc_eval")
98
  languages = ["Python"]
99
  question_infos = [example["question_info"]]
100
  predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
@@ -116,7 +116,35 @@ metrics, results = metric.compute(
116
  ```
117
  `results` is:
118
  ```
119
- [{"qid": 0, "idx": "0", "file_path": ".../tmpqt_p3dwn/0", "results": [{"return_code": 0, "runtime": 0.076369, "stdout": "TEST-0...PASSED\r\nTEST-1...PASSED\r\nTEST-2...PASSED\r\nTEST-3...PASSED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...PASSED\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "PASSED", "1": "PASSED", "2": "PASSED", "3": "PASSED", "4": "PASSED", "5": "PASSED", "6": "PASSED"}, "outcome": "PASSED"}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  ```
121
 
122
 
@@ -131,7 +159,7 @@ ds = load_dataset(
131
  "gabeorlanski/bc-humaneval", "Python", split="test"
132
  )
133
  example = ds[0]
134
- metric = evaluate.load("bc_eval")
135
  languages = ["Python"]
136
  question_infos = [example["question_info"]]
137
  predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
@@ -170,7 +198,7 @@ ds = load_dataset(
170
  "gabeorlanski/bc-humaneval", "Python", split="test"
171
  )
172
  example = ds[0]
173
- metric = evaluate.load("bc_eval")
174
  languages = ["Python"]
175
  question_infos = [example["question_info"]]
176
  predictions = [["""import time
@@ -203,7 +231,7 @@ ds = load_dataset(
203
  "gabeorlanski/bc-humaneval", "Python", split="test"
204
  )
205
  example = ds[0]
206
- metric = evaluate.load("bc_eval")
207
  languages = ["Python"]
208
  question_infos = [example["question_info"]]
209
  predictions = [["""import time
@@ -223,8 +251,7 @@ metrics, results = metric.compute(
223
  {"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
224
  ```
225
  `results` is:
226
- ```
227
- [{"qid": 0, "idx": "0", "file_path": "/tmpjdn51aaa/0", "results": [{"return_code": 0, "runtime": 0.102855, "stdout": "TEST-0...ValueError\r\nTEST-1...ValueError\r\nTEST-2...ValueError\r\nTEST-3...ValueError\r\nTEST-4...ValueError\r\nTEST-5...ValueError\r\nTEST-6...ValueError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "ValueError", "1": "ValueError", "2": "ValueError", "3": "ValueError", "4": "ValueError", "5": "ValueError", "6": "ValueError"}, "outcome": "HAD_ERROR"},
228
  {"qid": 0, "idx": "1", "file_path": "/tmpjdn51aaa/1", "results": [{"return_code": 0, "runtime": 0.094347, "stdout": "TEST-0...NameError\r\nTEST-1...NameError\r\nTEST-2...NameError\r\nTEST-3...NameError\r\nTEST-4...NameError\r\nTEST-5...NameError\r\nTEST-6...NameError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "NameError", "1": "NameError", "2": "NameError", "3": "NameError", "4": "NameError", "5": "NameError", "6": "NameError"}, "outcome": "HAD_ERROR"}]
229
  ```
230
 
 
42
  question_infos.append(row['question_info'])
43
  # Replace this with however you generate and postprocess predictions.
44
  predictions.append(model.generate(row['signature_with_docstring']))
45
+ metric = evaluate.load("gabeorlanski/bc_eval")
46
  metrics, results = metric.compute(
47
  predictions=predictions, languages=languages, question_dicts=question_infos, k=[1]
48
  )
 
94
  os.environ["HF_ALLOW_CODE_EVAL"] = "1"
95
  ds = load_dataset("gabeorlanski/bc-humaneval", split="test")
96
  example = ds[0]
97
+ metric = evaluate.load("gabeorlanski/bc_eval")
98
  languages = ["Python"]
99
  question_infos = [example["question_info"]]
100
  predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
 
116
  ```
117
  `results` is:
118
  ```
119
+
120
+ [
121
+ {
122
+ "qid": 0,
123
+ "idx": "0",
124
+ "file_path": ".../tmpqt_p3dwn/0",
125
+ "results": [
126
+ {
127
+ "return_code": 0,
128
+ "runtime": 0.076369,
129
+ "stdout": "TEST-0...PASSED\r\nTEST-1...PASSED\r\nTEST-2...PASSED\r\nTEST-3...PASSED\r\nTEST-4...PASSED\r\nTEST-5...PASSED\r\nTEST-6...PASSED\r\n",
130
+ "stderr": "",
131
+ "timed_out": false
132
+ }
133
+ ],
134
+ "failed": false,
135
+ "timed_out": false,
136
+ "test_cases": {
137
+ "0": "PASSED",
138
+ "1": "PASSED",
139
+ "2": "PASSED",
140
+ "3": "PASSED",
141
+ "4": "PASSED",
142
+ "5": "PASSED",
143
+ "6": "PASSED"
144
+ },
145
+ "outcome": "PASSED"
146
+ }
147
+ ]
148
  ```
149
 
150
 
 
159
  "gabeorlanski/bc-humaneval", "Python", split="test"
160
  )
161
  example = ds[0]
162
+ metric = evaluate.load("gabeorlanski/bc_eval")
163
  languages = ["Python"]
164
  question_infos = [example["question_info"]]
165
  predictions = [["""def has_close_elements(numbers: List[float], threshold: float) -> bool:
 
198
  "gabeorlanski/bc-humaneval", "Python", split="test"
199
  )
200
  example = ds[0]
201
+ metric = evaluate.load("gabeorlanski/bc_eval")
202
  languages = ["Python"]
203
  question_infos = [example["question_info"]]
204
  predictions = [["""import time
 
231
  "gabeorlanski/bc-humaneval", "Python", split="test"
232
  )
233
  example = ds[0]
234
+ metric = evaluate.load("gabeorlanski/bc_eval")
235
  languages = ["Python"]
236
  question_infos = [example["question_info"]]
237
  predictions = [["""import time
 
251
  {"Python/pass@1": 0.0, "Python/mean_pct_pass": 0.0}
252
  ```
253
  `results` is:
254
+ ```
+ [{"qid": 0, "idx": "0", "file_path": "/tmpjdn51aaa/0", "results": [{"return_code": 0, "runtime": 0.102855, "stdout": "TEST-0...ValueError\r\nTEST-1...ValueError\r\nTEST-2...ValueError\r\nTEST-3...ValueError\r\nTEST-4...ValueError\r\nTEST-5...ValueError\r\nTEST-6...ValueError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "ValueError", "1": "ValueError", "2": "ValueError", "3": "ValueError", "4": "ValueError", "5": "ValueError", "6": "ValueError"}, "outcome": "HAD_ERROR"},
 
255
  {"qid": 0, "idx": "1", "file_path": "/tmpjdn51aaa/1", "results": [{"return_code": 0, "runtime": 0.094347, "stdout": "TEST-0...NameError\r\nTEST-1...NameError\r\nTEST-2...NameError\r\nTEST-3...NameError\r\nTEST-4...NameError\r\nTEST-5...NameError\r\nTEST-6...NameError\r\n", "stderr": "", "timed_out": false}], "failed": false, "timed_out": false, "test_cases": {"0": "NameError", "1": "NameError", "2": "NameError", "3": "NameError", "4": "NameError", "5": "NameError", "6": "NameError"}, "outcome": "HAD_ERROR"}]
256
  ```
257
 
bc_eval.py CHANGED
@@ -82,9 +82,7 @@ _QUESTION_INFO_KEYS = {
82
  }
83
 
84
 
85
- def make_file_and_command(
86
- qid, idx, pred, question, working_dir, timeout_override=None
87
- ):
88
  file_name = f"pred.{question['extension']}"
89
  pred_dir = working_dir.joinpath(idx)
90
  pred_dir.mkdir(parents=True)
@@ -101,9 +99,7 @@ def make_file_and_command(
101
  commands.append(
102
  {
103
  "timeout": t if timeout_override is None else timeout_override,
104
- "command": [
105
- c if c != "__FILENAME__" else file_name for c in cmd
106
- ],
107
  }
108
  )
109
 
@@ -124,7 +120,7 @@ def _write_preds(
124
  zip(preds, languages, question_dicts), desc="Setup", total=len(preds)
125
  ):
126
  qid = len(question_id_to_dict)
127
- q_dict['language'] = l
128
  question_id_to_dict[qid] = q_dict
129
  for p in pred_list:
130
  commands.append(
@@ -141,9 +137,7 @@ def _write_preds(
141
  return question_id_to_dict, commands
142
 
143
 
144
- @evaluate.utils.file_utils.add_start_docstrings(
145
- _DESCRIPTION, _KWARGS_DESCRIPTION
146
- )
147
  class BabelCodeEval(evaluate.Metric):
148
  def _info(self):
149
  list_keys = ["timeouts", "commands", "test_case_ids"]
@@ -153,12 +147,8 @@ class BabelCodeEval(evaluate.Metric):
153
  if k not in list_keys
154
  }
155
  question_info_type["test_case_ids"] = datasets.Value("string")
156
- question_info_type["commands"] = datasets.Sequence(
157
- datasets.Value("string")
158
- )
159
- question_info_type["timeouts"] = datasets.Sequence(
160
- datasets.Value("int32")
161
- )
162
 
163
  return evaluate.MetricInfo(
164
  # This is the description that will appear on the metrics page.
@@ -170,7 +160,7 @@ class BabelCodeEval(evaluate.Metric):
170
  {
171
  "predictions": datasets.Sequence(datasets.Value("string")),
172
  "languages": datasets.Value("string"),
173
- "question_dicts": question_info_type
174
  }
175
  ),
176
  homepage="https://github.com/google-research/babelcode",
@@ -211,18 +201,17 @@ class BabelCodeEval(evaluate.Metric):
211
  garbage_collection_freq=500,
212
  )
213
 
214
-
215
- all_results, q_passes, q_pct = _eval_predictions(
216
- results, question_map
217
- )
218
-
219
-
220
  assert len(q_passes) == len(q_pct)
221
  metrics = {}
222
  for lang in q_passes:
223
- metrics.update(_calculate_metrics(lang, q_passes[lang], q_pct[lang],k_vals=k))
 
 
224
  return metrics, all_results
225
 
 
226
  def _eval_single_pred(result, test_ids, num_expected_commands):
227
  test_case_results = {k: "MISSING" for k in test_ids}
228
  if len(result["results"]) != num_expected_commands:
@@ -279,39 +268,35 @@ def _eval_predictions(pred_results, question_map):
279
  p["results"] = [dataclasses.asdict(r) for r in p["results"]]
280
  p["test_cases"] = test_case_results
281
  p["outcome"] = outcome
282
-
283
- lang = question['language']
284
- question_results[lang][p["qid"]].append(
285
- num_passed == len(test_case_results)
286
- )
287
- question_pct_pass[lang][p["qid"]].append(
288
- num_passed / len(test_case_results)
289
- )
290
 
291
  out.append(p)
292
 
293
  return out, question_results, question_pct_pass
294
 
295
 
296
- def _calculate_metrics(lang,q_passed, q_pcts, k_vals):
297
  assert len(q_passed) == len(q_pcts)
298
-
299
  num_samples = np.zeros(len(q_passed))
300
  num_correct = np.zeros(len(q_passed))
301
  pcts_passed = np.zeros(len(q_passed))
302
- for i, (k,v) in enumerate(q_passed.items()):
303
  num_samples[i] = len(v)
304
  num_correct[i] = sum(v)
305
  pcts_passed[i] = np.mean(q_pcts[k])
306
-
307
-
308
- out = {f'{lang}/pass@{k}': estimate_pass_at_k(num_samples, num_correct, k).mean() for k in k_vals}
309
- out[f'{lang}/mean_pct_pass'] = np.mean(pcts_passed)
310
-
311
-
 
312
  return out
313
-
314
-
315
 
316
  def estimate_pass_at_k(num_samples, num_correct, k):
317
  """Estimates pass@k of each problem and returns them in an array."""
@@ -329,8 +314,5 @@ def estimate_pass_at_k(num_samples, num_correct, k):
329
  num_samples_it = iter(num_samples)
330
 
331
  return np.array(
332
- [
333
- estimator(int(n), int(c), k)
334
- for n, c in zip(num_samples_it, num_correct)
335
- ]
336
  )
 
82
  }
83
 
84
 
85
+ def make_file_and_command(qid, idx, pred, question, working_dir, timeout_override=None):
 
 
86
  file_name = f"pred.{question['extension']}"
87
  pred_dir = working_dir.joinpath(idx)
88
  pred_dir.mkdir(parents=True)
 
99
  commands.append(
100
  {
101
  "timeout": t if timeout_override is None else timeout_override,
102
+ "command": [c if c != "__FILENAME__" else file_name for c in cmd],
 
 
103
  }
104
  )
105
 
 
120
  zip(preds, languages, question_dicts), desc="Setup", total=len(preds)
121
  ):
122
  qid = len(question_id_to_dict)
123
+ q_dict["language"] = l
124
  question_id_to_dict[qid] = q_dict
125
  for p in pred_list:
126
  commands.append(
 
137
  return question_id_to_dict, commands
138
 
139
 
140
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 
 
141
  class BabelCodeEval(evaluate.Metric):
142
  def _info(self):
143
  list_keys = ["timeouts", "commands", "test_case_ids"]
 
147
  if k not in list_keys
148
  }
149
  question_info_type["test_case_ids"] = datasets.Value("string")
150
+ question_info_type["commands"] = datasets.Sequence(datasets.Value("string"))
151
+ question_info_type["timeouts"] = datasets.Sequence(datasets.Value("int32"))
 
 
 
 
152
 
153
  return evaluate.MetricInfo(
154
  # This is the description that will appear on the metrics page.
 
160
  {
161
  "predictions": datasets.Sequence(datasets.Value("string")),
162
  "languages": datasets.Value("string"),
163
+ "question_dicts": question_info_type,
164
  }
165
  ),
166
  homepage="https://github.com/google-research/babelcode",
 
201
  garbage_collection_freq=500,
202
  )
203
 
204
+ all_results, q_passes, q_pct = _eval_predictions(results, question_map)
205
+
 
 
 
 
206
  assert len(q_passes) == len(q_pct)
207
  metrics = {}
208
  for lang in q_passes:
209
+ metrics.update(
210
+ _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
211
+ )
212
  return metrics, all_results
213
 
214
+
215
  def _eval_single_pred(result, test_ids, num_expected_commands):
216
  test_case_results = {k: "MISSING" for k in test_ids}
217
  if len(result["results"]) != num_expected_commands:
 
268
  p["results"] = [dataclasses.asdict(r) for r in p["results"]]
269
  p["test_cases"] = test_case_results
270
  p["outcome"] = outcome
271
+
272
+ lang = question["language"]
273
+ question_results[lang][p["qid"]].append(num_passed == len(test_case_results))
274
+ question_pct_pass[lang][p["qid"]].append(num_passed / len(test_case_results))
 
 
 
 
275
 
276
  out.append(p)
277
 
278
  return out, question_results, question_pct_pass
279
 
280
 
281
+ def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
282
  assert len(q_passed) == len(q_pcts)
283
+
284
  num_samples = np.zeros(len(q_passed))
285
  num_correct = np.zeros(len(q_passed))
286
  pcts_passed = np.zeros(len(q_passed))
287
+ for i, (k, v) in enumerate(q_passed.items()):
288
  num_samples[i] = len(v)
289
  num_correct[i] = sum(v)
290
  pcts_passed[i] = np.mean(q_pcts[k])
291
+
292
+ out = {
293
+ f"{lang}/pass@{k}": estimate_pass_at_k(num_samples, num_correct, k).mean()
294
+ for k in k_vals
295
+ }
296
+ out[f"{lang}/mean_pct_pass"] = np.mean(pcts_passed)
297
+
298
  return out
299
+
 
300
 
301
  def estimate_pass_at_k(num_samples, num_correct, k):
302
  """Estimates pass@k of each problem and returns them in an array."""
 
314
  num_samples_it = iter(num_samples)
315
 
316
  return np.array(
317
+ [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
 
 
 
318
  )