gabeorlanski committed
Commit: 0fb6838
Parent: a7f3790

Update bc_eval.py

Files changed (1): bc_eval.py (+10, -8)
bc_eval.py CHANGED
@@ -3,7 +3,7 @@ import itertools
 import os
 import re
 import tempfile
-from collections import defaultdict
+from collections import defaultdict, Counter
 from pathlib import Path
 
 import datasets
@@ -204,13 +204,13 @@ class BabelCodeEval(evaluate.Metric):
             garbage_collection_freq=gc_freq,
         )
 
-        all_results, q_passes, q_pct = _eval_predictions(results, question_map)
+        all_results, q_passes, q_pct, o_count = _eval_predictions(results, question_map)
 
         assert len(q_passes) == len(q_pct)
         metrics = {}
         for lang in q_passes:
             metrics.update(
-                _calculate_metrics(lang, q_passes[lang], q_pct[lang], k_vals=k)
+                _calculate_metrics(lang, q_passes[lang], q_pct[lang], o_count[lang], k_vals=k)
             )
         return metrics, all_results
 
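Note on the call site: each _calculate_metrics call returns a flat dict whose keys are namespaced by language, so the metrics.update(...) loop merges per-language results without collisions. Going only by the keys visible in this diff, the merged dict looks roughly like the sketch below; the "python" prefix and the PASSED label are illustrative assumptions, and any pass@k keys built earlier in _calculate_metrics are not shown here.

    # Illustrative shape only; values are made up.
    {
        "python/mean_pct_pass": 0.62,
        "python/median_pct_pass": 0.75,
        "python/pct_PASSED": 0.55,  # new in this commit: one key per outcome
    }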
 
@@ -258,7 +258,7 @@ def _eval_predictions(pred_results, question_map):
     out = []
     question_results = defaultdict(lambda: defaultdict(list))
     question_pct_pass = defaultdict(lambda: defaultdict(list))
-
+    outcome_counts = defaultdict(Counter)
     for p in pred_results:
         question = question_map[p["qid"]]
         test_cases = question["test_case_ids"]
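The new accumulator is a defaultdict(Counter), which is why Counter joins the import at the top of the file: both levels of the language -> outcome tally spring into existence on first use, so the loop body can increment blindly. A minimal sketch of the pattern, with toy language and outcome values that are not from the commit:

    from collections import Counter, defaultdict

    # Two-level tally: language -> outcome label -> count.
    outcome_counts = defaultdict(Counter)

    # Missing keys are created on first access; no setdefault needed.
    outcome_counts["python"]["PASSED"] += 1
    outcome_counts["python"]["TIMED_OUT"] += 1
    outcome_counts["go"]["PASSED"] += 1

    print(outcome_counts["python"])  # Counter({'PASSED': 1, 'TIMED_OUT': 1})

Counter also provides .items() directly, which the new reporting loop in _calculate_metrics relies on.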
@@ -275,13 +275,13 @@ def _eval_predictions(pred_results, question_map):
         lang = question["language"]
         question_results[lang][p["qid"]].append(num_passed == len(test_case_results))
         question_pct_pass[lang][p["qid"]].append(num_passed / len(test_case_results))
-
+        outcome_counts[lang][outcome] += 1
         out.append(p)
 
-    return out, question_results, question_pct_pass
+    return out, question_results, question_pct_pass, outcome_counts
 
 
-def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
+def _calculate_metrics(lang, q_passed, q_pcts, o_count, k_vals):
     assert len(q_passed) == len(q_pcts)
 
     num_samples = np.zeros(len(q_passed))
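The body of _calculate_metrics sits mostly outside this diff, but num_samples and the k_vals argument point at the usual per-question pass@k bookkeeping. For context, the sketch below is the standard unbiased pass@k estimator (Chen et al., 2021) that code-eval harnesses typically apply to per-question sample and pass counts; it is an assumption about the surrounding code, not the file's actual implementation.

    import numpy as np

    def pass_at_k(n: int, c: int, k: int) -> float:
        # Unbiased estimate of P(at least one of k samples passes),
        # given n total samples for a question, of which c passed.
        if n - c < k:
            return 1.0
        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))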
@@ -298,7 +298,9 @@ def _calculate_metrics(lang, q_passed, q_pcts, k_vals):
     }
     out[f"{lang}/mean_pct_pass"] = np.mean(pcts_passed)
     out[f"{lang}/median_pct_pass"] = np.median(pcts_passed)
-
+
+    for outcome, val in o_count.items():
+        out[f"{lang}/pct_{outcome}"] = val/len(q_passed)
 
     return out
 
 
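For a sense of what the new reporting loop emits, a toy run (outcome labels and the language prefix are assumptions, not from the commit). Note that val counts predictions while len(q_passed) counts questions, so the values read as fractions of predictions only when there is one sample per question.

    from collections import Counter

    # Toy numbers; outcome labels and the "python" prefix are assumed.
    o_count = Counter({"PASSED": 3, "TIMED_OUT": 2})
    num_questions = 5  # stands in for len(q_passed)

    out = {f"python/pct_{outcome}": val / num_questions
           for outcome, val in o_count.items()}
    print(out)  # {'python/pct_PASSED': 0.6, 'python/pct_TIMED_OUT': 0.4}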