Alpay Ariyak committed
Commit 42f9642
1 Parent(s): c56b450

Changed Bench Eval to report metrics correctly by split. Added total accuracy and renamed the previously used bench_accuracy to bench_average_accuracy. (#512)


* Added "eval_" prefix

* Added total bench accuracy and renamed the previous metric to bench_average_accuracy. Changed naming to use the bench_split value instead of a hard-coded eval_ prefix (see the sketch below for how the two accuracy metrics differ).
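The average and total accuracies differ whenever the benchmark subsets are of unequal size: the average is an unweighted mean of per-benchmark scores, while the total pools every reference and prediction before scoring. A minimal sketch of the difference, assuming the accuracy object in the callback is the Hugging Face evaluate accuracy metric; the benchmark names and labels below are made up for illustration:

# Illustration only: two hypothetical benchmark subsets of different sizes.
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

combined_bench_names = {
    "bench_small": {"refs": [1, 1], "preds": [1, 1]},  # 2/2 correct
    "bench_large": {"refs": [0, 1, 0, 1, 0, 1], "preds": [0, 0, 0, 0, 0, 0]},  # 3/6 correct
}

bench_scores, bench_refs, bench_preds = [], [], []
for name, data in combined_bench_names.items():
    score = accuracy.compute(references=data["refs"], predictions=data["preds"])["accuracy"]
    bench_scores.append(score)
    bench_refs.extend(data["refs"])
    bench_preds.extend(data["preds"])

# Unweighted mean over benchmarks -> logged as f"{bench_split}_bench_average_accuracy"
print(np.mean(bench_scores))  # (1.0 + 0.5) / 2 = 0.75
# Accuracy over all pooled examples -> logged as f"{bench_split}_bench_total_accuracy"
print(accuracy.compute(references=bench_refs, predictions=bench_preds)["accuracy"])  # 5/8 = 0.625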

Files changed (1)
  1. src/axolotl/utils/callbacks.py +11 -4
src/axolotl/utils/callbacks.py CHANGED
@@ -275,7 +275,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
             else:
                 dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
             bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
-            results = {"bench_loss": bench_loss}
+            results = {f"{bench_split}_bench_loss": bench_loss}
 
             # Combine results from all GPUs
             combined_bench_names: Dict[str, Dict[str, List]] = {}
@@ -287,6 +287,8 @@ def bench_eval_callback_factory(trainer, tokenizer):
                     combined_bench_names[name]["preds"].extend(data["preds"])
 
             bench_scores = []
+            bench_refs = []
+            bench_preds = []
             for (
                 bench_name
             ) in combined_bench_names:  # pylint: disable=consider-using-dict-items
@@ -294,15 +296,20 @@ def bench_eval_callback_factory(trainer, tokenizer):
                     references=combined_bench_names[bench_name]["refs"],
                     predictions=combined_bench_names[bench_name]["preds"],
                 )["accuracy"]
+                bench_refs.extend(combined_bench_names[bench_name]["refs"])
+                bench_preds.extend(combined_bench_names[bench_name]["preds"])
                 if not pd.isna(bench_score):
                     results[
-                        f"bench_{bench_split}_accuracy_{bench_name}"
+                        f"{bench_split}_bench_accuracy_{bench_name}"
                     ] = bench_score
                     bench_scores.append(bench_score)
                 else:
-                    results[f"bench_{bench_split}_accuracy_{bench_name}"] = 0.0
+                    results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
                     bench_scores.append(0.0)
-            results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores)
+            results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
+            results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
+                references=bench_refs, predictions=bench_preds
+            )["accuracy"]
             trainer.log(results)
 
     return BenchEvalCallback
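With this change, every metric key is prefixed with the benchmark split rather than a fixed eval_. A sketch of the logged results dict, assuming bench_split == "eval" and a single hypothetical benchmark named "mmlu" (values are placeholders; with only one benchmark the average and total accuracies coincide):

results = {
    "eval_bench_loss": 1.23,
    "eval_bench_accuracy_mmlu": 0.61,
    "eval_bench_average_accuracy": 0.61,  # np.mean over per-benchmark scores
    "eval_bench_total_accuracy": 0.61,    # accuracy over all pooled refs/preds
}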