import os
import json
from collections import OrderedDict

def insert_sorted_acc_fields(result_dict):
    """Return an OrderedDict with the "model" key first, followed by all
    accuracy fields (keys ending in "_acc", including "avg_acc") in
    alphabetical order, then any remaining fields such as "avg_metadata"."""
    # Pop every accuracy field; the remaining keys keep their relative order.
    acc_fields = {
        k: result_dict.pop(k) for k in list(result_dict.keys())
        if k != "model" and k.endswith("_acc")
    }

    sorted_acc_fields = dict(sorted(acc_fields.items()))

    reordered = OrderedDict()
    reordered["model"] = result_dict["model"]
    reordered.update(sorted_acc_fields)
    reordered.update(result_dict)

    return reordered
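
# Illustrative sketch (never called by the script): shows how the accuracy
# fields of one hypothetical result row are regrouped directly after "model".
def _example_insert_sorted_acc_fields():
    row = OrderedDict([
        ("model", "7B_example_run"),  # hypothetical model name
        ("gsm8k_acc", 88.1),
        ("aime24_acc", 10.0),
        ("avg_acc", 49.1),
        ("avg_metadata", {"num_benchmarks": 2, "benchmarks": ["gsm8k", "aime24"]}),
    ])
    reordered = insert_sorted_acc_fields(row)
    # Resulting key order: model, aime24_acc, avg_acc, gsm8k_acc, avg_metadata
    return reordered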

def convert_latex_table(data, selected_data=None):
    """
    Convert a list of result dicts into a LaTeX table, sorted by descending
    average accuracy over the selected metrics.

    Args:
        data (List[Dict]): JSON-like list of result dicts.
        selected_data (List[str], optional):
            Metric names _without_ the '_acc' suffix to include,
            e.g. ['aime24', 'amc23', 'hmmt_2024'].
            Defaults to all metrics found in data except 'avg_acc'.

    Returns:
        str: the LaTeX code for the table.
    """
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith('_acc') and k != "avg_acc"
        )

    # Build one (model, per-metric values, average) row per result dict.
    rows = []
    for item in data:
        model_name = item["model"].replace("_temp0_n1_seed2", "")
        vals = []
        for metric in selected_data:
            key = f"{metric}_acc"
            vals.append(float(item.get(key, 0.0)))
        avg_selected = sum(vals) / len(vals) if vals else 0.0
        # Shorten checkpoint names, but leave the plain base model untouched.
        if model_name != "Qwen2.5-7B":
            model_name = (model_name.replace("Qwen2.5-7B", "7B")
                                    .replace("_stage1", "")
                                    .replace("qwen2.5-7b", "7B"))
        model_name = model_name.replace("_", "\\_")
        rows.append((model_name, vals, avg_selected))

    # Sort rows by the average over the selected metrics, best first.
    rows.sort(key=lambda x: x[2], reverse=True)

    # Column spec: one left-aligned model column, then one right-aligned
    # column per metric plus one for the average.
    col_spec = "l" + "r" * (len(selected_data) + 1)
    header = ["Model"] + [m.replace("_", r"\_") for m in selected_data] + ["Avg"]
    header = " & ".join(header) + r" \\"
    # Abbreviate long benchmark names so the table fits the page.
    header = (header.replace("livemathbench", "livemath")
                    .replace("olympiadbench", "olympiad")
                    .replace("minerva\\_math", "minerva")
                    .replace("hmmt\\_2024", "hmmt24"))

    lines = []
    lines.append(r"\begin{table}[ht]")
    lines.append(r"\centering")
    lines.append(rf"\begin{{tabular}}{{{col_spec}}}")
    lines.append(r"\toprule")
    lines.append(header)
    lines.append(r"\midrule")
    for model, vals, avg in rows:
        formatted = [f"{v:.1f}" for v in vals] + [f"{avg:.1f}"]
        lines.append(" & ".join([model] + formatted) + r" \\")
    lines.append(r"\bottomrule")
    lines.append(r"\end{tabular}")
    lines.append(r"\caption{Model accuracies on selected benchmarks, sorted by average}")
    lines.append(r"\label{tab:acc_sorted}")
    lines.append(r"\end{table}")

    return "\n".join(lines)
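
# Illustrative sketch (never called by the script): renders a small LaTeX table
# for two hypothetical result rows, restricted to two benchmarks.
def _example_convert_latex_table():
    sample = [
        {"model": "7B_rl_run_temp0_n1_seed2",
         "aime24_acc": 13.3, "amc23_acc": 57.5, "avg_acc": 35.4},
        {"model": "Qwen2.5-7B",
         "aime24_acc": 6.7, "amc23_acc": 50.0, "avg_acc": 28.4},
    ]
    # Only aime24 and amc23 appear in the table; rows sort by their average.
    return convert_latex_table(sample, selected_data=["aime24", "amc23"])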

def compute_method_ranks(data, selected_models=None, selected_data=None):
    """
    Compute, for each metric, the rank of each model (1 = best accuracy).

    Args:
        data (List[Dict]): JSON-like list of result dicts.
        selected_models (List[str], optional):
            Clean model names (with "_temp0_n1_seed2" already stripped)
            whose ranks you care about. If None, returns ranks for _all_ models.
        selected_data (List[str], optional):
            Metric names _without_ the "_acc" suffix. If None,
            defaults to all keys ending in "_acc" except "avg_acc".

    Returns:
        Dict[str, Dict[str, int]]:
            Outer: metric →
            Inner: model_name → rank (1 = highest accuracy)
    """
    if selected_data is None:
        selected_data = sorted(
            k[:-4] for k in data[0].keys()
            if k.endswith("_acc") and k != "avg_acc"
        )

    # Strip the sampling suffix so ranks are keyed by clean model names.
    models = []
    for item in data:
        clean_name = item["model"].replace("_temp0_n1_seed2", "")
        models.append((clean_name, item))

    all_ranks = {}
    for metric in selected_data:
        key = f"{metric}_acc"

        vals = [
            (name, float(item.get(key, 0.0)))
            for name, item in models
        ]

        vals.sort(key=lambda x: x[1], reverse=True)

        # Competition ranking: tied scores share the same rank and the
        # following rank is skipped (e.g. 1, 1, 3).
        ranks = {}
        prev_score = None
        prev_rank = 0
        for idx, (name, score) in enumerate(vals, start=1):
            if score == prev_score:
                rank = prev_rank
            else:
                rank = idx
            ranks[name] = rank
            prev_score, prev_rank = score, rank

        # Optionally keep only the models the caller asked about.
        if selected_models is not None:
            ranks = {m: ranks[m] for m in selected_models if m in ranks}

        all_ranks[metric] = ranks

    return all_ranks
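
# Illustrative sketch (never called by the script): ranks two hypothetical
# models on two metrics; ties share the better rank (competition ranking).
def _example_compute_method_ranks():
    sample = [
        {"model": "7B_run_a_temp0_n1_seed2", "aime24_acc": 13.3, "amc23_acc": 50.0},
        {"model": "7B_run_b_temp0_n1_seed2", "aime24_acc": 6.7, "amc23_acc": 50.0},
    ]
    ranks = compute_method_ranks(sample, selected_data=["aime24", "amc23"])
    # ranks == {"aime24": {"7B_run_a": 1, "7B_run_b": 2},
    #           "amc23":  {"7B_run_a": 1, "7B_run_b": 1}}
    return ranks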

def collect_eval_results_by_prefix(root):
    """Walk root/<model>/eval_results/<global_step>/<benchmark>/*_metrics.json,
    collect per-benchmark accuracies for every model directory, add an average,
    and write the combined list to root/combined_eval_results.json."""
    all_results = []

    for model_dir in os.listdir(root):
        model_path = os.path.join(root, model_dir)
        if not os.path.isdir(model_path):
            continue

        eval_results_dir = os.path.join(model_path, "eval_results")
        if not os.path.isdir(eval_results_dir):
            print(f"⚠️ Missing eval_results directory for: {model_dir}")
            continue

        global_step_dirs = [d for d in os.listdir(eval_results_dir)
                            if os.path.isdir(os.path.join(eval_results_dir, d))]
        if not global_step_dirs:
            print(f"⚠️ No global step directories found in: {eval_results_dir}")
            continue

        # Use the first global-step directory found for this model.
        global_step_dir = os.path.join(eval_results_dir, global_step_dirs[0])

        result = OrderedDict()
        result["model"] = model_dir

        benchmark_dirs = [d for d in os.listdir(global_step_dir)
                          if os.path.isdir(os.path.join(global_step_dir, d))]

        for benchmark in benchmark_dirs:
            # Skip livemath benchmarks entirely.
            if "livemath" in benchmark:
                continue
            benchmark_path = os.path.join(global_step_dir, benchmark)

            metrics_files = [f for f in os.listdir(benchmark_path) if f.endswith('_metrics.json')]
            if not metrics_files:
                print(f"⚠️ No metrics file found for {model_dir}/{benchmark}")
                continue

            metrics_file = os.path.join(benchmark_path, metrics_files[0])

            try:
                with open(metrics_file, 'r') as f:
                    metrics_data = json.load(f)

                if 'acc' in metrics_data:
                    result[f"{benchmark}_acc"] = metrics_data['acc']
                else:
                    print(f"⚠️ No accuracy found in {metrics_file}")
            except Exception as e:
                print(f"⚠️ Error reading {metrics_file}: {e}")

        # Only keep models for which at least one accuracy was found.
        if len(result) > 1:
            acc_values = [v for k, v in result.items() if k.endswith('_acc')]
            if acc_values:
                avg_acc = sum(acc_values) / len(acc_values)
                result["avg_acc"] = round(avg_acc, 1)

                result["avg_metadata"] = {
                    "num_benchmarks": len(acc_values),
                    "benchmarks": [k[:-4] for k in result.keys()
                                   if k.endswith('_acc') and k != "avg_acc"]
                }

            result = insert_sorted_acc_fields(result)
            all_results.append(result)
        else:
            print(f"⚠️ No accuracies found for {model_dir}")

    all_results.sort(key=lambda x: x["model"])
    output_path = os.path.join(root, "combined_eval_results.json")
    with open(output_path, "w") as f:
        json.dump(all_results, f, indent=2)

    print(f"✅ Saved structured JSON to {output_path}")
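
# Illustrative sketch (not wired into the script): one way to chain the two
# steps, loading the combined JSON that collect_eval_results_by_prefix writes
# and printing a LaTeX table for it.
def _example_print_combined_table(root="./EVAL/checkpoints"):
    with open(os.path.join(root, "combined_eval_results.json")) as f:
        combined = json.load(f)
    print(convert_latex_table(combined))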

if __name__ == "__main__":
    collect_eval_results_by_prefix("./EVAL/checkpoints")