"""Aggregate lm-evaluation-harness results into a single CSV file."""
import json
import os

import pandas as pd
import tyro
# Metric key to read from each lm-evaluation-harness task's results JSON.
TASK_METRICS = {
    "arc_challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa_mc": "mc2",
}

# Short, human-friendly column names used for each task in the output CSV.
TASK_SHORT_NAMES = {
    "arc_challenge": "arc",
    "hellaswag": "hellaswag",
    "truthfulqa_mc": "truthfulqa",
}
def main(data_dir: str, out_file: str = "score.csv") -> None:
    """Aggregate results from lm-evaluation-harness into a CSV file.

    Args:
        data_dir: The directory containing the results. Model names are
            expected to be the immediate subdirectories of `data_dir`.
        out_file: The path to the output CSV file. (Default: `score.csv`)
    """
    models = [d for d in os.listdir(data_dir) if os.path.isdir(f"{data_dir}/{d}")]
    df = pd.DataFrame(columns=TASK_SHORT_NAMES.values())
    for model_dir in models:
        # Directory names look like "org--model"; recover "org/model".
        # Hoisted out of the task loop: it depends only on model_dir.
        model_name = "/".join(model_dir.split("--")[-2:])
        for task, metric in TASK_METRICS.items():
            # Context manager closes the handle promptly; the original
            # `json.load(open(...))` leaked the file descriptor.
            with open(f"{data_dir}/{model_dir}/{task}", encoding="utf-8") as f:
                results = json.load(f)
            # Scores are stored as fractions; report them as percentages.
            df.loc[model_name, TASK_SHORT_NAMES[task]] = (
                float(results["results"][task][metric]) * 100.0
            )
    # Turn the model-name index into a regular "model" column.
    df = df.reset_index().rename(columns={"index": "model"})
    # Write the CSV file, creating the output directory if needed.
    if dirname := os.path.dirname(out_file):
        os.makedirs(dirname, exist_ok=True)
    df.to_csv(out_file, index=False)
if __name__ == "__main__":
    # Expose `main` as a CLI: tyro builds --data-dir/--out-file flags
    # from the function signature and docstring.
    tyro.cli(main)