Jae-Won Chung committed on
Commit
f98b171
1 Parent(s): 663521e

Add aggregate_nlp_metrics.py and more precise score.csv

Browse files
Files changed (2) hide show
  1. data/score.csv +19 -20
  2. scripts/aggregate_nlp_metrics.py +44 -0
data/score.csv CHANGED
@@ -1,21 +1,20 @@
1
  model,arc,hellaswag,truthfulqa
2
- lmsys/vicuna-7B,53.5,77.5,49.0
3
- lmsys/vicuna-13B,52.9,80.1,51.8
4
- tatsu-lab/alpaca-7B,52.6,76.9,39.6
5
- metaai/llama-7B,51.1,77.7,34.1
6
- metaai/llama-13B,56.3,80.9,39.9
7
- camel-ai/CAMEL-13B-Combined-Data,55.5,79.3,47.3
8
- BlinkDL/RWKV-4-Raven-7B-v12-Eng98%-Other2%-20230521-ctx8192.pth,NaN,NaN,NaN
9
- databricks/dolly-v2-12b,42.2,71.8,33.4
10
- FreedomIntelligence/phoenix-inst-chat-7b,45.0,63.2,47.1
11
- h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,36.9,61.6,37.9
12
- lmsys/fastchat-t5-3b-v1.0,35.9,46.4,48.8
13
- Neutralzz/BiLLa-7B-SFT,27.7,26.0,49.0
14
- nomic-ai/gpt4all-13b-snoozy,56.1,78.7,48.4
15
- openaccess-ai-collective/manticore-13b-chat-pyg,58.7,82.0,48.9
16
- OpenAssistant/oasst-sft-1-pythia-12b,45.6,69.9,39.2
17
- project-baize/baize-v2-7B,48.5,75.0,41.7
18
- BAIR/koala-7b,47.1,73.7,46.0
19
- BAIR/koala-13b,52.9,77.5,50.1
20
- StabilityAI/stablelm-tuned-alpha-7b,31.9,53.6,40.2
21
- togethercomputer/RedPajama-INCITE-7B-Chat,42.2,70.8,36.1
 
1
  model,arc,hellaswag,truthfulqa
2
+ BAIR/koala-13b,52.901023890784984,77.54431388169687,50.091065219059125
3
+ BAIR/koala-7b,47.098976109215016,73.70045807608047,45.997635958147875
4
+ lmsys/vicuna-7B,53.49829351535836,77.53435570603465,48.997614637055264
5
+ metaai/llama-13B,56.31399317406144,80.8603863772157,39.90298264801161
6
+ tatsu-lab/alpaca-7B,52.64505119453925,76.90699063931487,39.552770976749336
7
+ OpenAssistant/oasst-sft-1-pythia-12b,45.563139931740615,69.92630950009958,39.1893543136912
8
+ databricks/dolly-v2-12b,42.15017064846416,71.82832105158336,33.37136000408915
9
+ h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2,36.86006825938566,61.551483768173675,37.9421602393762
10
+ lmsys/fastchat-t5-3b-v1.0,35.92150170648464,46.355307707627965,48.787610045893985
11
+ nomic-ai/gpt4all-13b-snoozy,56.058020477815695,78.68950408285203,48.35948664919701
12
+ openaccess-ai-collective/manticore-13b-chat-pyg,58.703071672354945,81.95578570005975,48.86009773651491
13
+ lmsys/vicuna-13B,52.901023890784984,80.12348137821151,51.81653185716687
14
+ metaai/llama-7B,51.10921501706485,77.74347739494125,34.0786227034917
15
+ StabilityAI/stablelm-tuned-alpha-7b,31.91126279863481,53.59490141406095,40.22458364155103
16
+ project-baize/baize-v2-7B,48.4641638225256,75.00497908783112,41.66264911575524
17
+ FreedomIntelligence/phoenix-inst-chat-7b,44.965870307167236,63.2244572794264,47.084372288512725
18
+ camel-ai/CAMEL-13B-Combined-Data,55.54607508532423,79.29695279824736,47.33219922854091
19
+ Neutralzz/BiLLa-7B-SFT,27.730375426621162,26.04062935670185,49.045640164325754
20
+ togethercomputer/RedPajama-INCITE-7B-Chat,42.15017064846416,70.8424616610237,36.10055989611241
 
scripts/aggregate_nlp_metrics.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import tyro
5
+ import pandas as pd
6
+
7
# Mapping from lm-evaluation-harness task name -> metric key to extract
# from that task's results JSON.
TASK_METRICS = {
    "arc_challenge": "acc_norm",
    "hellaswag": "acc_norm",
    "truthfulqa_mc": "mc2",
}

# Mapping from lm-evaluation-harness task name -> short column name used
# in the output CSV. Insertion order determines the CSV column order.
TASK_SHORT_NAMES = {
    "arc_challenge": "arc",
    "hellaswag": "hellaswag",
    "truthfulqa_mc": "truthfulqa",
}


def main(data_dir: str, out_file: str = "score.csv") -> None:
    """Aggregate results from lm-evaluation-harness into a CSV file.

    For each model subdirectory of `data_dir`, the per-task results are
    read from JSON files named after the task (e.g. `arc_challenge`),
    and the selected metric (see `TASK_METRICS`) is written to the CSV
    as a percentage.

    Args:
        data_dir: The directory containing the results. Model names are
            expected to be the immediate subdirectories of `data_dir`.
        out_file: The path to the output CSV file. (Default: `score.csv`)
    """
    # Only immediate subdirectories are models; skip stray files.
    models = [
        entry
        for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    ]

    df = pd.DataFrame(columns=list(TASK_SHORT_NAMES.values()))
    for model_dir in models:
        # Directory names look like `org--model`; recover `org/model`.
        # Hoisted out of the task loop -- it only depends on `model_dir`.
        model_name = "/".join(model_dir.split("--")[-2:])
        for task, metric in TASK_METRICS.items():
            # Context manager ensures the file handle is closed
            # (the previous `json.load(open(...))` leaked handles).
            with open(os.path.join(data_dir, model_dir, task)) as f:
                results = json.load(f)
            # Harness metrics are fractions in [0, 1]; report percentages.
            df.loc[model_name, TASK_SHORT_NAMES[task]] = (
                float(results["results"][task][metric]) * 100.0
            )
    df = df.reset_index().rename(columns={"index": "model"})

    # Write the CSV file, creating the output directory if the path has one.
    if dirname := os.path.dirname(out_file):
        os.makedirs(dirname, exist_ok=True)
    df.to_csv(out_file, index=False)
42
+
43
# Script entry point: tyro builds a command-line interface from
# `main`'s signature (flags for `data_dir` and `--out-file`).
if __name__ == "__main__":
    tyro.cli(main)