diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..840d929a00830d99ebd744bf3ad1d39228cc0963 --- /dev/null +++ b/app.py @@ -0,0 +1,110 @@ +import os +import json +import glob +from collections import defaultdict +import gradio as gr + + +ARC = "arc_challenge" +HELLASWAG = "hellaswag" +MMLU = "mmlu" +TRUTHFULQA = "truthfulqa-mc" +BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] + +METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] + + +def collect_results(): + performance_dict = defaultdict(dict) + pretrained_models = set() + for file in glob.glob('evals/*/*.json'): + with open(file, 'r') as f: + data = json.load(f) + if 'results' not in data: + continue + if 'config' not in data: + continue + results = data['results'] + config = data['config'] + if 'model_args' not in config: + continue + + model_args = config['model_args'].split(',') + pretrained = [x for x in model_args if x.startswith('pretrained=')] + if len(pretrained) != 1: + continue + pretrained = pretrained[0].split('=')[1] + pretrained = pretrained.split('/')[-1] + pretrained_models.add(pretrained) + + for lang_task, perfs in results.items(): + lang, task = None, None # reset each iteration so unrecognized tasks are skipped below + if lang_task.startswith('arc_') and lang_task.endswith('_challenge'): + lang = lang_task.split('_')[1] + task = ARC + elif lang_task.startswith('hellaswag_'): + _, lang = lang_task.split('_') + task = HELLASWAG + elif lang_task.startswith('mmlu_'): + _, lang = lang_task.split('_') + task = MMLU + elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'): + lang = lang_task.split('_')[1] + task = TRUTHFULQA + + if lang and task: + metric = METRICS[BENCHMARKS.index(task)] + p = round(perfs[metric] * 100, 1) + performance_dict[(pretrained, lang)][task] = p + return performance_dict, pretrained_models + + +def get_leaderboard_df(performance_dict, pretrained_models): + df = list() + for (pretrained, lang), perfs in performance_dict.items(): + arc_perf = perfs.get(ARC, 0.0) + hellaswag_perf = perfs.get(HELLASWAG, 0.0) + mmlu_perf = perfs.get(MMLU, 0.0) + truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0) + + if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0: + continue + avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1) + row = [pretrained, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf] + df.append(row) + return df + + +MODEL_COL = "Model" +LANG_COL = "Language" +AVERAGE_COL = "Average" +ARC_COL = "ARC (25-shot)" +HELLASWAG_COL = "HellaSwag (10-shot)" +MMLU_COL = "MMLU (5-shot)" +TRUTHFULQA_COL = "TruthfulQA (0-shot)" + +COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL] +TYPES = ["str", "str", "number", "number", "number", "number", "number"] + +args = collect_results() +leaderboard_df = get_leaderboard_df(*args) + +demo = gr.Blocks() +with demo: + gr.HTML('Open Multilingual Large Language Model Evaluation Leaderboard') + gr.Markdown('INTRODUCTION TEXT', elem_classes="markdown-text") + + with gr.Box(): + search_bar = gr.Textbox( + placeholder="Search models...", show_label=False, elem_id="search-bar" + ) + + leaderboard_table = gr.components.Dataframe( + value=leaderboard_df, + headers=COLS, + datatype=TYPES, + max_rows=5, + elem_id="leaderboard-table", + ) + +demo.launch() diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..f11ea3c48ac461ea8df812ba639e5871955a3481 --- /dev/null +++ 
b/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.22818791946308725, + "acc_stderr": 0.02435139725761051, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615872 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-560.json b/evals/arc-challenge/arc_ar_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..49fe745a2caa93a57a99f2a5d13b829f8544cd13 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.2550335570469799, + "acc_stderr": 0.025292327380712708, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712708 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b79172a73e91dbbf21909686c17e2c23c1f18bef --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.28187919463087246, + "acc_stderr": 0.026106703750007426, + "acc_norm": 0.3087248322147651, + "acc_norm_stderr": 0.026806063072940547 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-large.json b/evals/arc-challenge/arc_ar_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..f1aadc6691007c31ca76e985257d9ebfbffa04c5 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.20134228187919462, + "acc_stderr": 0.023268565767685306, + "acc_norm": 0.21476510067114093, + "acc_norm_stderr": 0.023828868848284352 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..db628063ccf012f4301410acf74c6449499d4a18 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.19463087248322147, + "acc_stderr": 0.022973392306598162, + "acc_norm": 0.21140939597315436, + "acc_norm_stderr": 0.02369243605357901 + } + }, + "versions": 
{ + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2.json b/evals/arc-challenge/arc_ar_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..5deb8a5f49f36a08688564ca109ad5160192b56e --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.20134228187919462, + "acc_stderr": 0.023268565767685313, + "acc_norm": 0.22483221476510068, + "acc_norm_stderr": 0.024224169829650755 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_llama-7B.json b/evals/arc-challenge/arc_ar_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b5a76fae32ffadeb87c9a634cef2c6de55e923 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028477 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..fa55573d46ebd614a4feb5a1aac46df0effefe2f --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.20945945945945946, + "acc_stderr": 0.023691963473475724, + "acc_norm": 0.2533783783783784, + "acc_norm_stderr": 0.025323518629100008 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-560.json b/evals/arc-challenge/arc_bn_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..389eeb09c0a92f6b7861501b6a3e0b9caff08e3e --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.024491712953916975, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.025096383517594287 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7cf6ca71cd6f8268d0ed709fbff3ff9aa1aa20f9 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.02486094967084638, + "acc_norm": 0.28040540540540543, + "acc_norm_stderr": 0.026153277917823237 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-large.json b/evals/arc-challenge/arc_bn_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..6b36e33e7bf7866400a4c7d058836627255b75a8 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.2195945945945946, + "acc_stderr": 0.024102381106046785, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257924 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..69dd44fcae67f0511715af28d9a6762dc0732634 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.20608108108108109, + "acc_stderr": 0.02355028295929425, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.02509638351759427 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2.json b/evals/arc-challenge/arc_bn_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..2de0f9a7b900ac9accabd3ade0c8a4d14d7fda03 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920555, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257917 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_llama-7B.json b/evals/arc-challenge/arc_bn_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a3dbec93edb13b0fdf7c70d9a22d0f709e0a25b2 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { 
+ "arc_bn_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920565, + "acc_norm": 0.26013513513513514, + "acc_norm_stderr": 0.02554257639364025 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..80c6381676cf5f4508fe26a2e71b75de9f5857f5 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-560.json b/evals/arc-challenge/arc_ca_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..74ea721d64eabef94a72533148cf4d15946ea667 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.2053872053872054, + "acc_stderr": 0.02348110951859932, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..828e5442ee5f197e68f640cec0d3f5a4d2190a86 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.3164983164983165, + "acc_stderr": 0.02703395838420779, + "acc_norm": 0.3434343434343434, + "acc_norm_stderr": 0.0276003816062635 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-large.json b/evals/arc-challenge/arc_ca_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..1d1333c44929e8c397db2c9c89aa32f6c849e02f --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.02362258775627148, + "acc_norm": 0.22895622895622897, + "acc_norm_stderr": 0.02442136264227106 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + 
"model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..b9427197beac9ba8529aa3e8014b5dee0307e089 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271473, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.023761611918761673 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2.json b/evals/arc-challenge/arc_ca_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ebfd334ce3c7fa9305ddb2650d0c9ed8d727ac --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.21885521885521886, + "acc_stderr": 0.024032467624412215, + "acc_norm": 0.21885521885521886, + "acc_norm_stderr": 0.02403246762441221 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_llama-7B.json b/evals/arc-challenge/arc_ca_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5b79736bea0e6806983af2b1d26982bb71d2169c --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.29292929292929293, + "acc_stderr": 0.026452514969665927, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.02645251496966592 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-1b7.json b/evals/arc-challenge/arc_da_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ad507f37ee73db4c175fcd2ff76b2949c5186f12 --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.02557802773320011 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-560.json 
b/evals/arc-challenge/arc_da_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..76c97cf086a3d4eb479d7ea19745c4f301127a2e --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.025471492792791667, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-7b1.json b/evals/arc-challenge/arc_da_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..38cbbb63a1aa857301e47a632ca28cb48df2b26a --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.02490893747050877, + "acc_norm": 0.24915824915824916, + "acc_norm_stderr": 0.025140041284626418 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-large.json b/evals/arc-challenge/arc_da_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ee21dc7b9e87604443ebe5bc43e5cd6006ac8a --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.23232323232323232, + "acc_stderr": 0.02454650495612789, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.024908937470508753 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-medium.json b/evals/arc-challenge/arc_da_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..df7aa6d8d8bffd69ae15219bdb1f31971d2146b7 --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.0250255213842353, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.025886127156886297 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2.json b/evals/arc-challenge/arc_da_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e06d761ac718567edd82446e7cab3db268352caf --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 
0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.024790260423468984 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_llama-7B.json b/evals/arc-challenge/arc_da_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0669687f3d0755614d71660a1b71b9c1d16c99af --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.3063973063973064, + "acc_stderr": 0.026794891419479452, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.02746823841289221 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-1b7.json b/evals/arc-challenge/arc_de_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..2c10bc700c0ecb2dfc8bde73b2f3f18879be1571 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898946, + "acc_norm": 0.2953020134228188, + "acc_norm_stderr": 0.026470155629081085 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-560.json b/evals/arc-challenge/arc_de_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..0c23e9b1eaef780d6a824e7c0f623556d950ca89 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007426 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-7b1.json b/evals/arc-challenge/arc_de_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..477d702b1bc9eee6d2f6b2ada459a35f84ed90e2 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2684563758389262, + "acc_stderr": 0.0257145395148175, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.0257145395148175 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + 
"batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-large.json b/evals/arc-challenge/arc_de_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..2bc523b2a951a72b3cd9a3ca1f364c1880010ab0 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159625, + "acc_norm": 0.27181208053691275, + "acc_norm_stderr": 0.025815342279487567 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-medium.json b/evals/arc-challenge/arc_de_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..45b24780309957f9064133758d7f8cccdb182f96 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159625, + "acc_norm": 0.28859060402684567, + "acc_norm_stderr": 0.026291942108676806 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2.json b/evals/arc-challenge/arc_de_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..dcac4b017ab401c82005ea115725c223d14f4bbb --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.21140939597315436, + "acc_norm_stderr": 0.02369243605357901 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_llama-7B.json b/evals/arc-challenge/arc_de_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb6300f14d8c556143f550509be7862841dc7c6 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2785234899328859, + "acc_stderr": 0.0260114035784859, + "acc_norm": 0.348993288590604, + "acc_norm_stderr": 0.027658144793750224 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-1b7.json b/evals/arc-challenge/arc_es_challenge_bloom-1b7.json new file mode 100644 index 
0000000000000000000000000000000000000000..74eba78a722fcedb488ec904b2f0d58171c8a749 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.2895622895622896, + "acc_norm_stderr": 0.026362594432681956 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-560.json b/evals/arc-challenge/arc_es_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..f03023ac512f6466bc05adcbbd4b74fafdb0701e --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.2356902356902357, + "acc_norm_stderr": 0.02466946003490764 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-7b1.json b/evals/arc-challenge/arc_es_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..42cce52cd279c31092e728aadcc63cb1e0a04b59 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.3265993265993266, + "acc_stderr": 0.027258287015652305, + "acc_norm": 0.3602693602693603, + "acc_norm_stderr": 0.02790399493827167 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-large.json b/evals/arc-challenge/arc_es_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..8889a96dc89f373c32d03d03beba715496d3c5cf --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.024164379788935483, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.02557802773320012 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-medium.json b/evals/arc-challenge/arc_es_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..292e3ed1cc0e8b1b1063554055397c13de7ff5f7 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.1919191919191919, + "acc_stderr": 0.022889733897083934, + 
"acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.02525252525252536 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2.json b/evals/arc-challenge/arc_es_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e71f05e3b44a477a0c85e997c61776163460f160 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322127, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.0250255213842353 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_llama-7B.json b/evals/arc-challenge/arc_es_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0fab72d1a1f2e4fd24095bb5ec61c4a1d8f08aee --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.3501683501683502, + "acc_stderr": 0.027726370308831506, + "acc_norm": 0.3602693602693603, + "acc_norm_stderr": 0.02790399493827167 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ec1113a347e63807533e24faa9f8f1133a725ba3 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.22377622377622378, + "acc_stderr": 0.02468755105337312, + "acc_norm": 0.2517482517482518, + "acc_norm_stderr": 0.02570896966075011 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-560.json b/evals/arc-challenge/arc_eu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..d21d146ef31af9e17f56082cab45ffcd1938858f --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.24475524475524477, + "acc_stderr": 0.02546756553847068, + "acc_norm": 0.19230769230769232, + "acc_norm_stderr": 0.023345268410264786 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c3fd12b9223764b5f572dbfa37a6903f058c5e --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.23076923076923078, + "acc_stderr": 0.024957141712425013, + "acc_norm": 0.24125874125874125, + "acc_norm_stderr": 0.025343462496583764 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-large.json b/evals/arc-challenge/arc_eu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..1ca1581ef49b197cacfd25186739d7697494240c --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.25874125874125875, + "acc_stderr": 0.02594151450124707, + "acc_norm": 0.24125874125874125, + "acc_norm_stderr": 0.025343462496583737 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcb0f103e4f8b17826dc742c5e2fd7760677501 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.2762237762237762, + "acc_stderr": 0.026485626798716442, + "acc_norm": 0.25874125874125875, + "acc_norm_stderr": 0.025941514501247064 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2.json b/evals/arc-challenge/arc_eu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a6f7747e337535ab8fba538b1b3e6292e596be8 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.2762237762237762, + "acc_stderr": 0.026485626798716456, + "acc_norm": 0.24825174825174826, + "acc_norm_stderr": 0.025589390464738234 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_llama-7B.json b/evals/arc-challenge/arc_eu_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..748beb769c74d6f45c8e93c5a0151df8949243d5 --- /dev/null +++ 
b/evals/arc-challenge/arc_eu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.26223776223776224, + "acc_stderr": 0.026054539173797044, + "acc_norm": 0.23426573426573427, + "acc_norm_stderr": 0.02508828621716978 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..e45f16627cad6e7f9c00c5e957f834e5d38c0364 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2550335570469799, + "acc_stderr": 0.025292327380712687, + "acc_norm": 0.2953020134228188, + "acc_norm_stderr": 0.026470155629081078 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-560.json b/evals/arc-challenge/arc_fr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..c6a22e37448b26cc7b45d56b9eb1cb9358ea8a34 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e7fc02c83acce1c27f68cacb276ebf9d1038459b --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.36577181208053694, + "acc_stderr": 0.027947930997299652, + "acc_norm": 0.3825503355704698, + "acc_norm_stderr": 0.02820115194087938 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-large.json b/evals/arc-challenge/arc_fr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..9aae5d2ce6adfb2eb44ca3f0cdc1108895cd0a83 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.1912751677852349, + "acc_stderr": 0.02282188225534101, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + 
} + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..465234e97d674cd00fa45996ea2f08a2d3e81dff --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2181208053691275, + "acc_stderr": 0.023962942745646792, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485918 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2.json b/evals/arc-challenge/arc_fr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e91d18eac5ed9bf7def9d899e70e9280a10d994 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264107, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_llama-7B.json b/evals/arc-challenge/arc_fr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..289f9e2b1689351de784a6a0a22e47ebaa0bcc28 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.3523489932885906, + "acc_stderr": 0.027719080218117063, + "acc_norm": 0.3422818791946309, + "acc_norm_stderr": 0.027531738303985358 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..a68c6f6a88aaab21388ac0f6f47a96fcad831091 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.23693379790940766, + "acc_stderr": 0.02514268188080883, + "acc_norm": 0.2613240418118467, + "acc_norm_stderr": 0.025979671112800046 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_gu_challenge_bloom-560.json b/evals/arc-challenge/arc_gu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..8e1e6a4854fc92fa9250450b250a4769a4c3586d --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.21951219512195122, + "acc_stderr": 0.0244753759026465, + "acc_norm": 0.25435540069686413, + "acc_norm_stderr": 0.025751551710541783 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..920acb43e2275592dbf6351e0ee175bbb1a322c1 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.23693379790940766, + "acc_stderr": 0.02514268188080883, + "acc_norm": 0.23693379790940766, + "acc_norm_stderr": 0.025142681880808825 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-large.json b/evals/arc-challenge/arc_gu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c441954523c6d4bea5cc1b2cba0305b6c41fee49 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.22996515679442509, + "acc_stderr": 0.02488302588342452, + "acc_norm": 0.23693379790940766, + "acc_norm_stderr": 0.025142681880808832 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaeca4ab77d4bf203d3bf29e50b2c3f50320f78 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.2229965156794425, + "acc_stderr": 0.02461373413263406, + "acc_norm": 0.2508710801393728, + "acc_norm_stderr": 0.02563424701238326 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2.json b/evals/arc-challenge/arc_gu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a988ac9706a7406299e0de78b92c41a2151d0204 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2.json @@ -0,0 +1,23 @@ 
+{ + "results": { + "arc_gu_challenge": { + "acc": 0.22996515679442509, + "acc_stderr": 0.024883025883424517, + "acc_norm": 0.24390243902439024, + "acc_norm_stderr": 0.025392997717581856 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_llama-7B.json b/evals/arc-challenge/arc_gu_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..12e906c731a45f8bd9b92a525fa2d3edc9a6f62e --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.20557491289198607, + "acc_stderr": 0.023896181928798988, + "acc_norm": 0.26480836236933797, + "acc_norm_stderr": 0.026090542561414385 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..474da43c63438f6e87405fb3780c9b001241b895 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.21140939597315436, + "acc_stderr": 0.02369243605357901, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159625 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-560.json b/evals/arc-challenge/arc_hi_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..1606ed0007915536346cb01b3395ab2cb67b09a9 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.19798657718120805, + "acc_stderr": 0.023122269968056355, + "acc_norm": 0.2181208053691275, + "acc_norm_stderr": 0.023962942745646806 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5660d5853f1219cfdbd0d886a4fccd9e6a3ab2b --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.25838926174496646, + "acc_stderr": 0.025400777524610105, + "acc_norm": 0.29194630872483224, + "acc_norm_stderr": 0.026381917944561784 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": 
"pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-large.json b/evals/arc-challenge/arc_hi_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..e6870360e984b19d105ccc86592d36a7564ff98a --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.22818791946308725, + "acc_stderr": 0.024351397257610513, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..f64cba429b30075841311a50303cbff1487551af --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.27181208053691275, + "acc_norm_stderr": 0.025815342279487567 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2.json b/evals/arc-challenge/arc_hi_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ccb8fb7bd3bc4c523ed703b76c3d2526c010107 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.2181208053691275, + "acc_stderr": 0.023962942745646785, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485925 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_llama-7B.json b/evals/arc-challenge/arc_hi_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..90d5c1ec99c8e977e4997800431e69a1dc078659 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.20469798657718122, + "acc_stderr": 0.02341232810510543, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json 
b/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ea79c0ffc6047bb74b51d401771a577f7b2a2e --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.025025521384235302, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.025471492792791692 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-560.json b/evals/arc-challenge/arc_hr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..d0388389e9fdfe66978f0bb663af6b9c14905b74 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322117, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801963 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..27a6b5e7862ae33a52b4fcee86a333d1819e8514 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.02479026042346899, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.026540687854980666 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-large.json b/evals/arc-challenge/arc_hr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..daac6d38e4cc4974c0a8b524053297e0971694a9 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.0227352759557704, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.02429399929295737 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..b69e7a89e1d024529a1ccfa184f0ed211ab024e6 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + 
"arc_hr_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.0227352759557704, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.024293999292957367 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2.json b/evals/arc-challenge/arc_hr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..d27da666a194a216383a01fe3c520895dbaada29 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.19528619528619529, + "acc_stderr": 0.02304149438665811, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050875 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_llama-7B.json b/evals/arc-challenge/arc_hr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cc0a77d97f36393c01b3325f7f341ed832c808cb --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.2996632996632997, + "acc_stderr": 0.026627130450114996, + "acc_norm": 0.3468013468013468, + "acc_norm_stderr": 0.027664139917201607 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..d6ee518fa194a5cab2b0fcc73ab71cfa9a4c7938 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271476, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.023761611918761676 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-560.json b/evals/arc-challenge/arc_hu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..4326e9a449bfff5b4bffcb01ae73902068b16858 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282595, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.024790260423468987 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": 
"1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7638b2f77f7140b0c0af0df71d4b9e1fd457bfb3 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.025682629556652854 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-large.json b/evals/arc-challenge/arc_hu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7113da6667b32d4460a28d91f71e3e716239d0 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.023761611918761655, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9f05d0f663b1d94cfc4087ba1aae889603546e4a --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.026177438014745417 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2.json b/evals/arc-challenge/arc_hu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..3cdc244f3a355351f2b2e8826aed014e23f29fab --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2053872053872054, + "acc_stderr": 0.023481109518599295, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.025252525252525353 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_llama-7B.json b/evals/arc-challenge/arc_hu_challenge_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0add74575f51f34aaed4497cfc6e42d0d8d9bc9 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.30976430976430974, + "acc_norm_stderr": 0.0268762417790141 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c569232cfdeeffa2b9c398fa8102342e55669d6d --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.2206896551724138, + "acc_stderr": 0.024394801425351647, + "acc_norm": 0.27241379310344827, + "acc_norm_stderr": 0.026188332965202905 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-560.json b/evals/arc-challenge/arc_hy_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..38b99f7004830ebf484274ad893c53cff9de33a4 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.19655172413793104, + "acc_stderr": 0.023375906908472157, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.02541251077219611 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c5bcfbaa2c0570aa97441fc418e71f242460803 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.18620689655172415, + "acc_stderr": 0.022898443475326664, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.02608364690576629 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-large.json b/evals/arc-challenge/arc_hy_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..d3fa3d404e18049ccef76e50f8abe3deed88b1e6 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.19310344827586207, + 
"acc_stderr": 0.02321961545031108, + "acc_norm": 0.23793103448275862, + "acc_norm_stderr": 0.025048040852790374 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..a8f1fd794a777a25dca5bd3d54b52082a503039d --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.20689655172413793, + "acc_stderr": 0.02382827611454507, + "acc_norm": 0.25862068965517243, + "acc_norm_stderr": 0.025757454562272446 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2.json b/evals/arc-challenge/arc_hy_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6b0c05a8a5c5112ef3326264ffa348cbe02c2ff --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.1793103448275862, + "acc_stderr": 0.022565410117928373, + "acc_norm": 0.27241379310344827, + "acc_norm_stderr": 0.026188332965202905 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_llama-7B.json b/evals/arc-challenge/arc_hy_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..76c60ed9c16ffa50256b3420a3d1c544d27d0f8a --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.2206896551724138, + "acc_stderr": 0.024394801425351637, + "acc_norm": 0.30344827586206896, + "acc_norm_stderr": 0.02704394858012006 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-1b7.json b/evals/arc-challenge/arc_id_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..8edb6191b5ef4693fcf7dfc5cfad9800d7044c56 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.2986577181208054, + "acc_stderr": 0.026556672487880535, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": 
null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-560.json b/evals/arc-challenge/arc_id_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..1d88eb711d44c2d77c4554d4f4d6e553aa1209eb --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898963, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007423 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-7b1.json b/evals/arc-challenge/arc_id_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6908c8177308068c88e133ad1287687c46dcce --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.3187919463087248, + "acc_stderr": 0.027040538296634997, + "acc_norm": 0.3825503355704698, + "acc_norm_stderr": 0.028201151940879375 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-large.json b/evals/arc-challenge/arc_id_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..ab5432ed0c027006e5940d1dbd8e9231eccd5ab0 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.02471995149315962, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-medium.json b/evals/arc-challenge/arc_id_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..156b2294f71673c6950d132b56805c5e36900b92 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264114, + "acc_norm": 0.2483221476510067, + "acc_norm_stderr": 0.025069483148037884 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2.json b/evals/arc-challenge/arc_id_challenge_gpt2.json new file mode 100644 index 
0000000000000000000000000000000000000000..ef1ed97c321fe9cc50de905c218517b2d6bb812d --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159628, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485907 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_llama-7B.json b/evals/arc-challenge/arc_id_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..531f6f81397ca5506b0f36d1291417201eb9b72e --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23154362416107382, + "acc_stderr": 0.024476414420146617, + "acc_norm": 0.28523489932885904, + "acc_norm_stderr": 0.02620021021413825 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-1b7.json b/evals/arc-challenge/arc_it_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c38c75e09195bcf94e26d180f17837747473c6f7 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2558922558922559, + "acc_stderr": 0.025363000375801963, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-560.json b/evals/arc-challenge/arc_it_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..a1001fcc2f2df8d064ae2cefca3cbcf0212ed670 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282612, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-7b1.json b/evals/arc-challenge/arc_it_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe8c476fe99201a63e06353589f9b571026510a6 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.02490893747050875, + "acc_norm": 
0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-large.json b/evals/arc-challenge/arc_it_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..2508d33a6975391a9665c19ebb10213e84bd23da --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.025252525252525342 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-medium.json b/evals/arc-challenge/arc_it_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..2663af9d466539843f48e70d58dd9a236db69c79 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.025886127156886297 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2.json b/evals/arc-challenge/arc_it_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..611874b61c1374b902d583cf5cefbc4492ed6ac6 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.22895622895622897, + "acc_stderr": 0.024421362642271068, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_llama-7B.json b/evals/arc-challenge/arc_it_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..026bc2c2a59b0b1e397e34c3f50a439cc3237e6c --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.3164983164983165, + "acc_stderr": 0.02703395838420781, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.02746823841289221 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 
100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..d30129acdd6c23d97224155d05ff525778afc39a --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2097902097902098, + "acc_stderr": 0.024118005042923673, + "acc_norm": 0.25874125874125875, + "acc_norm_stderr": 0.025941514501247074 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-560.json b/evals/arc-challenge/arc_kn_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..9061ffd18bb78ef2415b46937475b366aaba5e70 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2097902097902098, + "acc_stderr": 0.024118005042923676, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.026380954549454924 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..083303db0d99abb50df9664e66431757fcbc34cf --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2062937062937063, + "acc_stderr": 0.023969030679396822, + "acc_norm": 0.27972027972027974, + "acc_norm_stderr": 0.02658827368712313 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-large.json b/evals/arc-challenge/arc_kn_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..cc1d0795f8679f5f353a8fe04a823ce8944d6180 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.24125874125874125, + "acc_stderr": 0.02534346249658375, + "acc_norm": 0.2062937062937063, + "acc_norm_stderr": 0.02396903067939682 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json new file mode 100644 index 
0000000000000000000000000000000000000000..3272316d0c0fa316ff58bd4f0a3c248c27457501 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.23076923076923078, + "acc_stderr": 0.02495714171242502, + "acc_norm": 0.23426573426573427, + "acc_norm_stderr": 0.025088286217169773 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2.json b/evals/arc-challenge/arc_kn_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..06e41e33136f376ee8441914155f63301d2b3150 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.21678321678321677, + "acc_stderr": 0.02440795482238759, + "acc_norm": 0.1993006993006993, + "acc_norm_stderr": 0.023662831210753306 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_llama-7B.json b/evals/arc-challenge/arc_kn_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..54ade592ef4b8faca4ac733019e8a288ffcd7080 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.25524475524475526, + "acc_stderr": 0.025826334320570847, + "acc_norm": 0.2762237762237762, + "acc_norm_stderr": 0.026485626798716456 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..237a4de001e4d03d3a5da1bd85ff383ee5ed3641 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.20270270270270271, + "acc_stderr": 0.023406091994174035, + "acc_norm": 0.20945945945945946, + "acc_norm_stderr": 0.023691963473475734 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-560.json b/evals/arc-challenge/arc_ml_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..b276b36482cf0a1c5ed243c8a17297e981587426 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.19932432432432431, + "acc_stderr": 0.02325934388926828, + "acc_norm": 0.23310810810810811, + 
"acc_norm_stderr": 0.024616978985669728 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..57e340993dc80aab56386e3c1ade388f4d786241 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920545, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.02423444993634421 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-large.json b/evals/arc-challenge/arc_ml_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..a23148b0cf58ef04dc9ab3bb8d26aedadda9296f --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.024491712953916972, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.024234449936344216 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9aa842f5ce9d59030c7aae3de538f9b3ea816580 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.2533783783783784, + "acc_stderr": 0.0253235186291, + "acc_norm": 0.21283783783783783, + "acc_norm_stderr": 0.0238311783119674 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2.json b/evals/arc-challenge/arc_ml_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8fc7d983c690076289a5040bce6204cb0b9146 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.25, + "acc_stderr": 0.025210974204480537, + "acc_norm": 0.21283783783783783, + "acc_norm_stderr": 0.023831178311967415 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at 
end of file diff --git a/evals/arc-challenge/arc_ml_challenge_llama-7B.json b/evals/arc-challenge/arc_ml_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3f4555f5009cd795dea8981be98bec45e2ed9369 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.21621621621621623, + "acc_stderr": 0.023967970439477224, + "acc_norm": 0.20270270270270271, + "acc_norm_stderr": 0.023406091994174035 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c8b3bb6a26b22a95c0a8de8ae3221f476963428f --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.24067796610169492, + "acc_stderr": 0.02493202205172924, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.02505088069031971 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-560.json b/evals/arc-challenge/arc_mr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..213f904f45633d7bdef01eef045a28ec2636faf5 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2440677966101695, + "acc_stderr": 0.025050880690319716, + "acc_norm": 0.22372881355932203, + "acc_norm_stderr": 0.02430491058853199 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6cfb61ab6cccf8da1ad0ec46c1bde46e11be82 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.23389830508474577, + "acc_stderr": 0.024687839412166384, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.025050880690319702 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-large.json b/evals/arc-challenge/arc_mr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..380f5aee1d555e85568122130af494663cb3123f --- /dev/null +++ 
b/evals/arc-challenge/arc_mr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2, + "acc_stderr": 0.023328473740792135, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.025050880690319702 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..7df5889da7e82e2529e4532947c4e0e8507ba94c --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2, + "acc_stderr": 0.023328473740792135, + "acc_norm": 0.22372881355932203, + "acc_norm_stderr": 0.024304910588531993 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2.json b/evals/arc-challenge/arc_mr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..8344c19a2efa7d7c252e94ea149ef5b421b34214 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.18305084745762712, + "acc_stderr": 0.02255328043040195, + "acc_norm": 0.2033898305084746, + "acc_norm_stderr": 0.023475447251410726 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_llama-7B.json b/evals/arc-challenge/arc_mr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f1cf03e6c1c130bd7352dd7963fe03ae5f4303fe --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2271186440677966, + "acc_stderr": 0.024434819973932945, + "acc_norm": 0.2711864406779661, + "acc_norm_stderr": 0.025927971596786177 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..9ef6fea604fc9172e63676717b7455a756bbbd4e --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.024164379788935486, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.026711859553317677 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + 
"model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-560.json b/evals/arc-challenge/arc_ne_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..490a9ae38f7edf0f013f898d0c075db2184dc99b --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02547149279279167, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.02627090829835463 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1c6c30b759cb29ce78c358d0d709a7b53f16f3 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.024908937470508766, + "acc_norm": 0.2996632996632997, + "acc_norm_stderr": 0.02662713045011499 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-large.json b/evals/arc-challenge/arc_ne_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..82b4b764b3fb7ef15563ca6d2c27830e3aef8d51 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.024790260423468984, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.02479026042346898 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..18464b4f845260d9e4122a7c74c4fc758519296a --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.024790260423468984, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235295 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2.json 
b/evals/arc-challenge/arc_ne_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..669e0661f7894b2bdc02512e274ab12a340e6f2c --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.024669460034907637, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.02429399929295737 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_llama-7B.json b/evals/arc-challenge/arc_ne_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a22c844ed32434eb2d404f76e104c502e7218625 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.025682629556652858 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json b/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..de6df0fa84c07702ad9d3005757f4412e835e175 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.20469798657718122, + "acc_stderr": 0.02341232810510543, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028484 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-560.json b/evals/arc-challenge/arc_nl_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..4bd9dec46927eea8709a44925f7f7f5e4d35c055 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.024224169829650748, + "acc_norm": 0.2651006711409396, + "acc_norm_stderr": 0.025611859712206003 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json b/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5360e3ed9ed9f43f4cbddc65166e1d83d89a29e6 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 
0.20134228187919462, + "acc_stderr": 0.0232685657676853, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-large.json b/evals/arc-challenge/arc_nl_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..432863c5e4840c2d01bdac986765c61050413f9f --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264114, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615855 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json b/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..65d7c05ced99e1bd53aa3110a033d9c0975025fa --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.23154362416107382, + "acc_stderr": 0.024476414420146628, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712687 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2.json b/evals/arc-challenge/arc_nl_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..bce39d9e1424be6bf01a0c15447e59c3348a08d6 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.21476510067114093, + "acc_stderr": 0.023828868848284373, + "acc_norm": 0.24496644295302014, + "acc_norm_stderr": 0.024955035980898956 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_llama-7B.json b/evals/arc-challenge/arc_nl_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b3e1e927abac3aba0720a5085b3a1b041af85b --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.2953020134228188, + "acc_stderr": 0.026470155629081078, + "acc_norm": 0.32550335570469796, + "acc_norm_stderr": 0.027188760373954457 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + 
"device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json b/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..86206aa4c02654dee089146263800252a9280415 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.024224169829650755, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007426 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-560.json b/evals/arc-challenge/arc_pt_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..11021802d7ffa732fc84739fd8ec1d531dc637b6 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.23154362416107382, + "acc_norm_stderr": 0.02447641442014662 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json b/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f27045095eca6ce035e90605bdff561f37a5a8 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.348993288590604, + "acc_stderr": 0.02765814479375022, + "acc_norm": 0.3724832214765101, + "acc_norm_stderr": 0.02805354855477509 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-large.json b/evals/arc-challenge/arc_pt_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..fd1a4b8d1948d7ebf686b68f03b68fae0c5e41de --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.18791946308724833, + "acc_stderr": 0.022667687029933926, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028477 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json b/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json new file mode 100644 
index 0000000000000000000000000000000000000000..0380aff06ff37610aa48dddf5d15f62376f1d08b --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.18120805369127516, + "acc_stderr": 0.02235101779623449, + "acc_norm": 0.2348993288590604, + "acc_norm_stderr": 0.024599255015999244 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2.json b/evals/arc-challenge/arc_pt_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1952ed53a80de06750b3d6155487089a0672bd --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.19463087248322147, + "acc_stderr": 0.022973392306598166, + "acc_norm": 0.2483221476510067, + "acc_norm_stderr": 0.025069483148037884 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_llama-7B.json b/evals/arc-challenge/arc_pt_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e49526aa9a3f1e1f7fda72f9bf9b3a58227a95ce --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.32550335570469796, + "acc_stderr": 0.027188760373954457, + "acc_norm": 0.33557046979865773, + "acc_norm_stderr": 0.027399214125091453 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..bd189e9050be188d43e3bac19cd42c400c5df7c8 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.026270908298354635 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-560.json b/evals/arc-challenge/arc_ro_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..a797f1ebfa7d92e0c78e624b99da52e77c92822c --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271473, + "acc_norm": 
0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e63a3d72b4f1a770523a9859787818e4e1ed26e --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.025252525252525346, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.02671185955331767 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-large.json b/evals/arc-challenge/arc_ro_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..68f4f45196bec82ad2ec165f33cae93bfbedbe44 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.022735275955770386, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.026177438014745407 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..5df0a11438afe98b491a6e5528d70eacb48652cf --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.022735275955770375, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801976 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2.json b/evals/arc-challenge/arc_ro_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..37203889a39601337bd2d8ffcd85a3e4693013ad --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.02362258775627147, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.026540687854980673 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_llama-7B.json b/evals/arc-challenge/arc_ro_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..37d943e737472a25d2c879425d478f6dd746e1f4 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.2828282828282828, + "acc_stderr": 0.02617743801474542, + "acc_norm": 0.3164983164983165, + "acc_norm_stderr": 0.027033958384207805 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..fc9a3f783edc283ec79c7906da73bc8a27f80a9d --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.02525252525252537, + "acc_norm": 0.3569023569023569, + "acc_norm_stderr": 0.027846288057490554 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-560.json b/evals/arc-challenge/arc_ru_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..863c94dcc4459d25ef7faec70a11d6199434c8af --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.027399831217559588 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b61e526e728d5523f1e61b4fe49307c1c872c4c --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.025471492792791674, + "acc_norm": 0.32996632996632996, + "acc_norm_stderr": 0.02732985145570343 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-large.json b/evals/arc-challenge/arc_ru_challenge_gpt2-large.json new file mode 100644 index 
0000000000000000000000000000000000000000..fd367513e4157fb1556348f212a5c6e94922beee --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.02502552138423529, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.026452514969665924 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..8a7b6aee643ab931ddd7a2528c36075699604170 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801963 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2.json b/evals/arc-challenge/arc_ru_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c01167509035c09b2ab40ba64c6f23d0d3b61c6 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322137, + "acc_norm": 0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_llama-7B.json b/evals/arc-challenge/arc_ru_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c6af8bacc84e8232e587af0b1b62f0360595f5b8 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.2895622895622896, + "acc_stderr": 0.026362594432681956, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.027399831217559577 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..5c061cbf7e912082f72face7e42633294acb46b4 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2516778523489933, + "acc_stderr": 0.02518190461061586, + "acc_norm": 0.2516778523489933, + 
"acc_norm_stderr": 0.025181904610615865 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-560.json b/evals/arc-challenge/arc_sk_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..77221ca57be5ff0cc96e73fc774d0670d7c7208c --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.22483221476510068, + "acc_norm_stderr": 0.02422416982965075 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d78271208e5af3f6496e645f8b79b3b7394aa34 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-large.json b/evals/arc-challenge/arc_sk_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..128f662c32c44780afb9fd950815540a151364d6 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615858 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..75bc31afba2a470fbe33869562f865ae458240c8 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.02471995149315962, + "acc_norm": 0.24496644295302014, + "acc_norm_stderr": 0.02495503598089895 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2.json b/evals/arc-challenge/arc_sk_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..28459f8e1e1dc32e8d92343933fa438b717eb85b --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.23154362416107382, + "acc_norm_stderr": 0.02447641442014662 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_llama-7B.json b/evals/arc-challenge/arc_sk_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3701c2f5034fd64259683639da7b904f8bf0d1d1 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712683 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..dbdcdb6f40e4a2a2d630ac6967d84266a19ee386 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.024860949670846393, + "acc_norm": 0.2635135135135135, + "acc_norm_stderr": 0.025649141242391035 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-560.json b/evals/arc-challenge/arc_sr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e4aafa24a952d05d4ff3efde104237233e2747 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.02449171295391697, + "acc_norm": 0.27702702702702703, + "acc_norm_stderr": 0.02605620088360472 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e70cc59ff97ac76e9506b0a8c29249c91543af45 --- /dev/null +++ 
b/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.26013513513513514, + "acc_stderr": 0.025542576393640232, + "acc_norm": 0.30067567567567566, + "acc_norm_stderr": 0.026697921821786215 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-large.json b/evals/arc-challenge/arc_sr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..381e33947c532c85c78a23c4986d737ed19bc7e1 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.1891891891891892, + "acc_stderr": 0.022803258753373676, + "acc_norm": 0.24324324324324326, + "acc_norm_stderr": 0.024979718407699757 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..d59206fddbda1dfd8cd1e6514ca6cba7f09dd45b --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.20608108108108109, + "acc_stderr": 0.023550282959294247, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.02509638351759426 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2.json b/evals/arc-challenge/arc_sr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed4d03dcbbbdb78f9e36972c6c09ea65f958accf --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.18243243243243243, + "acc_stderr": 0.0224854634796718, + "acc_norm": 0.22972972972972974, + "acc_norm_stderr": 0.024491712953916972 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_llama-7B.json b/evals/arc-challenge/arc_sr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..9a1c5c3f8986ce3acbf704e6d2fbd4d82fbcc724 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.2905405405405405, + "acc_stderr": 0.026433590266607382, + "acc_norm": 0.2972972972972973, + "acc_norm_stderr": 0.02661155695908287 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { 
+ "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..962c6f1d023be86a6fa7adf0d018a08eda14f1b8 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282605, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-560.json b/evals/arc-challenge/arc_sv_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..9477cbe0f42a6cdde99f9a0af2293c4b1c23cf00 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.02376161191876168, + "acc_norm": 0.2053872053872054, + "acc_norm_stderr": 0.023481109518599313 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..c89c1d01bfea674f9f7d9549f8abf2abe32192f8 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2-large.json b/evals/arc-challenge/arc_sv_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c090b83981933a41b620746123d08d4ba90f53a2 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.22895622895622897, + "acc_stderr": 0.02442136264227106, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..31f537c4fb8157ec63b8cbcb4d2001cfd08e1533 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2.json b/evals/arc-challenge/arc_sv_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..718a97a6d9df935c9f0818257fda43ef3bfc7996 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.2356902356902357, + "acc_norm_stderr": 0.024669460034907637 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_llama-7B.json b/evals/arc-challenge/arc_sv_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c2c4e7550c402c4d3dbaf7d6ea56dbf864c439ce --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2962962962962963, + "acc_stderr": 0.026540687854980646, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.02671185955331767 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..a937aa6dd9066efa74a5b88515612f7dc4ba6691 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.21283783783783783, + "acc_stderr": 0.02383117831196738, + "acc_norm": 0.25675675675675674, + "acc_norm_stderr": 0.025434043955304575 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-560.json b/evals/arc-challenge/arc_ta_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..6b1c389d448803dd7a2c483cec6aa7ff1876c4a6 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + 
"results": { + "arc_ta_challenge": { + "acc": 0.19932432432432431, + "acc_stderr": 0.02325934388926828, + "acc_norm": 0.2533783783783784, + "acc_norm_stderr": 0.025323518629100025 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5da07219683283eaafbda47b1ed0957be400dda --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.23310810810810811, + "acc_stderr": 0.024616978985669728, + "acc_norm": 0.24324324324324326, + "acc_norm_stderr": 0.02497971840769973 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-large.json b/evals/arc-challenge/arc_ta_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..918cb1c7f6be3a7693ecf8713714c664843cfc38 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.21283783783783783, + "acc_stderr": 0.02383117831196738, + "acc_norm": 0.23310810810810811, + "acc_norm_stderr": 0.024616978985669724 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..6af3ab31fdcf16311ec8594bad8ee052c05b16bc --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.2195945945945946, + "acc_stderr": 0.02410238110604679, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257903 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2.json b/evals/arc-challenge/arc_ta_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..5245a03aac201f65f42e53dcabf6d1f7c0717d52 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.024860949670846396, + "acc_norm": 0.26013513513513514, + "acc_norm_stderr": 0.025542576393640246 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", 
+ "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_llama-7B.json b/evals/arc-challenge/arc_ta_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..241feef032d750202d858fbc9162e3549a178160 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.20270270270270271, + "acc_stderr": 0.02340609199417405, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.02423444993634422 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-1b7.json b/evals/arc-challenge/arc_te_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9a2c9841dcb9e494770a8c9199b82c8ab4c9f7 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.21897810218978103, + "acc_stderr": 0.02502941075517834, + "acc_norm": 0.2591240875912409, + "acc_norm_stderr": 0.026518277256436896 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-560.json b/evals/arc-challenge/arc_te_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..0d326f4a1b5d45a12a085af0588dc48da1242b19 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939208 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-7b1.json b/evals/arc-challenge/arc_te_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c6d34bb9da6f86f1a4494caba49a2d1bab46bcf --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.20072992700729927, + "acc_stderr": 0.024242171306158907, + "acc_norm": 0.25547445255474455, + "acc_norm_stderr": 0.026395641265678074 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-large.json 
b/evals/arc-challenge/arc_te_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..226ed83458102ea0a3f4161159558d6ae8875357 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939208 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-medium.json b/evals/arc-challenge/arc_te_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..a5bd92092ab22f31db2d36d69626c32b485ab331 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.2116788321167883, + "acc_stderr": 0.02472344500978517, + "acc_norm": 0.22992700729927007, + "acc_norm_stderr": 0.025467107178386465 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2.json b/evals/arc-challenge/arc_te_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6b5f06c5f92b644a3c4ac037330810277460f0a --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939215 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_llama-7B.json b/evals/arc-challenge/arc_te_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a20fb71e7ce5932ff220ab3a23466714b469cd51 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.24087591240875914, + "acc_stderr": 0.025880445559939215, + "acc_norm": 0.26277372262773724, + "acc_norm_stderr": 0.026638517193281797 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..72eee1e288b03359fecf649039ec7e1a796086ee --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.24579124579124578, 
+ "acc_stderr": 0.025025521384235305, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.026270908298354635 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-560.json b/evals/arc-challenge/arc_uk_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5e9d5a99c327e81413b16eb715a91e70b6c5b3 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.265993265993266, + "acc_stderr": 0.02568262955665285, + "acc_norm": 0.2895622895622896, + "acc_norm_stderr": 0.026362594432681956 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2cc6b833fb7540bcca14af70e018d3eb236524 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-large.json b/evals/arc-challenge/arc_uk_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c03f6ddf265c02f0fc83f91f5c16d2586666d682 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.23232323232323232, + "acc_stderr": 0.02454650495612789, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..51083b7158f2de8700c8c253b7e5e98eba1626a9 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893546, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + 
"no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2.json b/evals/arc-challenge/arc_uk_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e32104934ab1fe23828d680bf766e04e93ea044a --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.023761611918761662, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_llama-7B.json b/evals/arc-challenge/arc_uk_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a02491cf171678a4ddc940caa47d4c778b0e3cf5 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.30976430976430974, + "acc_stderr": 0.026876241779014095, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.027468238412892212 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..508c46f8cd77b71773ecc8623d362eae91a1dc3f --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898942, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007423 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-560.json b/evals/arc-challenge/arc_vi_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..70d9cffdbf7b3adea2bbded15e8a36d7f930b24b --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2483221476510067, + "acc_stderr": 0.025069483148037874, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json new file mode 100644 index 
0000000000000000000000000000000000000000..f1588613ea4565257bfb7f46328c5e696a1434de --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.3087248322147651, + "acc_stderr": 0.02680606307294056, + "acc_norm": 0.3288590604026846, + "acc_norm_stderr": 0.02726048303556786 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-large.json b/evals/arc-challenge/arc_vi_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c071ea16496ed3627a0dc0840835a827894a8a61 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.18120805369127516, + "acc_stderr": 0.02235101779623446, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159628 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb1f34c59a21cb916520b7e956a1bd193ba1395 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264103, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159628 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2.json b/evals/arc-challenge/arc_vi_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f912cfc57fb3d8efe3773d82b7a95532a6f69b0 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.0235536033702641, + "acc_norm": 0.2080536912751678, + "acc_norm_stderr": 0.023553603370264124 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_llama-7B.json b/evals/arc-challenge/arc_vi_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8427c0ad1958ea7ad114255f020f43c5d50d076c --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.1912751677852349, + "acc_stderr": 0.022821882255340997, + "acc_norm": 0.2516778523489933, + 
"acc_norm_stderr": 0.025181904610615855 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json b/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..4626e7c607b4dd4f9c82472abe983c30203c245c --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.025252525252525356, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.025471492792791674 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-560.json b/evals/arc-challenge/arc_zh_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..127c0ce8f0b322902ecae312152c6905394bf82e --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.024908937470508753, + "acc_norm": 0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json b/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b488311a8cccbd9e611c8abe983c979453acd882 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.3400673400673401, + "acc_stderr": 0.027535084762190663, + "acc_norm": 0.367003367003367, + "acc_norm_stderr": 0.028014951100692458 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-large.json b/evals/arc-challenge/arc_zh_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..b20ff9d4fb351205e7abdc821a99a7a9c62aa9c6 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697, + "acc_norm": 0.24915824915824916, + "acc_norm_stderr": 0.025140041284626418 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + 
"bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json b/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9d9b64694a7c0355b5de8e14577532c3e16db0 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697005, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2.json b/evals/arc-challenge/arc_zh_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8da342e3dfff17d37f9f34a3f90753cb4850243 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271476, + "acc_norm": 0.22895622895622897, + "acc_norm_stderr": 0.02442136264227106 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_llama-7B.json b/evals/arc-challenge/arc_zh_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..51e82fa68d852ff2bafe284c29d895d2422b66e9 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.2558922558922559, + "acc_stderr": 0.02536300037580196, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ar_bloom-7b1.json b/evals/hellaswag/hellaswag_ar_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..69248e00b845c50b1eb8379e9d0ec05aaffc075d --- /dev/null +++ b/evals/hellaswag/hellaswag_ar_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ar": { + "acc": 0.3561464690496949, + "acc_stderr": 0.004999249661771764, + "acc_norm": 0.43341325196163905, + "acc_norm_stderr": 0.005173461992734505 + } + }, + "versions": { + "hellaswag_ar": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ar_llama-7B.json b/evals/hellaswag/hellaswag_ar_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..53797549241b15b072b9f0ce5f8b12ea57bce437 --- /dev/null +++ 
b/evals/hellaswag/hellaswag_ar_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ar": { + "acc": 0.28040540540540543, + "acc_stderr": 0.004689581635445738, + "acc_norm": 0.3085222319093287, + "acc_norm_stderr": 0.004822023322058258 + } + }, + "versions": { + "hellaswag_ar": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_bn_bloom-7b1.json b/evals/hellaswag/hellaswag_bn_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e6f1a343c04d236c977fa61b55e3bd8c74fa3f1 --- /dev/null +++ b/evals/hellaswag/hellaswag_bn_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_bn": { + "acc": 0.28381302748322873, + "acc_stderr": 0.004689968075947356, + "acc_norm": 0.3277429127894395, + "acc_norm_stderr": 0.004882866652334284 + } + }, + "versions": { + "hellaswag_bn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_bn_llama-7B.json b/evals/hellaswag/hellaswag_bn_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1676e09ecdce592c17a4ff25f63c87e2a2a971 --- /dev/null +++ b/evals/hellaswag/hellaswag_bn_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_bn": { + "acc": 0.26011685782298205, + "acc_stderr": 0.00456358696087763, + "acc_norm": 0.28251460722787275, + "acc_norm_stderr": 0.004683467388784859 + } + }, + "versions": { + "hellaswag_bn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ca_bloom-7b1.json b/evals/hellaswag/hellaswag_ca_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa322ff2eccfdf62925b1b79ced281791b64de0e --- /dev/null +++ b/evals/hellaswag/hellaswag_ca_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ca": { + "acc": 0.40186712983065564, + "acc_stderr": 0.005108421054557395, + "acc_norm": 0.5120495006513244, + "acc_norm_stderr": 0.005208233728494265 + } + }, + "versions": { + "hellaswag_ca": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ca_llama-7B.json b/evals/hellaswag/hellaswag_ca_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..4e0b22ebaf8ac031767a3f3ab1e4789d623a3c02 --- /dev/null +++ b/evals/hellaswag/hellaswag_ca_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ca": { + "acc": 0.38460703430308296, + "acc_stderr": 0.0050691072999641, + "acc_norm": 0.49565783760312637, + "acc_norm_stderr": 0.005209550302588167 + } + }, + "versions": { + "hellaswag_ca": 1 + }, + "config": { + "model": "hf-auto", + "model_args": 
"pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_da_bloom-7b1.json b/evals/hellaswag/hellaswag_da_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..248065e86f7721ea28ce5b176e014af8e2c365bf --- /dev/null +++ b/evals/hellaswag/hellaswag_da_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_da": { + "acc": 0.2806018269747448, + "acc_stderr": 0.00465795256586935, + "acc_norm": 0.31176786673831275, + "acc_norm_stderr": 0.004802289060894963 + } + }, + "versions": { + "hellaswag_da": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_da_llama-7B.json b/evals/hellaswag/hellaswag_da_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..158172ac8091f5c183cde64b120c8c32ef6b2da7 --- /dev/null +++ b/evals/hellaswag/hellaswag_da_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_da": { + "acc": 0.3730252552391188, + "acc_stderr": 0.005013710932255912, + "acc_norm": 0.46695325094035467, + "acc_norm_stderr": 0.005172309453152385 + } + }, + "versions": { + "hellaswag_da": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_de_bloom-7b1.json b/evals/hellaswag/hellaswag_de_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a42078cb7cf48cd71502713357d1faa121702cc --- /dev/null +++ b/evals/hellaswag/hellaswag_de_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_de": { + "acc": 0.2982493595217763, + "acc_stderr": 0.004726948912322779, + "acc_norm": 0.32418872758326217, + "acc_norm_stderr": 0.004836279708509382 + } + }, + "versions": { + "hellaswag_de": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_de_llama-7B.json b/evals/hellaswag/hellaswag_de_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a027e43f548e49b4fd7dd60cc606b68dc314cb9d --- /dev/null +++ b/evals/hellaswag/hellaswag_de_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_de": { + "acc": 0.39427900523001386, + "acc_stderr": 0.005049108443939032, + "acc_norm": 0.49855907780979825, + "acc_norm_stderr": 0.005165885308732062 + } + }, + "versions": { + "hellaswag_de": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_es_bloom-7b1.json b/evals/hellaswag/hellaswag_es_bloom-7b1.json new file mode 100644 index 
0000000000000000000000000000000000000000..7fd9710255ac60d17ca496eac2cdcfe416fd02be --- /dev/null +++ b/evals/hellaswag/hellaswag_es_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_es": { + "acc": 0.4372733091529763, + "acc_stderr": 0.0051237264293392815, + "acc_norm": 0.566567100490719, + "acc_norm_stderr": 0.005118554174253425 + } + }, + "versions": { + "hellaswag_es": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_es_llama-7B.json b/evals/hellaswag/hellaswag_es_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..571b2651d1c438f6d95ef828887f685893f506ff --- /dev/null +++ b/evals/hellaswag/hellaswag_es_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_es": { + "acc": 0.4311466666666667, + "acc_stderr": 0.005115053675969629, + "acc_norm": 0.5640533333333333, + "acc_norm_stderr": 0.0051217018246512425 + } + }, + "versions": { + "hellaswag_es": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_eu_bloom-7b1.json b/evals/hellaswag/hellaswag_eu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..aaa2bac442dd619e5e485a1c9bb7770c1aaad3e8 --- /dev/null +++ b/evals/hellaswag/hellaswag_eu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_eu": { + "acc": 0.27380695314187, + "acc_stderr": 0.004633608505053738, + "acc_norm": 0.31235154394299286, + "acc_norm_stderr": 0.00481588516396214 + } + }, + "versions": { + "hellaswag_eu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_eu_llama-7B.json b/evals/hellaswag/hellaswag_eu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f969135230558d6c41262c2194dbbe0e29c848f6 --- /dev/null +++ b/evals/hellaswag/hellaswag_eu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_eu": { + "acc": 0.25847549125458863, + "acc_stderr": 0.004549288692503547, + "acc_norm": 0.28719499028287626, + "acc_norm_stderr": 0.004701591142825526 + } + }, + "versions": { + "hellaswag_eu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_fr_bloom-7b1.json b/evals/hellaswag/hellaswag_fr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..737e5f885ea8810e330462182af605bac6f7338e --- /dev/null +++ b/evals/hellaswag/hellaswag_fr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_fr": { + "acc": 0.4255729278218034, + "acc_stderr": 0.005116827391881862, + "acc_norm": 0.5656457485542943, + "acc_norm_stderr": 0.005129684120180618 + } + }, + "versions": { + "hellaswag_fr": 1 + }, 
+ "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_fr_llama-7B.json b/evals/hellaswag/hellaswag_fr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3f0fd2446e8e689f67cfb568e162f7b4dba1a617 --- /dev/null +++ b/evals/hellaswag/hellaswag_fr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_fr": { + "acc": 0.4255729278218034, + "acc_stderr": 0.00511682739188186, + "acc_norm": 0.5566502463054187, + "acc_norm_stderr": 0.005141155729141772 + } + }, + "versions": { + "hellaswag_fr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_gu_bloom-7b1.json b/evals/hellaswag/hellaswag_gu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef2b298131daf31fa9c77d37366818ba539e0bb --- /dev/null +++ b/evals/hellaswag/hellaswag_gu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_gu": { + "acc": 0.2683176189935249, + "acc_stderr": 0.004722752779022285, + "acc_norm": 0.30625922980802, + "acc_norm_stderr": 0.0049130651137809294 + } + }, + "versions": { + "hellaswag_gu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_gu_llama-7B.json b/evals/hellaswag/hellaswag_gu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a610259f2ef19c9db88847c04b399dfcbcc4a463 --- /dev/null +++ b/evals/hellaswag/hellaswag_gu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_gu": { + "acc": 0.2560490741792571, + "acc_stderr": 0.004652036002377334, + "acc_norm": 0.28899238895830964, + "acc_norm_stderr": 0.004831585233585411 + } + }, + "versions": { + "hellaswag_gu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hi_bloom-7b1.json b/evals/hellaswag/hellaswag_hi_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..63eeb2a2481895efb7ecade2660f0911184073b6 --- /dev/null +++ b/evals/hellaswag/hellaswag_hi_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hi": { + "acc": 0.31202209005947323, + "acc_stderr": 0.004774960194792877, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.004957653483174718 + } + }, + "versions": { + "hellaswag_hi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hi_llama-7B.json b/evals/hellaswag/hellaswag_hi_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..35969545033bac79e8237b539ab99ce740103734 --- /dev/null +++ b/evals/hellaswag/hellaswag_hi_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hi": { + "acc": 0.2729396771452846, + "acc_stderr": 0.0045910116736375154, + "acc_norm": 0.2917374681393373, + "acc_norm_stderr": 0.004684713934059222 + } + }, + "versions": { + "hellaswag_hi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hr_bloom-7b1.json b/evals/hellaswag/hellaswag_hr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2571f200efda69d65fed248bfa1462accaa0e80f --- /dev/null +++ b/evals/hellaswag/hellaswag_hr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hr": { + "acc": 0.27478095640240685, + "acc_stderr": 0.004586771132918674, + "acc_norm": 0.3000105563179563, + "acc_norm_stderr": 0.004708614858618206 + } + }, + "versions": { + "hellaswag_hr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hr_llama-7B.json b/evals/hellaswag/hellaswag_hr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8aa308a99a4d9917300d2b6bca88d4fbd44a07 --- /dev/null +++ b/evals/hellaswag/hellaswag_hr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hr": { + "acc": 0.3393856222949435, + "acc_stderr": 0.004865190903217322, + "acc_norm": 0.41148527393645096, + "acc_norm_stderr": 0.005056324888258699 + } + }, + "versions": { + "hellaswag_hr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hu_bloom-7b1.json b/evals/hellaswag/hellaswag_hu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb0859d6479ddd7e6caa9ab28436da9061fafe0 --- /dev/null +++ b/evals/hellaswag/hellaswag_hu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hu": { + "acc": 0.2749780893952673, + "acc_stderr": 0.004673697346652944, + "acc_norm": 0.30127081507449605, + "acc_norm_stderr": 0.004802517407348953 + } + }, + "versions": { + "hellaswag_hu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hu_llama-7B.json b/evals/hellaswag/hellaswag_hu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1300419e0e8727d8da4787e4074336d82c6d64 --- /dev/null +++ b/evals/hellaswag/hellaswag_hu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hu": { + "acc": 0.31879929886064856, + "acc_stderr": 0.004877892181685683, + "acc_norm": 0.3785056967572305, + "acc_norm_stderr": 0.005076808255387223 + } + }, + "versions": { + "hellaswag_hu": 1 + }, 
+ "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hy_bloom-7b1.json b/evals/hellaswag/hellaswag_hy_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7aadfc69e7d37e7a69be9da8de7f6f479daa078 --- /dev/null +++ b/evals/hellaswag/hellaswag_hy_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hy": { + "acc": 0.2517377201112141, + "acc_stderr": 0.00467165233929534, + "acc_norm": 0.2761816496756256, + "acc_norm_stderr": 0.004812620824973181 + } + }, + "versions": { + "hellaswag_hy": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hy_llama-7B.json b/evals/hellaswag/hellaswag_hy_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..85198baf9a0a8e2dcb229b74cd9c22b5421c95b3 --- /dev/null +++ b/evals/hellaswag/hellaswag_hy_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hy": { + "acc": 0.2545180722891566, + "acc_stderr": 0.004688644596808388, + "acc_norm": 0.2849860982391103, + "acc_norm_stderr": 0.004858906279128767 + } + }, + "versions": { + "hellaswag_hy": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_id_bloom-7b1.json b/evals/hellaswag/hellaswag_id_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4bcc31e157c6a9c8fc29d08fd6088001c2a4e2b --- /dev/null +++ b/evals/hellaswag/hellaswag_id_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_id": { + "acc": 0.3894849785407725, + "acc_stderr": 0.005051366474018924, + "acc_norm": 0.49484978540772534, + "acc_norm_stderr": 0.005179195541251435 + } + }, + "versions": { + "hellaswag_id": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_id_llama-7B.json b/evals/hellaswag/hellaswag_id_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..d408a6b8209abf2afa7b33e28f960ce7cf71596b --- /dev/null +++ b/evals/hellaswag/hellaswag_id_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_id": { + "acc": 0.3017167381974249, + "acc_stderr": 0.004754784760510309, + "acc_norm": 0.34431330472103006, + "acc_norm_stderr": 0.004921986658657097 + } + }, + "versions": { + "hellaswag_id": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_it_bloom-7b1.json b/evals/hellaswag/hellaswag_it_bloom-7b1.json new file mode 
100644 index 0000000000000000000000000000000000000000..f071bbb39cf2e6048f33a2ac1444d8d24657c9ab --- /dev/null +++ b/evals/hellaswag/hellaswag_it_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_it": { + "acc": 0.33380465520991953, + "acc_stderr": 0.004918337887582365, + "acc_norm": 0.40765716771807703, + "acc_norm_stderr": 0.005125137013353996 + } + }, + "versions": { + "hellaswag_it": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_it_llama-7B.json b/evals/hellaswag/hellaswag_it_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2698d8e1b02654e67b142631369916d337041789 --- /dev/null +++ b/evals/hellaswag/hellaswag_it_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_it": { + "acc": 0.3975851191123681, + "acc_stderr": 0.0051045551272873, + "acc_norm": 0.5201783966061133, + "acc_norm_stderr": 0.005210879697577827 + } + }, + "versions": { + "hellaswag_it": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_kn_bloom-7b1.json b/evals/hellaswag/hellaswag_kn_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec110ed487575de37a4630739da2ee9264bd8d08 --- /dev/null +++ b/evals/hellaswag/hellaswag_kn_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_kn": { + "acc": 0.26337169939065674, + "acc_stderr": 0.004679154494054024, + "acc_norm": 0.30275332881967953, + "acc_norm_stderr": 0.004880859653925846 + } + }, + "versions": { + "hellaswag_kn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_kn_llama-7B.json b/evals/hellaswag/hellaswag_kn_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..219c76670fe5ee2040cfa43d6e6360e4684a6fe4 --- /dev/null +++ b/evals/hellaswag/hellaswag_kn_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_kn": { + "acc": 0.25603701196118256, + "acc_stderr": 0.004636450973386679, + "acc_norm": 0.2887610020311442, + "acc_norm_stderr": 0.0048143280788988845 + } + }, + "versions": { + "hellaswag_kn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ml_bloom-7b1.json b/evals/hellaswag/hellaswag_ml_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a4de930d07f3cb8e48668e5b5f1b53560c0ff7f1 --- /dev/null +++ b/evals/hellaswag/hellaswag_ml_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ml": { + "acc": 0.25444979290272024, + "acc_stderr": 0.004608558887983242, + "acc_norm": 0.2878092466136796, + "acc_norm_stderr": 0.004790448543019756 + } + }, + "versions": { + 
"hellaswag_ml": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ml_llama-7B.json b/evals/hellaswag/hellaswag_ml_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..d0fff179c59dc9c44b1a6de207bcba30d72726a7 --- /dev/null +++ b/evals/hellaswag/hellaswag_ml_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ml": { + "acc": 0.2510914586365163, + "acc_stderr": 0.004588344357712618, + "acc_norm": 0.2890406358446211, + "acc_norm_stderr": 0.004796533523475371 + } + }, + "versions": { + "hellaswag_ml": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_mr_bloom-7b1.json b/evals/hellaswag/hellaswag_mr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5768dcee263277655dc8087f17858a884c937b53 --- /dev/null +++ b/evals/hellaswag/hellaswag_mr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_mr": { + "acc": 0.2701799762905486, + "acc_stderr": 0.004610067484763786, + "acc_norm": 0.3100549628192693, + "acc_norm_stderr": 0.004801748474056546 + } + }, + "versions": { + "hellaswag_mr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_mr_llama-7B.json b/evals/hellaswag/hellaswag_mr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6c3e2cc455a43fee3f289e2eab0831003b552a30 --- /dev/null +++ b/evals/hellaswag/hellaswag_mr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_mr": { + "acc": 0.2592951826705464, + "acc_stderr": 0.004549803334314971, + "acc_norm": 0.2879620648776808, + "acc_norm_stderr": 0.004701019162604622 + } + }, + "versions": { + "hellaswag_mr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ne_bloom-7b1.json b/evals/hellaswag/hellaswag_ne_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b95e1d5f31b1e69f29c233339889469700c84bd --- /dev/null +++ b/evals/hellaswag/hellaswag_ne_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ne": { + "acc": 0.27441511053874224, + "acc_stderr": 0.004622852940386713, + "acc_norm": 0.30897188237819273, + "acc_norm_stderr": 0.004787064632332303 + } + }, + "versions": { + "hellaswag_ne": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ne_llama-7B.json b/evals/hellaswag/hellaswag_ne_llama-7B.json new file mode 100644 
index 0000000000000000000000000000000000000000..8c4989d19a23d592896ca0b4e6fded1f62cc01f3 --- /dev/null +++ b/evals/hellaswag/hellaswag_ne_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ne": { + "acc": 0.264112470487229, + "acc_stderr": 0.004567327225923831, + "acc_norm": 0.28171281390856406, + "acc_norm_stderr": 0.00466030469849661 + } + }, + "versions": { + "hellaswag_ne": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_nl_bloom-7b1.json b/evals/hellaswag/hellaswag_nl_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..599727af50ee8ce9a291e94ceb4f493ad958f009 --- /dev/null +++ b/evals/hellaswag/hellaswag_nl_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_nl": { + "acc": 0.28667026443604965, + "acc_stderr": 0.004698261813459453, + "acc_norm": 0.3172153264975715, + "acc_norm_stderr": 0.004835258421184045 + } + }, + "versions": { + "hellaswag_nl": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_nl_llama-7B.json b/evals/hellaswag/hellaswag_nl_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f41371bf4d5b2933a8b16a029b69b62675604b1f --- /dev/null +++ b/evals/hellaswag/hellaswag_nl_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_nl": { + "acc": 0.38117850205050724, + "acc_stderr": 0.0050457320519523, + "acc_norm": 0.48748111374919056, + "acc_norm_stderr": 0.00519291390537233 + } + }, + "versions": { + "hellaswag_nl": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_pt_bloom-7b1.json b/evals/hellaswag/hellaswag_pt_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5050ad2ec66e4750cc93be5c7e0c4c942051e7a9 --- /dev/null +++ b/evals/hellaswag/hellaswag_pt_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_pt": { + "acc": 0.4227977028930545, + "acc_stderr": 0.005142526543466809, + "acc_norm": 0.5511973128182902, + "acc_norm_stderr": 0.005177587858629525 + } + }, + "versions": { + "hellaswag_pt": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_pt_llama-7B.json b/evals/hellaswag/hellaswag_pt_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7ec9536f323c0aa592fcadb1d9e1333cd323941d --- /dev/null +++ b/evals/hellaswag/hellaswag_pt_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_pt": { + "acc": 0.4037273810813739, + "acc_stderr": 0.005107551363682552, + "acc_norm": 0.532343699209015, + "acc_norm_stderr": 0.005194044440586472 + } + }, + "versions": { + "hellaswag_pt": 1 + }, + 
"config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ro_bloom-7b1.json b/evals/hellaswag/hellaswag_ro_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..dafe7356bdb6ae258020ac1efcc6169d4f31dd20 --- /dev/null +++ b/evals/hellaswag/hellaswag_ro_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ro": { + "acc": 0.2795024337479719, + "acc_stderr": 0.00466744369483023, + "acc_norm": 0.3182260681449432, + "acc_norm_stderr": 0.004844601996973363 + } + }, + "versions": { + "hellaswag_ro": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ro_llama-7B.json b/evals/hellaswag/hellaswag_ro_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..03cce6eee60bd007c3835cca157c9f654b0774a7 --- /dev/null +++ b/evals/hellaswag/hellaswag_ro_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ro": { + "acc": 0.36041103299080585, + "acc_stderr": 0.004993666697380137, + "acc_norm": 0.4491076257436452, + "acc_norm_stderr": 0.005173430588992903 + } + }, + "versions": { + "hellaswag_ro": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ru_bloom-7b1.json b/evals/hellaswag/hellaswag_ru_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a1114c4bc91539820ff9a813a92206eb0b0aaf89 --- /dev/null +++ b/evals/hellaswag/hellaswag_ru_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ru": { + "acc": 0.2975625539257981, + "acc_stderr": 0.004748207348707273, + "acc_norm": 0.32538826574633306, + "acc_norm_stderr": 0.004865915900810558 + } + }, + "versions": { + "hellaswag_ru": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ru_llama-7B.json b/evals/hellaswag/hellaswag_ru_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..9da4ad4e94c2effcad5429b563495f832b369727 --- /dev/null +++ b/evals/hellaswag/hellaswag_ru_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ru": { + "acc": 0.370685936151855, + "acc_stderr": 0.005016184279255606, + "acc_norm": 0.4568593615185505, + "acc_norm_stderr": 0.005173496063169706 + } + }, + "versions": { + "hellaswag_ru": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sk_bloom-7b1.json b/evals/hellaswag/hellaswag_sk_bloom-7b1.json new file mode 100644 
index 0000000000000000000000000000000000000000..a452682d669ca439c37ef65351b2482280cb6a25 --- /dev/null +++ b/evals/hellaswag/hellaswag_sk_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sk": { + "acc": 0.27053241960991037, + "acc_stderr": 0.004561596675422169, + "acc_norm": 0.2981549815498155, + "acc_norm_stderr": 0.004697273773957717 + } + }, + "versions": { + "hellaswag_sk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sk_llama-7B.json b/evals/hellaswag/hellaswag_sk_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7720fc7912fd16392b3c8ddc4e66fd5530405fce --- /dev/null +++ b/evals/hellaswag/hellaswag_sk_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sk": { + "acc": 0.30173958882445967, + "acc_stderr": 0.004713343422332119, + "acc_norm": 0.35888244596731683, + "acc_norm_stderr": 0.004925486913523139 + } + }, + "versions": { + "hellaswag_sk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sr_bloom-7b1.json b/evals/hellaswag/hellaswag_sr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4dc8c27d8e4e36fa4c1a0c3d9a5431716e620d --- /dev/null +++ b/evals/hellaswag/hellaswag_sr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sr": { + "acc": 0.27748968144777225, + "acc_stderr": 0.004606546970716383, + "acc_norm": 0.29855011112287017, + "acc_norm_stderr": 0.004708005935082949 + } + }, + "versions": { + "hellaswag_sr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sr_llama-7B.json b/evals/hellaswag/hellaswag_sr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..05dc0fdc8921cb49fe2182f475f6d81e20eb5990 --- /dev/null +++ b/evals/hellaswag/hellaswag_sr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sr": { + "acc": 0.3437400783151656, + "acc_stderr": 0.004886333271945336, + "acc_norm": 0.41147211345115886, + "acc_norm_stderr": 0.005062718548853834 + } + }, + "versions": { + "hellaswag_sr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sv_bloom-7b1.json b/evals/hellaswag/hellaswag_sv_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ebba6534a5e9e09a423a4422dbf2aae81a1bc02 --- /dev/null +++ b/evals/hellaswag/hellaswag_sv_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sv": { + "acc": 0.27647445735584303, + "acc_stderr": 0.0046830976447929905, + "acc_norm": 0.3101293575970182, + "acc_norm_stderr": 0.0048432182915872585 + } + }, + "versions": { + 
"hellaswag_sv": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sv_llama-7B.json b/evals/hellaswag/hellaswag_sv_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..ee471bcb53bf2f1459136089da2f6e7ae0cdafd1 --- /dev/null +++ b/evals/hellaswag/hellaswag_sv_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sv": { + "acc": 0.3857706643279982, + "acc_stderr": 0.005096929762325147, + "acc_norm": 0.5051523788642841, + "acc_norm_stderr": 0.005235108858635741 + } + }, + "versions": { + "hellaswag_sv": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ta_bloom-7b1.json b/evals/hellaswag/hellaswag_ta_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..584724a6119d2433aab6e11c1971faf29ca9ce8f --- /dev/null +++ b/evals/hellaswag/hellaswag_ta_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ta": { + "acc": 0.2588850588375134, + "acc_stderr": 0.004775805657688067, + "acc_norm": 0.29406870319743256, + "acc_norm_stderr": 0.0049677071891109335 + } + }, + "versions": { + "hellaswag_ta": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ta_llama-7B.json b/evals/hellaswag/hellaswag_ta_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2d69d8dc8c743704b031d2ef3894db3a70bb4c9a --- /dev/null +++ b/evals/hellaswag/hellaswag_ta_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ta": { + "acc": 0.25329846665874245, + "acc_stderr": 0.004741766564082548, + "acc_norm": 0.28313324616664687, + "acc_norm_stderr": 0.004912075369610396 + } + }, + "versions": { + "hellaswag_ta": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_te_bloom-7b1.json b/evals/hellaswag/hellaswag_te_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5052ea04c62ee79955014621885295121b68fc76 --- /dev/null +++ b/evals/hellaswag/hellaswag_te_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_te": { + "acc": 0.26123337918386064, + "acc_stderr": 0.00470365034659896, + "acc_norm": 0.2922971114167813, + "acc_norm_stderr": 0.004869729181749992 + } + }, + "versions": { + "hellaswag_te": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_te_llama-7B.json b/evals/hellaswag/hellaswag_te_llama-7B.json new file mode 
100644 index 0000000000000000000000000000000000000000..7bce32700aa0c1c9e176ddae4994c8d3a2b22f3b --- /dev/null +++ b/evals/hellaswag/hellaswag_te_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_te": { + "acc": 0.25767996331957815, + "acc_stderr": 0.0046827716491321504, + "acc_norm": 0.28931682714351215, + "acc_norm_stderr": 0.004855030101325898 + } + }, + "versions": { + "hellaswag_te": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_uk_bloom-7b1.json b/evals/hellaswag/hellaswag_uk_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cd933afdab71857ec060d16116e0f044d23e7a50 --- /dev/null +++ b/evals/hellaswag/hellaswag_uk_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_uk": { + "acc": 0.2781379530237007, + "acc_stderr": 0.004619644722138738, + "acc_norm": 0.30035072802635776, + "acc_norm_stderr": 0.004726132393644123 + } + }, + "versions": { + "hellaswag_uk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_uk_llama-7B.json b/evals/hellaswag/hellaswag_uk_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..545af16e16507026332c8c8c7836ef6d20ccae00 --- /dev/null +++ b/evals/hellaswag/hellaswag_uk_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_uk": { + "acc": 0.3544720628850648, + "acc_stderr": 0.0049304266046324334, + "acc_norm": 0.4412577012959422, + "acc_norm_stderr": 0.005117854029524533 + } + }, + "versions": { + "hellaswag_uk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_vi_bloom-7b1.json b/evals/hellaswag/hellaswag_vi_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..686132db373135123f1b9720642bfd294f99f328 --- /dev/null +++ b/evals/hellaswag/hellaswag_vi_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_vi": { + "acc": 0.3836498581095831, + "acc_stderr": 0.0050805394682356675, + "acc_norm": 0.4827548570181183, + "acc_norm_stderr": 0.005220836527919318 + } + }, + "versions": { + "hellaswag_vi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_vi_llama-7B.json b/evals/hellaswag/hellaswag_vi_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..816307d9258b275603ae30ffb36851a8b3475dd9 --- /dev/null +++ b/evals/hellaswag/hellaswag_vi_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_vi": { + "acc": 0.27865094957432873, + "acc_stderr": 0.004684158200782215, + "acc_norm": 0.31608819035145164, + "acc_norm_stderr": 0.0048577229826674215 + } + }, + "versions": { + 
"hellaswag_vi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_zh_bloom-7b1.json b/evals/hellaswag/hellaswag_zh_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..30ac380919e1d6d2c44c46e941ae3dc9929982e1 --- /dev/null +++ b/evals/hellaswag/hellaswag_zh_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_zh": { + "acc": 0.38851715950787824, + "acc_stderr": 0.005063776486157121, + "acc_norm": 0.5115475933520397, + "acc_norm_stderr": 0.005193156826942953 + } + }, + "versions": { + "hellaswag_zh": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_zh_llama-7B.json b/evals/hellaswag/hellaswag_zh_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d393a5879535e46dfb92d3361d469fc71f97b7 --- /dev/null +++ b/evals/hellaswag/hellaswag_zh_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_zh": { + "acc": 0.32358653431160983, + "acc_stderr": 0.004859949552176753, + "acc_norm": 0.3945835131635736, + "acc_norm_stderr": 0.0050772319918162435 + } + }, + "versions": { + "hellaswag_zh": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ar-llama-7B.json b/evals/mmlu/mmlu_ar-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f601d0a0a213c652ffd5519a7454ba2a537af3fc --- /dev/null +++ b/evals/mmlu/mmlu_ar-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ar": { + "acc": 0.2589727722772277, + "acc_stderr": 0.0038529667515366556, + "acc_norm": 0.2797803217821782, + "acc_norm_stderr": 0.003948136869379606 + } + }, + "versions": { + "mmlu_ar": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_bn-llama-7B.json b/evals/mmlu/mmlu_bn-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..da3322aaf303ad70cf3667aba1a4d73764af5fdc --- /dev/null +++ b/evals/mmlu/mmlu_bn-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_bn": { + "acc": 0.2501022327635561, + "acc_stderr": 0.0039166757490002955, + "acc_norm": 0.28461601374008344, + "acc_norm_stderr": 0.0040809105667388166 + } + }, + "versions": { + "mmlu_bn": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ca-llama-7B.json b/evals/mmlu/mmlu_ca-llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..5183b4df5346ae0e0aa74c3166323602507c4598 --- /dev/null +++ b/evals/mmlu/mmlu_ca-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ca": { + "acc": 0.3038917604134995, + "acc_stderr": 0.004010074337091965, + "acc_norm": 0.3022955305564001, + "acc_norm_stderr": 0.004004111747979521 + } + }, + "versions": { + "mmlu_ca": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_da-llama-7B.json b/evals/mmlu/mmlu_da-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f4957b8b53a4880a0eac49ccabcab4a8c6a584c2 --- /dev/null +++ b/evals/mmlu/mmlu_da-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_da": { + "acc": 0.2997122520066636, + "acc_stderr": 0.003986771176689293, + "acc_norm": 0.2995608056943813, + "acc_norm_stderr": 0.003986194743561357 + } + }, + "versions": { + "mmlu_da": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_eu-llama-7B.json b/evals/mmlu/mmlu_eu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf5d4151c1d0c86b7232d6cbc1cc4623fafce36 --- /dev/null +++ b/evals/mmlu/mmlu_eu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_eu": { + "acc": 0.2668954809185258, + "acc_stderr": 0.003998838127920185, + "acc_norm": 0.27923510664378526, + "acc_norm_stderr": 0.00405566512057356 + } + }, + "versions": { + "mmlu_eu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_fr-llama-7B.json b/evals/mmlu/mmlu_fr-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e22bb03037c1bf7eebd47d64ccb10e43eca00210 --- /dev/null +++ b/evals/mmlu/mmlu_fr-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_fr": { + "acc": 0.318997784737606, + "acc_stderr": 0.004073786574740586, + "acc_norm": 0.3054006569398824, + "acc_norm_stderr": 0.00402561598834305 + } + }, + "versions": { + "mmlu_fr": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_gu-llama-7B.json b/evals/mmlu/mmlu_gu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2236b1f5ac01a2de4772fb6fde41222398119985 --- /dev/null +++ b/evals/mmlu/mmlu_gu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_gu": { + "acc": 0.24391920928233776, + "acc_stderr": 0.003981461991912142, + "acc_norm": 0.27382896433175763, + "acc_norm_stderr": 0.0041342298983896774 + } + }, + "versions": { + "mmlu_gu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + 
"no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hi-llama-7B.json b/evals/mmlu/mmlu_hi-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c9d981a7d61e96d94d6c128b4ccfc3f3b0f0e6 --- /dev/null +++ b/evals/mmlu/mmlu_hi-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hi": { + "acc": 0.2549650237195465, + "acc_stderr": 0.003908303467263245, + "acc_norm": 0.27860416499155743, + "acc_norm_stderr": 0.0040201315154066415 + } + }, + "versions": { + "mmlu_hi": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hr-llama-7B.json b/evals/mmlu/mmlu_hr-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f5ca1c97a96e3d94fd3ae5c2603632e633b975 --- /dev/null +++ b/evals/mmlu/mmlu_hr-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hr": { + "acc": 0.294721630666261, + "acc_stderr": 0.003976243355939721, + "acc_norm": 0.2931244295710374, + "acc_norm_stderr": 0.003969942004520753 + } + }, + "versions": { + "mmlu_hr": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hu-llama-7B.json b/evals/mmlu/mmlu_hu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b74a19de5e6654aef46cf40427dc362a330fa08e --- /dev/null +++ b/evals/mmlu/mmlu_hu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hu": { + "acc": 0.27794840294840295, + "acc_stderr": 0.0039256419656824035, + "acc_norm": 0.29000307125307123, + "acc_norm_stderr": 0.0039762530331634354 + } + }, + "versions": { + "mmlu_hu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hy-llama-7B.json b/evals/mmlu/mmlu_hy-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c10ca85321ddad4c7be01b48cec4e49a1e214777 --- /dev/null +++ b/evals/mmlu/mmlu_hy-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hy": { + "acc": 0.24800293820585806, + "acc_stderr": 0.004138305469907604, + "acc_norm": 0.2746304287944174, + "acc_norm_stderr": 0.004277007917763834 + } + }, + "versions": { + "mmlu_hy": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_id-llama-7B.json b/evals/mmlu/mmlu_id-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b6135824ebca4f3650da00511c33cc4a21bfb152 --- /dev/null +++ b/evals/mmlu/mmlu_id-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_id": { + "acc": 0.2795969773299748, + "acc_stderr": 0.003921194198043396, + "acc_norm": 0.2895962140294634, 
+ "acc_norm_stderr": 0.003962902849695825 + } + }, + "versions": { + "mmlu_id": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_kn-llama-7B.json b/evals/mmlu/mmlu_kn-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..606fb0050e37b38e833800c8c6787674d6157cca --- /dev/null +++ b/evals/mmlu/mmlu_kn-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_kn": { + "acc": 0.23933209647495363, + "acc_stderr": 0.004010635314254899, + "acc_norm": 0.27096033218482196, + "acc_norm_stderr": 0.004177761014860752 + } + }, + "versions": { + "mmlu_kn": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ml-llama-7B.json b/evals/mmlu/mmlu_ml-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1dc1ffa8a7a5300db7121f13be137d91ddd33088 --- /dev/null +++ b/evals/mmlu/mmlu_ml-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ml": { + "acc": 0.24492201668480232, + "acc_stderr": 0.0040952567017621564, + "acc_norm": 0.27529923830250275, + "acc_norm_stderr": 0.004253566006101179 + } + }, + "versions": { + "mmlu_ml": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..d572bf57654e75d51e028e16a79aa73942dadca1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ar_mc": { + "mc1": 0.2596899224806202, + "mc1_stderr": 0.01577046983489191, + "mc2": 0.4250856388236661, + "mc2_stderr": 0.01572683307613003 + } + }, + "versions": { + "truthfulqa_ar_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5f817545d204b5083023e5456ee8029ce2191005 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ar_mc": { + "mc1": 0.2777777777777778, + "mc1_stderr": 0.016109958670672858, + "mc2": 0.4504998624708924, + "mc2_stderr": 0.01620052408197046 + } + }, + "versions": { + "truthfulqa_ar_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9370c174001acd0fca0cddf24e9076e303b9a18d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_bn_mc": { + "mc1": 0.26548672566371684, + "mc1_stderr": 0.015711139487640472, + "mc2": 0.4852587344144857, + "mc2_stderr": 0.01612406516233488 + } + }, + "versions": { + "truthfulqa_bn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..16e9590be5e353f400674681f4f4e162bad08d5f --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_bn_mc": { + "mc1": 0.27939317319848295, + "mc1_stderr": 0.015964066769100945, + "mc2": 0.513392699496713, + "mc2_stderr": 0.016700880970144227 + } + }, + "versions": { + "truthfulqa_bn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..11285119043f95ac0d376ad5c3e9afaeb0e2d7e9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ca_mc": { + "mc1": 0.24261874197689345, + "mc1_stderr": 0.01536843525152329, + "mc2": 0.39989771937446994, + "mc2_stderr": 0.015246797370718152 + } + }, + "versions": { + "truthfulqa_ca_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6e11c0a02074e790f1099cbbeb59e13a69f2e1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ca_mc": { + "mc1": 0.2336328626444159, + "mc1_stderr": 0.015170350095728855, + "mc2": 0.388488309525287, + "mc2_stderr": 0.015026705835089502 + } + }, + "versions": { + "truthfulqa_ca_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e55ee209ca0f7da10707018a73476230d0beb314 --- /dev/null +++ 
b/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_da_mc": { + "mc1": 0.26248399487836105, + "mc1_stderr": 0.015753963575796108, + "mc2": 0.4375025988127948, + "mc2_stderr": 0.01662443223981383 + } + }, + "versions": { + "truthfulqa_da_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1b7cb2557be3886ead061adba89f89d50eefb9dd --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_da_mc": { + "mc1": 0.2573623559539053, + "mc1_stderr": 0.01565358047400349, + "mc2": 0.4161317873775416, + "mc2_stderr": 0.015138516880476799 + } + }, + "versions": { + "truthfulqa_da_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..f9009861966dc1cff1e1868b91e2bb41bfccd0f4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_de_mc": { + "mc1": 0.24746192893401014, + "mc1_stderr": 0.015382646812261827, + "mc2": 0.4351673407370902, + "mc2_stderr": 0.015914493454090475 + } + }, + "versions": { + "truthfulqa_de_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..37147ee36d47e8dd84509b2c477c0c4563f0a7c9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_de_mc": { + "mc1": 0.233502538071066, + "mc1_stderr": 0.015080432502225447, + "mc2": 0.383224305558326, + "mc2_stderr": 0.014662714095686993 + } + }, + "versions": { + "truthfulqa_de_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..c983b9fd981831059a19411e2f854761bb466743 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_es_mc": { + "mc1": 0.2468354430379747, + "mc1_stderr": 0.01535006418032032, + "mc2": 0.40446379335454147, + "mc2_stderr": 0.01462209461275691 + } + }, + 
"versions": { + "truthfulqa_es_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..ded6c86f6861c4d0dc091db262fe1d2a25208804 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_es_mc": { + "mc1": 0.22658227848101264, + "mc1_stderr": 0.014903268563982738, + "mc2": 0.37120532090630015, + "mc2_stderr": 0.014441690126415349 + } + }, + "versions": { + "truthfulqa_es_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..52f4939ac5fa964406f4eecce983e80178660657 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_eu_mc": { + "mc1": 0.26214833759590794, + "mc1_stderr": 0.015737384911607682, + "mc2": 0.4464332201206485, + "mc2_stderr": 0.01621754992783137 + } + }, + "versions": { + "truthfulqa_eu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2591b2575e316599868892fc6541e53cca27f1eb --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_eu_mc": { + "mc1": 0.22762148337595908, + "mc1_stderr": 0.01500362498587022, + "mc2": 0.4077400427662786, + "mc2_stderr": 0.01655029094183041 + } + }, + "versions": { + "truthfulqa_eu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..74d3041ce242f33429dfa1dec98c70a446ad3459 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_fr_mc": { + "mc1": 0.2598225602027883, + "mc1_stderr": 0.015622237721822354, + "mc2": 0.40857191925599595, + "mc2_stderr": 0.01474266494761903 + } + }, + "versions": { + "truthfulqa_fr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, 
+ "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..800ad2a78b80c2eb4974ba18bc90689969705247 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_fr_mc": { + "mc1": 0.23827629911280102, + "mc1_stderr": 0.015176654543722067, + "mc2": 0.39924075017495203, + "mc2_stderr": 0.014258162205908845 + } + }, + "versions": { + "truthfulqa_fr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..64f963ad419e8b93cc4134accc25685a3b6c7973 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_gu_mc": { + "mc1": 0.2572944297082228, + "mc1_stderr": 0.015930376662111265, + "mc2": 0.4550226506739247, + "mc2_stderr": 0.016990336661822224 + } + }, + "versions": { + "truthfulqa_gu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c069c02eb514218d456bb1424dd8cfe77f48a1ab --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_gu_mc": { + "mc1": 0.2572944297082228, + "mc1_stderr": 0.015930376662111265, + "mc2": 0.42704504017782213, + "mc2_stderr": 0.017012444121235887 + } + }, + "versions": { + "truthfulqa_gu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..8962a71a352d9b104821eb68a25a8785186a6f80 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hi_mc": { + "mc1": 0.26153846153846155, + "mc1_stderr": 0.0157457370262172, + "mc2": 0.4459427734456273, + "mc2_stderr": 0.015816895972907637 + } + }, + "versions": { + "truthfulqa_hi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..2f7c57699fb99f36e65e991419808a451e65b58d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hi_mc": { + "mc1": 0.28076923076923077, + "mc1_stderr": 0.016100529409585174, + "mc2": 0.47439648196687334, + "mc2_stderr": 0.016645149126511907 + } + }, + "versions": { + "truthfulqa_hi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..314546568b9f50af4248c3961474c8f4e4d3b021 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hr_mc": { + "mc1": 0.2805194805194805, + "mc1_stderr": 0.01620047927370478, + "mc2": 0.4799867976765054, + "mc2_stderr": 0.016630823388575047 + } + }, + "versions": { + "truthfulqa_hr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a89b4ca336f2e469df36faf9e3b8bae78e238226 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hr_mc": { + "mc1": 0.24285714285714285, + "mc1_stderr": 0.015463264535393416, + "mc2": 0.4178069276061212, + "mc2_stderr": 0.015457117904740929 + } + }, + "versions": { + "truthfulqa_hr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..f0063c59598d9ace87e37889c678b775e7685f4e --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hu_mc": { + "mc1": 0.2664941785252264, + "mc1_stderr": 0.01591244793052595, + "mc2": 0.5012245769743321, + "mc2_stderr": 0.017012659134722635 + } + }, + "versions": { + "truthfulqa_hu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8186b5b669612791a673c1748562011f0fa91aec --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hu_mc": { + "mc1": 0.24579560155239327, + "mc1_stderr": 
0.01549611867708382, + "mc2": 0.432092949382587, + "mc2_stderr": 0.015533288486024798 + } + }, + "versions": { + "truthfulqa_hu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddde03654d791b6a3794476cfb89b83c5ef45e53 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hy_mc": { + "mc1": 0.2629032258064516, + "mc1_stderr": 0.017693546356249937, + "mc2": 0.4681902443615651, + "mc2_stderr": 0.019292338415181538 + } + }, + "versions": { + "truthfulqa_hy_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f5ca203decb570b2e7314edadae5c00a4adfc62c --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hy_mc": { + "mc1": 0.2564516129032258, + "mc1_stderr": 0.017551409976203195, + "mc2": 0.46436602760838236, + "mc2_stderr": 0.018999233967880117 + } + }, + "versions": { + "truthfulqa_hy_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c70280232e984fefca1f1a8cfe4a29409de1c8 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_id_mc": { + "mc1": 0.25288831835686776, + "mc1_stderr": 0.015583584105316878, + "mc2": 0.4035395580966099, + "mc2_stderr": 0.015018121460072335 + } + }, + "versions": { + "truthfulqa_id_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6dfd743ea67805acd941cfb18f2d6362f9880f82 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_id_mc": { + "mc1": 0.25673940949935814, + "mc1_stderr": 0.015661271683095182, + "mc2": 0.39766031480749814, + "mc2_stderr": 0.015508891980724996 + } + }, + "versions": { + "truthfulqa_id_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + 
"batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e83a75ef58f484e4f28d9b48fd6106931bcd7a26 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_it_mc": { + "mc1": 0.2697201017811705, + "mc1_stderr": 0.015840413061442026, + "mc2": 0.4389841648203799, + "mc2_stderr": 0.015926853851979495 + } + }, + "versions": { + "truthfulqa_it_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b9f0f156188649c4a7542e6d6f2ba9b37c457655 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_it_mc": { + "mc1": 0.24427480916030533, + "mc1_stderr": 0.015335094706043257, + "mc2": 0.39785622787135533, + "mc2_stderr": 0.014810294602470058 + } + }, + "versions": { + "truthfulqa_it_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe9dd96dcdc93bdacfb696ab94a78b3f7f7a246 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_kn_mc": { + "mc1": 0.28792134831460675, + "mc1_stderr": 0.0169811116006733, + "mc2": 0.4971377207989088, + "mc2_stderr": 0.0171981853340177 + } + }, + "versions": { + "truthfulqa_kn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..993e1c25914137bc34c8316cf67a25cc17ab83c4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_kn_mc": { + "mc1": 0.27808988764044945, + "mc1_stderr": 0.01680348492221316, + "mc2": 0.46974001502290064, + "mc2_stderr": 0.017840960060966953 + } + }, + "versions": { + "truthfulqa_kn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json 
b/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..24914faf5345d35faa0a1b782d6c784b3edd07d6 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ml_mc": { + "mc1": 0.25831202046035806, + "mc1_stderr": 0.01566236755478916, + "mc2": 0.4909574719052267, + "mc2_stderr": 0.016823307128975565 + } + }, + "versions": { + "truthfulqa_ml_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0a3806514d04876fc28bdf3370af03eacd615826 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ml_mc": { + "mc1": 0.2749360613810742, + "mc1_stderr": 0.015976383961112832, + "mc2": 0.5095091855665959, + "mc2_stderr": 0.016954647599861927 + } + }, + "versions": { + "truthfulqa_ml_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf87faf0ec1093a1588fff90b3e24e1f69710c9d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_mr_mc": { + "mc1": 0.2753807106598985, + "mc1_stderr": 0.015923346195889237, + "mc2": 0.47635177057868366, + "mc2_stderr": 0.016517346765693778 + } + }, + "versions": { + "truthfulqa_mr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0ddcccf963e412f25a53de4c98f439abf7a25388 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_mr_mc": { + "mc1": 0.28553299492385786, + "mc1_stderr": 0.01610022231189975, + "mc2": 0.4895379243686521, + "mc2_stderr": 0.016741018968357894 + } + }, + "versions": { + "truthfulqa_mr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..90e378f71e4638bf2da69d765a15b05858d6e2b9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + 
"results": { + "truthfulqa_ne_mc": { + "mc1": 0.2880710659898477, + "mc1_stderr": 0.016142870973426694, + "mc2": 0.467435004054711, + "mc2_stderr": 0.016544742019032287 + } + }, + "versions": { + "truthfulqa_ne_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..547c3b78ee0caef9b096972901f0b3d40c939029 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ne_mc": { + "mc1": 0.2906091370558376, + "mc1_stderr": 0.016184901529011933, + "mc2": 0.466774725144191, + "mc2_stderr": 0.01677791483100084 + } + }, + "versions": { + "truthfulqa_ne_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce8ddbd63d98848d347aa0302b9cbaccb48cbd3 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_nl_mc": { + "mc1": 0.25477707006369427, + "mc1_stderr": 0.015561993973145626, + "mc2": 0.4267767591847509, + "mc2_stderr": 0.016186878668566853 + } + }, + "versions": { + "truthfulqa_nl_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..22e9d2c488076c5884e9224d8636a092fac4fe96 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_nl_mc": { + "mc1": 0.24331210191082803, + "mc1_stderr": 0.015324355488601159, + "mc2": 0.40023342153314706, + "mc2_stderr": 0.014679036703865578 + } + }, + "versions": { + "truthfulqa_nl_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b684b021a3805f5bd343cabdea341eecc0435e00 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_pt_mc": { + "mc1": 0.23857868020304568, + "mc1_stderr": 0.015192910034567015, + "mc2": 0.38894722340741383, + "mc2_stderr": 0.014531269277587647 + } + }, + "versions": { + "truthfulqa_pt_mc": 1 + }, + "config": { + "model": "hf-auto", 
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7084df35e971145794041e3080344faabab95729 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_pt_mc": { + "mc1": 0.22842639593908629, + "mc1_stderr": 0.014964922033138024, + "mc2": 0.3823261607330551, + "mc2_stderr": 0.01463319398314419 + } + }, + "versions": { + "truthfulqa_pt_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..af2110ac326a3065b94fa267cded253cc069b3e0 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ro_mc": { + "mc1": 0.2608695652173913, + "mc1_stderr": 0.015712552179082358, + "mc2": 0.46132785760214634, + "mc2_stderr": 0.016284566824666485 + } + }, + "versions": { + "truthfulqa_ro_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7ed655b7f61f10c4accbd13f5b9fc293536300 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ro_mc": { + "mc1": 0.22762148337595908, + "mc1_stderr": 0.015003624985870205, + "mc2": 0.37160168017693795, + "mc2_stderr": 0.015014785650167688 + } + }, + "versions": { + "truthfulqa_ro_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..d15e5341b01a6e2876ffb863286387d4dcc69456 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ru_mc": { + "mc1": 0.30632911392405066, + "mc1_stderr": 0.016410898874958186, + "mc2": 0.49751656068823824, + "mc2_stderr": 0.016150279946055047 + } + }, + "versions": { + "truthfulqa_ru_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2036782896e35aee07acce858c408720bcb3b9b9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ru_mc": { + "mc1": 0.24556962025316456, + "mc1_stderr": 0.015323515145952671, + "mc2": 0.40851860840920967, + "mc2_stderr": 0.015225752517489843 + } + }, + "versions": { + "truthfulqa_ru_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb50aa50589d9959d2accbb09d2d099246f74e5 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sk_mc": { + "mc1": 0.23846153846153847, + "mc1_stderr": 0.015268148070057835, + "mc2": 0.4379856829317774, + "mc2_stderr": 0.016560323561497736 + } + }, + "versions": { + "truthfulqa_sk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..13785fc105b2964d3bcf70bb68daf0ddc0ccdbfd --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sk_mc": { + "mc1": 0.22692307692307692, + "mc1_stderr": 0.01500658794494848, + "mc2": 0.40846796746265707, + "mc2_stderr": 0.015828756550364212 + } + }, + "versions": { + "truthfulqa_sk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9be337308c1b4de36187d0139341115ab5acc1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sr_mc": { + "mc1": 0.2875318066157761, + "mc1_stderr": 0.016154400981864346, + "mc2": 0.4611856949025646, + "mc2_stderr": 0.01648960635223338 + } + }, + "versions": { + "truthfulqa_sr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3a70158ad0bf874c11233369e2b8b2fbd08bb508 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json @@ -0,0 +1,23 @@ 
+{ + "results": { + "truthfulqa_sr_mc": { + "mc1": 0.2684478371501272, + "mc1_stderr": 0.015816769133859612, + "mc2": 0.42343608663478216, + "mc2_stderr": 0.015372831241353751 + } + }, + "versions": { + "truthfulqa_sr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9885cf6375b817aa059b00ca8a5df86a2f6bbce4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sv_mc": { + "mc1": 0.2622739018087855, + "mc1_stderr": 0.015821052272364522, + "mc2": 0.4457248931967088, + "mc2_stderr": 0.016517364176123605 + } + }, + "versions": { + "truthfulqa_sv_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1665d4f2e88a870557fd94395d1d54f58919d85c --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sv_mc": { + "mc1": 0.2596899224806202, + "mc1_stderr": 0.015770469834891904, + "mc2": 0.40528913702963154, + "mc2_stderr": 0.015006798915735541 + } + }, + "versions": { + "truthfulqa_sv_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4d2164cc879ad161ad6563ad86aee4884b45ea32 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ta_mc": { + "mc1": 0.26015228426395937, + "mc1_stderr": 0.015638591095633272, + "mc2": 0.4828328722219756, + "mc2_stderr": 0.01641270817636116 + } + }, + "versions": { + "truthfulqa_ta_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..fee0b1146f0fd8e72ac72b5e05a85a9d0c18afcb --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ta_mc": { + "mc1": 0.27411167512690354, + "mc1_stderr": 0.015900519226497174, + "mc2": 0.5027478455482438, + "mc2_stderr": 0.016693455124890125 + } + }, + "versions": { + "truthfulqa_ta_mc": 1 + }, + "config": { + "model": 
"hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb186a4cf39dc7c369f4adcb4c21742a3bb8d875 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_te_mc": { + "mc1": 0.2646276595744681, + "mc1_stderr": 0.016097235388949582, + "mc2": 0.4761751419934964, + "mc2_stderr": 0.01699481972514669 + } + }, + "versions": { + "truthfulqa_te_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6a27e1784964b5486d2d2aeb7d5418ef3fbc892d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_te_mc": { + "mc1": 0.2898936170212766, + "mc1_stderr": 0.016556215331027437, + "mc2": 0.4950446673992078, + "mc2_stderr": 0.017314129921675917 + } + }, + "versions": { + "truthfulqa_te_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a55f54ab6ab50194bfa1058aacbecc18b36d6e7 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_uk_mc": { + "mc1": 0.3082901554404145, + "mc1_stderr": 0.016630856554976103, + "mc2": 0.5156453949784039, + "mc2_stderr": 0.01673540498425732 + } + }, + "versions": { + "truthfulqa_uk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..87ffa1a02265b9ec13f193b53fba9b06f985e7a2 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_uk_mc": { + "mc1": 0.23575129533678757, + "mc1_stderr": 0.015286822062573322, + "mc2": 0.41551850845167937, + "mc2_stderr": 0.01559551532730194 + } + }, + "versions": { + "truthfulqa_uk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..641e07a270f97ae74adc933fcaaf2f17f0cc2720 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_vi_mc": { + "mc1": 0.2969543147208122, + "mc1_stderr": 0.01628730493420265, + "mc2": 0.44687544361363724, + "mc2_stderr": 0.015032707389451902 + } + }, + "versions": { + "truthfulqa_vi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..281dd4ecf9b86e311de0e817f9acf01943305b44 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_vi_mc": { + "mc1": 0.2436548223350254, + "mc1_stderr": 0.015302421509379252, + "mc2": 0.42906776165158894, + "mc2_stderr": 0.016213220197264143 + } + }, + "versions": { + "truthfulqa_vi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc762b26a77cd8c55bbb320f1c81e3b51e30910 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_zh_mc": { + "mc1": 0.22727272727272727, + "mc1_stderr": 0.014900421035751319, + "mc2": 0.3872774224063368, + "mc2_stderr": 0.01489618179042084 + } + }, + "versions": { + "truthfulqa_zh_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5e49b170e61cb016ddf2105ccf2469c2fd884a24 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_zh_mc": { + "mc1": 0.26515151515151514, + "mc1_stderr": 0.015694869766795665, + "mc2": 0.43429601246293487, + "mc2_stderr": 0.015796890327346987 + } + }, + "versions": { + "truthfulqa_zh_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file