diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..840d929a00830d99ebd744bf3ad1d39228cc0963 --- /dev/null +++ b/app.py @@ -0,0 +1,110 @@ +import os +import json +import glob +from collections import defaultdict +import gradio as gr + + +ARC = "arc_challenge" +HELLASWAG = "hellaswag" +MMLU = "mmlu" +TRUTHFULQA = "truthfulqa-mc" +BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA] + +METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] + + +def collect_results(): + performance_dict = defaultdict(dict) + pretrained_models = set() + for file in glob.glob('evals/*/*.json'): + with open(file, 'r') as f: + data = json.load(f) + if 'results' not in data: + continue + if 'config' not in data: + continue + results = data['results'] + config = data['config'] + if 'model_args' not in config: + continue + + model_args = config['model_args'].split(',') + pretrained = [x for x in model_args if x.startswith('pretrained=')] + if len(pretrained) != 1: + continue + pretrained = pretrained[0].split('=')[1] + pretrained = pretrained.split('/')[-1] + pretrained_models.add(pretrained) + + for lang_task, perfs in results.items(): + lang, task = None, None # reset each iteration so unrecognized tasks are skipped below + if lang_task.startswith('arc_') and lang_task.endswith('_challenge'): + lang = lang_task.split('_')[1] + task = ARC + elif lang_task.startswith('hellaswag_'): + _, lang = lang_task.split('_') + task = HELLASWAG + elif lang_task.startswith('mmlu_'): + _, lang = lang_task.split('_') + task = MMLU + elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'): + lang = lang_task.split('_')[1] + task = TRUTHFULQA + + if lang and task: + metric = METRICS[BENCHMARKS.index(task)] + p = round(perfs[metric] * 100, 1) + performance_dict[(pretrained, lang)][task] = p + return performance_dict, pretrained_models + + +def get_leaderboard_df(performance_dict, pretrained_models): + df = list() + for (pretrained, lang), perfs in performance_dict.items(): + arc_perf = perfs.get(ARC, 0.0) + hellaswag_perf = perfs.get(HELLASWAG, 0.0) + mmlu_perf = perfs.get(MMLU, 0.0) + truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0) + + if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0: + continue + avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1) + row = [pretrained, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf] + df.append(row) + return df + + +MODEL_COL = "Model" +LANG_COL = "Language" +AVERAGE_COL = "Average" +ARC_COL = "ARC (25-shot)" +HELLASWAG_COL = "HellaSwag (10-shot)" +MMLU_COL = "MMLU (5-shot)" +TRUTHFULQA_COL = "TruthfulQA (0-shot)" + +COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL] +TYPES = ["str", "str", "number", "number", "number", "number", "number"] + +args = collect_results() +leaderboard_df = get_leaderboard_df(*args) + +demo = gr.Blocks() +with demo: + gr.HTML('Open Multilingual Large Language Model Evaluation Leaderboard') + gr.Markdown('INTRODUCTION TEXT', elem_classes="markdown-text") + + with gr.Box(): + search_bar = gr.Textbox( + placeholder="Search models...", show_label=False, elem_id="search-bar" + ) + + leaderboard_table = gr.components.Dataframe( + value=leaderboard_df, + headers=COLS, + datatype=TYPES, + max_rows=5, + elem_id="leaderboard-table", + ) + +demo.launch() diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..f11ea3c48ac461ea8df812ba639e5871955a3481 --- /dev/null +++ 
b/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.22818791946308725, + "acc_stderr": 0.02435139725761051, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615872 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-560.json b/evals/arc-challenge/arc_ar_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..49fe745a2caa93a57a99f2a5d13b829f8544cd13 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.2550335570469799, + "acc_stderr": 0.025292327380712708, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712708 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b79172a73e91dbbf21909686c17e2c23c1f18bef --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.28187919463087246, + "acc_stderr": 0.026106703750007426, + "acc_norm": 0.3087248322147651, + "acc_norm_stderr": 0.026806063072940547 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-large.json b/evals/arc-challenge/arc_ar_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..f1aadc6691007c31ca76e985257d9ebfbffa04c5 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.20134228187919462, + "acc_stderr": 0.023268565767685306, + "acc_norm": 0.21476510067114093, + "acc_norm_stderr": 0.023828868848284352 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..db628063ccf012f4301410acf74c6449499d4a18 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.19463087248322147, + "acc_stderr": 0.022973392306598162, + "acc_norm": 0.21140939597315436, + "acc_norm_stderr": 0.02369243605357901 + } + }, + "versions": 
{ + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2.json b/evals/arc-challenge/arc_ar_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..5deb8a5f49f36a08688564ca109ad5160192b56e --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.20134228187919462, + "acc_stderr": 0.023268565767685313, + "acc_norm": 0.22483221476510068, + "acc_norm_stderr": 0.024224169829650755 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ar_challenge_llama-7B.json b/evals/arc-challenge/arc_ar_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e1b5a76fae32ffadeb87c9a634cef2c6de55e923 --- /dev/null +++ b/evals/arc-challenge/arc_ar_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ar_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028477 + } + }, + "versions": { + "arc_ar_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..fa55573d46ebd614a4feb5a1aac46df0effefe2f --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.20945945945945946, + "acc_stderr": 0.023691963473475724, + "acc_norm": 0.2533783783783784, + "acc_norm_stderr": 0.025323518629100008 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-560.json b/evals/arc-challenge/arc_bn_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..389eeb09c0a92f6b7861501b6a3e0b9caff08e3e --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.024491712953916975, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.025096383517594287 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7cf6ca71cd6f8268d0ed709fbff3ff9aa1aa20f9 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.02486094967084638, + "acc_norm": 0.28040540540540543, + "acc_norm_stderr": 0.026153277917823237 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-large.json b/evals/arc-challenge/arc_bn_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..6b36e33e7bf7866400a4c7d058836627255b75a8 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.2195945945945946, + "acc_stderr": 0.024102381106046785, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257924 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..69dd44fcae67f0511715af28d9a6762dc0732634 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.20608108108108109, + "acc_stderr": 0.02355028295929425, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.02509638351759427 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2.json b/evals/arc-challenge/arc_bn_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..2de0f9a7b900ac9accabd3ade0c8a4d14d7fda03 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_bn_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920555, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257917 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_bn_challenge_llama-7B.json b/evals/arc-challenge/arc_bn_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a3dbec93edb13b0fdf7c70d9a22d0f709e0a25b2 --- /dev/null +++ b/evals/arc-challenge/arc_bn_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { 
+ "arc_bn_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920565, + "acc_norm": 0.26013513513513514, + "acc_norm_stderr": 0.02554257639364025 + } + }, + "versions": { + "arc_bn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..80c6381676cf5f4508fe26a2e71b75de9f5857f5 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-560.json b/evals/arc-challenge/arc_ca_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..74ea721d64eabef94a72533148cf4d15946ea667 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.2053872053872054, + "acc_stderr": 0.02348110951859932, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..828e5442ee5f197e68f640cec0d3f5a4d2190a86 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.3164983164983165, + "acc_stderr": 0.02703395838420779, + "acc_norm": 0.3434343434343434, + "acc_norm_stderr": 0.0276003816062635 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-large.json b/evals/arc-challenge/arc_ca_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..1d1333c44929e8c397db2c9c89aa32f6c849e02f --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.02362258775627148, + "acc_norm": 0.22895622895622897, + "acc_norm_stderr": 0.02442136264227106 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + 
"model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..b9427197beac9ba8529aa3e8014b5dee0307e089 --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271473, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.023761611918761673 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2.json b/evals/arc-challenge/arc_ca_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a9ebfd334ce3c7fa9305ddb2650d0c9ed8d727ac --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.21885521885521886, + "acc_stderr": 0.024032467624412215, + "acc_norm": 0.21885521885521886, + "acc_norm_stderr": 0.02403246762441221 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ca_challenge_llama-7B.json b/evals/arc-challenge/arc_ca_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5b79736bea0e6806983af2b1d26982bb71d2169c --- /dev/null +++ b/evals/arc-challenge/arc_ca_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ca_challenge": { + "acc": 0.29292929292929293, + "acc_stderr": 0.026452514969665927, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.02645251496966592 + } + }, + "versions": { + "arc_ca_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-1b7.json b/evals/arc-challenge/arc_da_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ad507f37ee73db4c175fcd2ff76b2949c5186f12 --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.02557802773320011 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-560.json 
b/evals/arc-challenge/arc_da_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..76c97cf086a3d4eb479d7ea19745c4f301127a2e --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.025471492792791667, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_bloom-7b1.json b/evals/arc-challenge/arc_da_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..38cbbb63a1aa857301e47a632ca28cb48df2b26a --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.02490893747050877, + "acc_norm": 0.24915824915824916, + "acc_norm_stderr": 0.025140041284626418 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-large.json b/evals/arc-challenge/arc_da_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c8ee21dc7b9e87604443ebe5bc43e5cd6006ac8a --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.23232323232323232, + "acc_stderr": 0.02454650495612789, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.024908937470508753 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-medium.json b/evals/arc-challenge/arc_da_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..df7aa6d8d8bffd69ae15219bdb1f31971d2146b7 --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.0250255213842353, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.025886127156886297 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_gpt2.json b/evals/arc-challenge/arc_da_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e06d761ac718567edd82446e7cab3db268352caf --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 
0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.024790260423468984 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_da_challenge_llama-7B.json b/evals/arc-challenge/arc_da_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0669687f3d0755614d71660a1b71b9c1d16c99af --- /dev/null +++ b/evals/arc-challenge/arc_da_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_da_challenge": { + "acc": 0.3063973063973064, + "acc_stderr": 0.026794891419479452, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.02746823841289221 + } + }, + "versions": { + "arc_da_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-1b7.json b/evals/arc-challenge/arc_de_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..2c10bc700c0ecb2dfc8bde73b2f3f18879be1571 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898946, + "acc_norm": 0.2953020134228188, + "acc_norm_stderr": 0.026470155629081085 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-560.json b/evals/arc-challenge/arc_de_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..0c23e9b1eaef780d6a824e7c0f623556d950ca89 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007426 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_bloom-7b1.json b/evals/arc-challenge/arc_de_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..477d702b1bc9eee6d2f6b2ada459a35f84ed90e2 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2684563758389262, + "acc_stderr": 0.0257145395148175, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.0257145395148175 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + 
"batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-large.json b/evals/arc-challenge/arc_de_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..2bc523b2a951a72b3cd9a3ca1f364c1880010ab0 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159625, + "acc_norm": 0.27181208053691275, + "acc_norm_stderr": 0.025815342279487567 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-medium.json b/evals/arc-challenge/arc_de_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..45b24780309957f9064133758d7f8cccdb182f96 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159625, + "acc_norm": 0.28859060402684567, + "acc_norm_stderr": 0.026291942108676806 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_gpt2.json b/evals/arc-challenge/arc_de_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..dcac4b017ab401c82005ea115725c223d14f4bbb --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.21140939597315436, + "acc_norm_stderr": 0.02369243605357901 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_de_challenge_llama-7B.json b/evals/arc-challenge/arc_de_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb6300f14d8c556143f550509be7862841dc7c6 --- /dev/null +++ b/evals/arc-challenge/arc_de_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_de_challenge": { + "acc": 0.2785234899328859, + "acc_stderr": 0.0260114035784859, + "acc_norm": 0.348993288590604, + "acc_norm_stderr": 0.027658144793750224 + } + }, + "versions": { + "arc_de_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-1b7.json b/evals/arc-challenge/arc_es_challenge_bloom-1b7.json new file mode 100644 index 
0000000000000000000000000000000000000000..74eba78a722fcedb488ec904b2f0d58171c8a749 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.2895622895622896, + "acc_norm_stderr": 0.026362594432681956 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-560.json b/evals/arc-challenge/arc_es_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..f03023ac512f6466bc05adcbbd4b74fafdb0701e --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.2356902356902357, + "acc_norm_stderr": 0.02466946003490764 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_bloom-7b1.json b/evals/arc-challenge/arc_es_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..42cce52cd279c31092e728aadcc63cb1e0a04b59 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.3265993265993266, + "acc_stderr": 0.027258287015652305, + "acc_norm": 0.3602693602693603, + "acc_norm_stderr": 0.02790399493827167 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-large.json b/evals/arc-challenge/arc_es_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..8889a96dc89f373c32d03d03beba715496d3c5cf --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.024164379788935483, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.02557802773320012 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-medium.json b/evals/arc-challenge/arc_es_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..292e3ed1cc0e8b1b1063554055397c13de7ff5f7 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.1919191919191919, + "acc_stderr": 0.022889733897083934, + 
"acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.02525252525252536 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_gpt2.json b/evals/arc-challenge/arc_es_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e71f05e3b44a477a0c85e997c61776163460f160 --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322127, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.0250255213842353 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_es_challenge_llama-7B.json b/evals/arc-challenge/arc_es_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0fab72d1a1f2e4fd24095bb5ec61c4a1d8f08aee --- /dev/null +++ b/evals/arc-challenge/arc_es_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_es_challenge": { + "acc": 0.3501683501683502, + "acc_stderr": 0.027726370308831506, + "acc_norm": 0.3602693602693603, + "acc_norm_stderr": 0.02790399493827167 + } + }, + "versions": { + "arc_es_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ec1113a347e63807533e24faa9f8f1133a725ba3 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.22377622377622378, + "acc_stderr": 0.02468755105337312, + "acc_norm": 0.2517482517482518, + "acc_norm_stderr": 0.02570896966075011 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-560.json b/evals/arc-challenge/arc_eu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..d21d146ef31af9e17f56082cab45ffcd1938858f --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.24475524475524477, + "acc_stderr": 0.02546756553847068, + "acc_norm": 0.19230769230769232, + "acc_norm_stderr": 0.023345268410264786 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5c3fd12b9223764b5f572dbfa37a6903f058c5e --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.23076923076923078, + "acc_stderr": 0.024957141712425013, + "acc_norm": 0.24125874125874125, + "acc_norm_stderr": 0.025343462496583764 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-large.json b/evals/arc-challenge/arc_eu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..1ca1581ef49b197cacfd25186739d7697494240c --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.25874125874125875, + "acc_stderr": 0.02594151450124707, + "acc_norm": 0.24125874125874125, + "acc_norm_stderr": 0.025343462496583737 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9fcb0f103e4f8b17826dc742c5e2fd7760677501 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.2762237762237762, + "acc_stderr": 0.026485626798716442, + "acc_norm": 0.25874125874125875, + "acc_norm_stderr": 0.025941514501247064 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2.json b/evals/arc-challenge/arc_eu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..7a6f7747e337535ab8fba538b1b3e6292e596be8 --- /dev/null +++ b/evals/arc-challenge/arc_eu_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.2762237762237762, + "acc_stderr": 0.026485626798716456, + "acc_norm": 0.24825174825174826, + "acc_norm_stderr": 0.025589390464738234 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_eu_challenge_llama-7B.json b/evals/arc-challenge/arc_eu_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..748beb769c74d6f45c8e93c5a0151df8949243d5 --- /dev/null +++ 
b/evals/arc-challenge/arc_eu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_eu_challenge": { + "acc": 0.26223776223776224, + "acc_stderr": 0.026054539173797044, + "acc_norm": 0.23426573426573427, + "acc_norm_stderr": 0.02508828621716978 + } + }, + "versions": { + "arc_eu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..e45f16627cad6e7f9c00c5e957f834e5d38c0364 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2550335570469799, + "acc_stderr": 0.025292327380712687, + "acc_norm": 0.2953020134228188, + "acc_norm_stderr": 0.026470155629081078 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-560.json b/evals/arc-challenge/arc_fr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..c6a22e37448b26cc7b45d56b9eb1cb9358ea8a34 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e7fc02c83acce1c27f68cacb276ebf9d1038459b --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.36577181208053694, + "acc_stderr": 0.027947930997299652, + "acc_norm": 0.3825503355704698, + "acc_norm_stderr": 0.02820115194087938 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-large.json b/evals/arc-challenge/arc_fr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..9aae5d2ce6adfb2eb44ca3f0cdc1108895cd0a83 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.1912751677852349, + "acc_stderr": 0.02282188225534101, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + 
} + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..465234e97d674cd00fa45996ea2f08a2d3e81dff --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2181208053691275, + "acc_stderr": 0.023962942745646792, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485918 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2.json b/evals/arc-challenge/arc_fr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e91d18eac5ed9bf7def9d899e70e9280a10d994 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264107, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_fr_challenge_llama-7B.json b/evals/arc-challenge/arc_fr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..289f9e2b1689351de784a6a0a22e47ebaa0bcc28 --- /dev/null +++ b/evals/arc-challenge/arc_fr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_fr_challenge": { + "acc": 0.3523489932885906, + "acc_stderr": 0.027719080218117063, + "acc_norm": 0.3422818791946309, + "acc_norm_stderr": 0.027531738303985358 + } + }, + "versions": { + "arc_fr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..a68c6f6a88aaab21388ac0f6f47a96fcad831091 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.23693379790940766, + "acc_stderr": 0.02514268188080883, + "acc_norm": 0.2613240418118467, + "acc_norm_stderr": 0.025979671112800046 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_gu_challenge_bloom-560.json b/evals/arc-challenge/arc_gu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..8e1e6a4854fc92fa9250450b250a4769a4c3586d --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.21951219512195122, + "acc_stderr": 0.0244753759026465, + "acc_norm": 0.25435540069686413, + "acc_norm_stderr": 0.025751551710541783 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..920acb43e2275592dbf6351e0ee175bbb1a322c1 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.23693379790940766, + "acc_stderr": 0.02514268188080883, + "acc_norm": 0.23693379790940766, + "acc_norm_stderr": 0.025142681880808825 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-large.json b/evals/arc-challenge/arc_gu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c441954523c6d4bea5cc1b2cba0305b6c41fee49 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.22996515679442509, + "acc_stderr": 0.02488302588342452, + "acc_norm": 0.23693379790940766, + "acc_norm_stderr": 0.025142681880808832 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..7aaeca4ab77d4bf203d3bf29e50b2c3f50320f78 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.2229965156794425, + "acc_stderr": 0.02461373413263406, + "acc_norm": 0.2508710801393728, + "acc_norm_stderr": 0.02563424701238326 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2.json b/evals/arc-challenge/arc_gu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a988ac9706a7406299e0de78b92c41a2151d0204 --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_gpt2.json @@ -0,0 +1,23 @@ 
+{ + "results": { + "arc_gu_challenge": { + "acc": 0.22996515679442509, + "acc_stderr": 0.024883025883424517, + "acc_norm": 0.24390243902439024, + "acc_norm_stderr": 0.025392997717581856 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_gu_challenge_llama-7B.json b/evals/arc-challenge/arc_gu_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..12e906c731a45f8bd9b92a525fa2d3edc9a6f62e --- /dev/null +++ b/evals/arc-challenge/arc_gu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_gu_challenge": { + "acc": 0.20557491289198607, + "acc_stderr": 0.023896181928798988, + "acc_norm": 0.26480836236933797, + "acc_norm_stderr": 0.026090542561414385 + } + }, + "versions": { + "arc_gu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..474da43c63438f6e87405fb3780c9b001241b895 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.21140939597315436, + "acc_stderr": 0.02369243605357901, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159625 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-560.json b/evals/arc-challenge/arc_hi_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..1606ed0007915536346cb01b3395ab2cb67b09a9 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.19798657718120805, + "acc_stderr": 0.023122269968056355, + "acc_norm": 0.2181208053691275, + "acc_norm_stderr": 0.023962942745646806 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5660d5853f1219cfdbd0d886a4fccd9e6a3ab2b --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.25838926174496646, + "acc_stderr": 0.025400777524610105, + "acc_norm": 0.29194630872483224, + "acc_norm_stderr": 0.026381917944561784 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": 
"pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-large.json b/evals/arc-challenge/arc_hi_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..e6870360e984b19d105ccc86592d36a7564ff98a --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.22818791946308725, + "acc_stderr": 0.024351397257610513, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..f64cba429b30075841311a50303cbff1487551af --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.27181208053691275, + "acc_norm_stderr": 0.025815342279487567 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2.json b/evals/arc-challenge/arc_hi_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ccb8fb7bd3bc4c523ed703b76c3d2526c010107 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.2181208053691275, + "acc_stderr": 0.023962942745646785, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485925 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hi_challenge_llama-7B.json b/evals/arc-challenge/arc_hi_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..90d5c1ec99c8e977e4997800431e69a1dc078659 --- /dev/null +++ b/evals/arc-challenge/arc_hi_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hi_challenge": { + "acc": 0.20469798657718122, + "acc_stderr": 0.02341232810510543, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_hi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json 
b/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ea79c0ffc6047bb74b51d401771a577f7b2a2e --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.025025521384235302, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.025471492792791692 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-560.json b/evals/arc-challenge/arc_hr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..d0388389e9fdfe66978f0bb663af6b9c14905b74 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322117, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801963 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..27a6b5e7862ae33a52b4fcee86a333d1819e8514 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.02479026042346899, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.026540687854980666 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-large.json b/evals/arc-challenge/arc_hr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..daac6d38e4cc4974c0a8b524053297e0971694a9 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.0227352759557704, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.02429399929295737 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..b69e7a89e1d024529a1ccfa184f0ed211ab024e6 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + 
"arc_hr_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.0227352759557704, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.024293999292957367 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2.json b/evals/arc-challenge/arc_hr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..d27da666a194a216383a01fe3c520895dbaada29 --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.19528619528619529, + "acc_stderr": 0.02304149438665811, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050875 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hr_challenge_llama-7B.json b/evals/arc-challenge/arc_hr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cc0a77d97f36393c01b3325f7f341ed832c808cb --- /dev/null +++ b/evals/arc-challenge/arc_hr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hr_challenge": { + "acc": 0.2996632996632997, + "acc_stderr": 0.026627130450114996, + "acc_norm": 0.3468013468013468, + "acc_norm_stderr": 0.027664139917201607 + } + }, + "versions": { + "arc_hr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..d6ee518fa194a5cab2b0fcc73ab71cfa9a4c7938 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271476, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.023761611918761676 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-560.json b/evals/arc-challenge/arc_hu_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..4326e9a449bfff5b4bffcb01ae73902068b16858 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282595, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.024790260423468987 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": 
"1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7638b2f77f7140b0c0af0df71d4b9e1fd457bfb3 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.025682629556652854 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-large.json b/evals/arc-challenge/arc_hu_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7113da6667b32d4460a28d91f71e3e716239d0 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.023761611918761655, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9f05d0f663b1d94cfc4087ba1aae889603546e4a --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.02466946003490763, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.026177438014745417 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2.json b/evals/arc-challenge/arc_hu_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..3cdc244f3a355351f2b2e8826aed014e23f29fab --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.2053872053872054, + "acc_stderr": 0.023481109518599295, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.025252525252525353 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hu_challenge_llama-7B.json b/evals/arc-challenge/arc_hu_challenge_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0add74575f51f34aaed4497cfc6e42d0d8d9bc9 --- /dev/null +++ b/evals/arc-challenge/arc_hu_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hu_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.30976430976430974, + "acc_norm_stderr": 0.0268762417790141 + } + }, + "versions": { + "arc_hu_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c569232cfdeeffa2b9c398fa8102342e55669d6d --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.2206896551724138, + "acc_stderr": 0.024394801425351647, + "acc_norm": 0.27241379310344827, + "acc_norm_stderr": 0.026188332965202905 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-560.json b/evals/arc-challenge/arc_hy_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..38b99f7004830ebf484274ad893c53cff9de33a4 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.19655172413793104, + "acc_stderr": 0.023375906908472157, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.02541251077219611 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..6c5bcfbaa2c0570aa97441fc418e71f242460803 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.18620689655172415, + "acc_stderr": 0.022898443475326664, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.02608364690576629 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-large.json b/evals/arc-challenge/arc_hy_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..d3fa3d404e18049ccef76e50f8abe3deed88b1e6 --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.19310344827586207, + 
"acc_stderr": 0.02321961545031108, + "acc_norm": 0.23793103448275862, + "acc_norm_stderr": 0.025048040852790374 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..a8f1fd794a777a25dca5bd3d54b52082a503039d --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.20689655172413793, + "acc_stderr": 0.02382827611454507, + "acc_norm": 0.25862068965517243, + "acc_norm_stderr": 0.025757454562272446 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2.json b/evals/arc-challenge/arc_hy_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..a6b0c05a8a5c5112ef3326264ffa348cbe02c2ff --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.1793103448275862, + "acc_stderr": 0.022565410117928373, + "acc_norm": 0.27241379310344827, + "acc_norm_stderr": 0.026188332965202905 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_hy_challenge_llama-7B.json b/evals/arc-challenge/arc_hy_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..76c60ed9c16ffa50256b3420a3d1c544d27d0f8a --- /dev/null +++ b/evals/arc-challenge/arc_hy_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_hy_challenge": { + "acc": 0.2206896551724138, + "acc_stderr": 0.024394801425351637, + "acc_norm": 0.30344827586206896, + "acc_norm_stderr": 0.02704394858012006 + } + }, + "versions": { + "arc_hy_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-1b7.json b/evals/arc-challenge/arc_id_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..8edb6191b5ef4693fcf7dfc5cfad9800d7044c56 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.2986577181208054, + "acc_stderr": 0.026556672487880535, + "acc_norm": 0.2751677852348993, + "acc_norm_stderr": 0.025914289910427518 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": 
null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-560.json b/evals/arc-challenge/arc_id_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..1d88eb711d44c2d77c4554d4f4d6e553aa1209eb --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898963, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007423 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_bloom-7b1.json b/evals/arc-challenge/arc_id_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d6908c8177308068c88e133ad1287687c46dcce --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.3187919463087248, + "acc_stderr": 0.027040538296634997, + "acc_norm": 0.3825503355704698, + "acc_norm_stderr": 0.028201151940879375 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-large.json b/evals/arc-challenge/arc_id_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..ab5432ed0c027006e5940d1dbd8e9231eccd5ab0 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.02471995149315962, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-medium.json b/evals/arc-challenge/arc_id_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..156b2294f71673c6950d132b56805c5e36900b92 --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264114, + "acc_norm": 0.2483221476510067, + "acc_norm_stderr": 0.025069483148037884 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_gpt2.json b/evals/arc-challenge/arc_id_challenge_gpt2.json new file mode 100644 index 
0000000000000000000000000000000000000000..ef1ed97c321fe9cc50de905c218517b2d6bb812d --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.024719951493159628, + "acc_norm": 0.2785234899328859, + "acc_norm_stderr": 0.026011403578485907 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_id_challenge_llama-7B.json b/evals/arc-challenge/arc_id_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..531f6f81397ca5506b0f36d1291417201eb9b72e --- /dev/null +++ b/evals/arc-challenge/arc_id_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_id_challenge": { + "acc": 0.23154362416107382, + "acc_stderr": 0.024476414420146617, + "acc_norm": 0.28523489932885904, + "acc_norm_stderr": 0.02620021021413825 + } + }, + "versions": { + "arc_id_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-1b7.json b/evals/arc-challenge/arc_it_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c38c75e09195bcf94e26d180f17837747473c6f7 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2558922558922559, + "acc_stderr": 0.025363000375801963, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-560.json b/evals/arc-challenge/arc_it_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..a1001fcc2f2df8d064ae2cefca3cbcf0212ed670 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282612, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_bloom-7b1.json b/evals/arc-challenge/arc_it_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe8c476fe99201a63e06353589f9b571026510a6 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.02490893747050875, + "acc_norm": 
0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-large.json b/evals/arc-challenge/arc_it_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..2508d33a6975391a9665c19ebb10213e84bd23da --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.025252525252525342 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-medium.json b/evals/arc-challenge/arc_it_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..2663af9d466539843f48e70d58dd9a236db69c79 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.02429399929295737, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.025886127156886297 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_gpt2.json b/evals/arc-challenge/arc_it_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..611874b61c1374b902d583cf5cefbc4492ed6ac6 --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.22895622895622897, + "acc_stderr": 0.024421362642271068, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235284 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_it_challenge_llama-7B.json b/evals/arc-challenge/arc_it_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..026bc2c2a59b0b1e397e34c3f50a439cc3237e6c --- /dev/null +++ b/evals/arc-challenge/arc_it_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_it_challenge": { + "acc": 0.3164983164983165, + "acc_stderr": 0.02703395838420781, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.02746823841289221 + } + }, + "versions": { + "arc_it_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 
100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..d30129acdd6c23d97224155d05ff525778afc39a --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2097902097902098, + "acc_stderr": 0.024118005042923673, + "acc_norm": 0.25874125874125875, + "acc_norm_stderr": 0.025941514501247074 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-560.json b/evals/arc-challenge/arc_kn_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..9061ffd18bb78ef2415b46937475b366aaba5e70 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2097902097902098, + "acc_stderr": 0.024118005042923676, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.026380954549454924 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..083303db0d99abb50df9664e66431757fcbc34cf --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.2062937062937063, + "acc_stderr": 0.023969030679396822, + "acc_norm": 0.27972027972027974, + "acc_norm_stderr": 0.02658827368712313 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-large.json b/evals/arc-challenge/arc_kn_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..cc1d0795f8679f5f353a8fe04a823ce8944d6180 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.24125874125874125, + "acc_stderr": 0.02534346249658375, + "acc_norm": 0.2062937062937063, + "acc_norm_stderr": 0.02396903067939682 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json new file mode 100644 index 
0000000000000000000000000000000000000000..3272316d0c0fa316ff58bd4f0a3c248c27457501 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.23076923076923078, + "acc_stderr": 0.02495714171242502, + "acc_norm": 0.23426573426573427, + "acc_norm_stderr": 0.025088286217169773 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2.json b/evals/arc-challenge/arc_kn_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..06e41e33136f376ee8441914155f63301d2b3150 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.21678321678321677, + "acc_stderr": 0.02440795482238759, + "acc_norm": 0.1993006993006993, + "acc_norm_stderr": 0.023662831210753306 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_kn_challenge_llama-7B.json b/evals/arc-challenge/arc_kn_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..54ade592ef4b8faca4ac733019e8a288ffcd7080 --- /dev/null +++ b/evals/arc-challenge/arc_kn_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_kn_challenge": { + "acc": 0.25524475524475526, + "acc_stderr": 0.025826334320570847, + "acc_norm": 0.2762237762237762, + "acc_norm_stderr": 0.026485626798716456 + } + }, + "versions": { + "arc_kn_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..237a4de001e4d03d3a5da1bd85ff383ee5ed3641 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.20270270270270271, + "acc_stderr": 0.023406091994174035, + "acc_norm": 0.20945945945945946, + "acc_norm_stderr": 0.023691963473475734 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-560.json b/evals/arc-challenge/arc_ml_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..b276b36482cf0a1c5ed243c8a17297e981587426 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.19932432432432431, + "acc_stderr": 0.02325934388926828, + "acc_norm": 0.23310810810810811, + 
"acc_norm_stderr": 0.024616978985669728 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..57e340993dc80aab56386e3c1ade388f4d786241 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.22635135135135134, + "acc_stderr": 0.024364215012920545, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.02423444993634421 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-large.json b/evals/arc-challenge/arc_ml_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..a23148b0cf58ef04dc9ab3bb8d26aedadda9296f --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.024491712953916972, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.024234449936344216 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..9aa842f5ce9d59030c7aae3de538f9b3ea816580 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.2533783783783784, + "acc_stderr": 0.0253235186291, + "acc_norm": 0.21283783783783783, + "acc_norm_stderr": 0.0238311783119674 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2.json b/evals/arc-challenge/arc_ml_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8fc7d983c690076289a5040bce6204cb0b9146 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.25, + "acc_stderr": 0.025210974204480537, + "acc_norm": 0.21283783783783783, + "acc_norm_stderr": 0.023831178311967415 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at 
end of file diff --git a/evals/arc-challenge/arc_ml_challenge_llama-7B.json b/evals/arc-challenge/arc_ml_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3f4555f5009cd795dea8981be98bec45e2ed9369 --- /dev/null +++ b/evals/arc-challenge/arc_ml_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ml_challenge": { + "acc": 0.21621621621621623, + "acc_stderr": 0.023967970439477224, + "acc_norm": 0.20270270270270271, + "acc_norm_stderr": 0.023406091994174035 + } + }, + "versions": { + "arc_ml_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..c8b3bb6a26b22a95c0a8de8ae3221f476963428f --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.24067796610169492, + "acc_stderr": 0.02493202205172924, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.02505088069031971 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-560.json b/evals/arc-challenge/arc_mr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..213f904f45633d7bdef01eef045a28ec2636faf5 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2440677966101695, + "acc_stderr": 0.025050880690319716, + "acc_norm": 0.22372881355932203, + "acc_norm_stderr": 0.02430491058853199 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6cfb61ab6cccf8da1ad0ec46c1bde46e11be82 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.23389830508474577, + "acc_stderr": 0.024687839412166384, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.025050880690319702 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-large.json b/evals/arc-challenge/arc_mr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..380f5aee1d555e85568122130af494663cb3123f --- /dev/null +++ 
b/evals/arc-challenge/arc_mr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2, + "acc_stderr": 0.023328473740792135, + "acc_norm": 0.2440677966101695, + "acc_norm_stderr": 0.025050880690319702 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..7df5889da7e82e2529e4532947c4e0e8507ba94c --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2, + "acc_stderr": 0.023328473740792135, + "acc_norm": 0.22372881355932203, + "acc_norm_stderr": 0.024304910588531993 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2.json b/evals/arc-challenge/arc_mr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..8344c19a2efa7d7c252e94ea149ef5b421b34214 --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.18305084745762712, + "acc_stderr": 0.02255328043040195, + "acc_norm": 0.2033898305084746, + "acc_norm_stderr": 0.023475447251410726 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_mr_challenge_llama-7B.json b/evals/arc-challenge/arc_mr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f1cf03e6c1c130bd7352dd7963fe03ae5f4303fe --- /dev/null +++ b/evals/arc-challenge/arc_mr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_mr_challenge": { + "acc": 0.2271186440677966, + "acc_stderr": 0.024434819973932945, + "acc_norm": 0.2711864406779661, + "acc_norm_stderr": 0.025927971596786177 + } + }, + "versions": { + "arc_mr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..9ef6fea604fc9172e63676717b7455a756bbbd4e --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.024164379788935486, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.026711859553317677 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + 
"model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-560.json b/evals/arc-challenge/arc_ne_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..490a9ae38f7edf0f013f898d0c075db2184dc99b --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02547149279279167, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.02627090829835463 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1c6c30b759cb29ce78c358d0d709a7b53f16f3 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.024908937470508766, + "acc_norm": 0.2996632996632997, + "acc_norm_stderr": 0.02662713045011499 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-large.json b/evals/arc-challenge/arc_ne_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..82b4b764b3fb7ef15563ca6d2c27830e3aef8d51 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.024790260423468984, + "acc_norm": 0.23905723905723905, + "acc_norm_stderr": 0.02479026042346898 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..18464b4f845260d9e4122a7c74c4fc758519296a --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.23905723905723905, + "acc_stderr": 0.024790260423468984, + "acc_norm": 0.24579124579124578, + "acc_norm_stderr": 0.025025521384235295 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2.json 
b/evals/arc-challenge/arc_ne_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..669e0661f7894b2bdc02512e274ab12a340e6f2c --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2356902356902357, + "acc_stderr": 0.024669460034907637, + "acc_norm": 0.2255892255892256, + "acc_norm_stderr": 0.02429399929295737 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ne_challenge_llama-7B.json b/evals/arc-challenge/arc_ne_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a22c844ed32434eb2d404f76e104c502e7218625 --- /dev/null +++ b/evals/arc-challenge/arc_ne_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ne_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.025682629556652858 + } + }, + "versions": { + "arc_ne_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json b/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..de6df0fa84c07702ad9d3005757f4412e835e175 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.20469798657718122, + "acc_stderr": 0.02341232810510543, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028484 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-560.json b/evals/arc-challenge/arc_nl_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..4bd9dec46927eea8709a44925f7f7f5e4d35c055 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.024224169829650748, + "acc_norm": 0.2651006711409396, + "acc_norm_stderr": 0.025611859712206003 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json b/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5360e3ed9ed9f43f4cbddc65166e1d83d89a29e6 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 
0.20134228187919462, + "acc_stderr": 0.0232685657676853, + "acc_norm": 0.2684563758389262, + "acc_norm_stderr": 0.025714539514817496 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-large.json b/evals/arc-challenge/arc_nl_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..432863c5e4840c2d01bdac986765c61050413f9f --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264114, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615855 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json b/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..65d7c05ced99e1bd53aa3110a033d9c0975025fa --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.23154362416107382, + "acc_stderr": 0.024476414420146628, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712687 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2.json b/evals/arc-challenge/arc_nl_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..bce39d9e1424be6bf01a0c15447e59c3348a08d6 --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.21476510067114093, + "acc_stderr": 0.023828868848284373, + "acc_norm": 0.24496644295302014, + "acc_norm_stderr": 0.024955035980898956 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_nl_challenge_llama-7B.json b/evals/arc-challenge/arc_nl_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b3e1e927abac3aba0720a5085b3a1b041af85b --- /dev/null +++ b/evals/arc-challenge/arc_nl_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_nl_challenge": { + "acc": 0.2953020134228188, + "acc_stderr": 0.026470155629081078, + "acc_norm": 0.32550335570469796, + "acc_norm_stderr": 0.027188760373954457 + } + }, + "versions": { + "arc_nl_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + 
"device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json b/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..86206aa4c02654dee089146263800252a9280415 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.024224169829650755, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007426 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-560.json b/evals/arc-challenge/arc_pt_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..11021802d7ffa732fc84739fd8ec1d531dc637b6 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.22483221476510068, + "acc_stderr": 0.02422416982965075, + "acc_norm": 0.23154362416107382, + "acc_norm_stderr": 0.02447641442014662 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json b/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f27045095eca6ce035e90605bdff561f37a5a8 --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.348993288590604, + "acc_stderr": 0.02765814479375022, + "acc_norm": 0.3724832214765101, + "acc_norm_stderr": 0.02805354855477509 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-large.json b/evals/arc-challenge/arc_pt_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..fd1a4b8d1948d7ebf686b68f03b68fae0c5e41de --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.18791946308724833, + "acc_stderr": 0.022667687029933926, + "acc_norm": 0.24161073825503357, + "acc_norm_stderr": 0.024838535108028477 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json b/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json new file mode 100644 
index 0000000000000000000000000000000000000000..0380aff06ff37610aa48dddf5d15f62376f1d08b --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.18120805369127516, + "acc_stderr": 0.02235101779623449, + "acc_norm": 0.2348993288590604, + "acc_norm_stderr": 0.024599255015999244 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2.json b/evals/arc-challenge/arc_pt_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6a1952ed53a80de06750b3d6155487089a0672bd --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.19463087248322147, + "acc_stderr": 0.022973392306598166, + "acc_norm": 0.2483221476510067, + "acc_norm_stderr": 0.025069483148037884 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_pt_challenge_llama-7B.json b/evals/arc-challenge/arc_pt_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e49526aa9a3f1e1f7fda72f9bf9b3a58227a95ce --- /dev/null +++ b/evals/arc-challenge/arc_pt_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_pt_challenge": { + "acc": 0.32550335570469796, + "acc_stderr": 0.027188760373954457, + "acc_norm": 0.33557046979865773, + "acc_norm_stderr": 0.027399214125091453 + } + }, + "versions": { + "arc_pt_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..bd189e9050be188d43e3bac19cd42c400c5df7c8 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.026270908298354635 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-560.json b/evals/arc-challenge/arc_ro_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..a797f1ebfa7d92e0c78e624b99da52e77c92822c --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271473, + "acc_norm": 
0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e63a3d72b4f1a770523a9859787818e4e1ed26e --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.025252525252525346, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.02671185955331767 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-large.json b/evals/arc-challenge/arc_ro_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..68f4f45196bec82ad2ec165f33cae93bfbedbe44 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.022735275955770386, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.026177438014745407 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..5df0a11438afe98b491a6e5528d70eacb48652cf --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.18855218855218855, + "acc_stderr": 0.022735275955770375, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801976 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2.json b/evals/arc-challenge/arc_ro_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..37203889a39601337bd2d8ffcd85a3e4693013ad --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.02362258775627147, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.026540687854980673 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ro_challenge_llama-7B.json b/evals/arc-challenge/arc_ro_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..37d943e737472a25d2c879425d478f6dd746e1f4 --- /dev/null +++ b/evals/arc-challenge/arc_ro_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ro_challenge": { + "acc": 0.2828282828282828, + "acc_stderr": 0.02617743801474542, + "acc_norm": 0.3164983164983165, + "acc_norm_stderr": 0.027033958384207805 + } + }, + "versions": { + "arc_ro_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..fc9a3f783edc283ec79c7906da73bc8a27f80a9d --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.02525252525252537, + "acc_norm": 0.3569023569023569, + "acc_norm_stderr": 0.027846288057490554 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-560.json b/evals/arc-challenge/arc_ru_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..863c94dcc4459d25ef7faec70a11d6199434c8af --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.24915824915824916, + "acc_stderr": 0.025140041284626418, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.027399831217559588 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5b61e526e728d5523f1e61b4fe49307c1c872c4c --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.25925925925925924, + "acc_stderr": 0.025471492792791674, + "acc_norm": 0.32996632996632996, + "acc_norm_stderr": 0.02732985145570343 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-large.json b/evals/arc-challenge/arc_ru_challenge_gpt2-large.json new file mode 100644 index 
0000000000000000000000000000000000000000..fd367513e4157fb1556348f212a5c6e94922beee --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.24579124579124578, + "acc_stderr": 0.02502552138423529, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.026452514969665924 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..8a7b6aee643ab931ddd7a2528c36075699604170 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697, + "acc_norm": 0.2558922558922559, + "acc_norm_stderr": 0.025363000375801963 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2.json b/evals/arc-challenge/arc_ru_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6c01167509035c09b2ab40ba64c6f23d0d3b61c6 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.19865319865319866, + "acc_stderr": 0.023190610381322137, + "acc_norm": 0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ru_challenge_llama-7B.json b/evals/arc-challenge/arc_ru_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c6af8bacc84e8232e587af0b1b62f0360595f5b8 --- /dev/null +++ b/evals/arc-challenge/arc_ru_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ru_challenge": { + "acc": 0.2895622895622896, + "acc_stderr": 0.026362594432681956, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.027399831217559577 + } + }, + "versions": { + "arc_ru_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..5c061cbf7e912082f72face7e42633294acb46b4 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2516778523489933, + "acc_stderr": 0.02518190461061586, + "acc_norm": 0.2516778523489933, + 
"acc_norm_stderr": 0.025181904610615865 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-560.json b/evals/arc-challenge/arc_sk_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..77221ca57be5ff0cc96e73fc774d0670d7c7208c --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.22483221476510068, + "acc_norm_stderr": 0.02422416982965075 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d78271208e5af3f6496e645f8b79b3b7394aa34 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-large.json b/evals/arc-challenge/arc_sk_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..128f662c32c44780afb9fd950815540a151364d6 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.24161073825503357, + "acc_stderr": 0.02483853510802848, + "acc_norm": 0.2516778523489933, + "acc_norm_stderr": 0.025181904610615858 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..75bc31afba2a470fbe33869562f865ae458240c8 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.23825503355704697, + "acc_stderr": 0.02471995149315962, + "acc_norm": 0.24496644295302014, + "acc_norm_stderr": 0.02495503598089895 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + 
"description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2.json b/evals/arc-challenge/arc_sk_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..28459f8e1e1dc32e8d92343933fa438b717eb85b --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.23154362416107382, + "acc_norm_stderr": 0.02447641442014662 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sk_challenge_llama-7B.json b/evals/arc-challenge/arc_sk_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3701c2f5034fd64259683639da7b904f8bf0d1d1 --- /dev/null +++ b/evals/arc-challenge/arc_sk_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sk_challenge": { + "acc": 0.2348993288590604, + "acc_stderr": 0.024599255015999244, + "acc_norm": 0.2550335570469799, + "acc_norm_stderr": 0.025292327380712683 + } + }, + "versions": { + "arc_sk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..dbdcdb6f40e4a2a2d630ac6967d84266a19ee386 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.024860949670846393, + "acc_norm": 0.2635135135135135, + "acc_norm_stderr": 0.025649141242391035 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-560.json b/evals/arc-challenge/arc_sr_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..f4e4aafa24a952d05d4ff3efde104237233e2747 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.22972972972972974, + "acc_stderr": 0.02449171295391697, + "acc_norm": 0.27702702702702703, + "acc_norm_stderr": 0.02605620088360472 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e70cc59ff97ac76e9506b0a8c29249c91543af45 --- /dev/null +++ 
b/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.26013513513513514, + "acc_stderr": 0.025542576393640232, + "acc_norm": 0.30067567567567566, + "acc_norm_stderr": 0.026697921821786215 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-large.json b/evals/arc-challenge/arc_sr_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..381e33947c532c85c78a23c4986d737ed19bc7e1 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.1891891891891892, + "acc_stderr": 0.022803258753373676, + "acc_norm": 0.24324324324324326, + "acc_norm_stderr": 0.024979718407699757 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..d59206fddbda1dfd8cd1e6514ca6cba7f09dd45b --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.20608108108108109, + "acc_stderr": 0.023550282959294247, + "acc_norm": 0.24662162162162163, + "acc_norm_stderr": 0.02509638351759426 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2.json b/evals/arc-challenge/arc_sr_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..ed4d03dcbbbdb78f9e36972c6c09ea65f958accf --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.18243243243243243, + "acc_stderr": 0.0224854634796718, + "acc_norm": 0.22972972972972974, + "acc_norm_stderr": 0.024491712953916972 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sr_challenge_llama-7B.json b/evals/arc-challenge/arc_sr_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..9a1c5c3f8986ce3acbf704e6d2fbd4d82fbcc724 --- /dev/null +++ b/evals/arc-challenge/arc_sr_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sr_challenge": { + "acc": 0.2905405405405405, + "acc_stderr": 0.026433590266607382, + "acc_norm": 0.2972972972972973, + "acc_norm_stderr": 0.02661155695908287 + } + }, + "versions": { + "arc_sr_challenge": 0 + }, + "config": { 
+ "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..962c6f1d023be86a6fa7adf0d018a08eda14f1b8 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.20202020202020202, + "acc_stderr": 0.023337132573282605, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-560.json b/evals/arc-challenge/arc_sv_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..9477cbe0f42a6cdde99f9a0af2293c4b1c23cf00 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.02376161191876168, + "acc_norm": 0.2053872053872054, + "acc_norm_stderr": 0.023481109518599313 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..c89c1d01bfea674f9f7d9549f8abf2abe32192f8 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2-large.json b/evals/arc-challenge/arc_sv_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c090b83981933a41b620746123d08d4ba90f53a2 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.22895622895622897, + "acc_stderr": 0.02442136264227106, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..31f537c4fb8157ec63b8cbcb4d2001cfd08e1533 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2.json b/evals/arc-challenge/arc_sv_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..718a97a6d9df935c9f0818257fda43ef3bfc7996 --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2255892255892256, + "acc_stderr": 0.024293999292957367, + "acc_norm": 0.2356902356902357, + "acc_norm_stderr": 0.024669460034907637 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_sv_challenge_llama-7B.json b/evals/arc-challenge/arc_sv_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c2c4e7550c402c4d3dbaf7d6ea56dbf864c439ce --- /dev/null +++ b/evals/arc-challenge/arc_sv_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_sv_challenge": { + "acc": 0.2962962962962963, + "acc_stderr": 0.026540687854980646, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.02671185955331767 + } + }, + "versions": { + "arc_sv_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..a937aa6dd9066efa74a5b88515612f7dc4ba6691 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.21283783783783783, + "acc_stderr": 0.02383117831196738, + "acc_norm": 0.25675675675675674, + "acc_norm_stderr": 0.025434043955304575 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-560.json b/evals/arc-challenge/arc_ta_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..6b1c389d448803dd7a2c483cec6aa7ff1876c4a6 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + 
"results": { + "arc_ta_challenge": { + "acc": 0.19932432432432431, + "acc_stderr": 0.02325934388926828, + "acc_norm": 0.2533783783783784, + "acc_norm_stderr": 0.025323518629100025 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a5da07219683283eaafbda47b1ed0957be400dda --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.23310810810810811, + "acc_stderr": 0.024616978985669728, + "acc_norm": 0.24324324324324326, + "acc_norm_stderr": 0.02497971840769973 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-large.json b/evals/arc-challenge/arc_ta_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..918cb1c7f6be3a7693ecf8713714c664843cfc38 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.21283783783783783, + "acc_stderr": 0.02383117831196738, + "acc_norm": 0.23310810810810811, + "acc_norm_stderr": 0.024616978985669724 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..6af3ab31fdcf16311ec8594bad8ee052c05b16bc --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.2195945945945946, + "acc_stderr": 0.02410238110604679, + "acc_norm": 0.2668918918918919, + "acc_norm_stderr": 0.025753762926257903 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2.json b/evals/arc-challenge/arc_ta_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..5245a03aac201f65f42e53dcabf6d1f7c0717d52 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.23986486486486486, + "acc_stderr": 0.024860949670846396, + "acc_norm": 0.26013513513513514, + "acc_norm_stderr": 0.025542576393640246 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", 
+ "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_ta_challenge_llama-7B.json b/evals/arc-challenge/arc_ta_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..241feef032d750202d858fbc9162e3549a178160 --- /dev/null +++ b/evals/arc-challenge/arc_ta_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_ta_challenge": { + "acc": 0.20270270270270271, + "acc_stderr": 0.02340609199417405, + "acc_norm": 0.22297297297297297, + "acc_norm_stderr": 0.02423444993634422 + } + }, + "versions": { + "arc_ta_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-1b7.json b/evals/arc-challenge/arc_te_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9a2c9841dcb9e494770a8c9199b82c8ab4c9f7 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.21897810218978103, + "acc_stderr": 0.02502941075517834, + "acc_norm": 0.2591240875912409, + "acc_norm_stderr": 0.026518277256436896 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-560.json b/evals/arc-challenge/arc_te_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..0d326f4a1b5d45a12a085af0588dc48da1242b19 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939208 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_bloom-7b1.json b/evals/arc-challenge/arc_te_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c6d34bb9da6f86f1a4494caba49a2d1bab46bcf --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.20072992700729927, + "acc_stderr": 0.024242171306158907, + "acc_norm": 0.25547445255474455, + "acc_norm_stderr": 0.026395641265678074 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-large.json 
b/evals/arc-challenge/arc_te_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..226ed83458102ea0a3f4161159558d6ae8875357 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939208 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-medium.json b/evals/arc-challenge/arc_te_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..a5bd92092ab22f31db2d36d69626c32b485ab331 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.2116788321167883, + "acc_stderr": 0.02472344500978517, + "acc_norm": 0.22992700729927007, + "acc_norm_stderr": 0.025467107178386465 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_gpt2.json b/evals/arc-challenge/arc_te_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..c6b5f06c5f92b644a3c4ac037330810277460f0a --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.22627737226277372, + "acc_stderr": 0.02532397574413385, + "acc_norm": 0.24087591240875914, + "acc_norm_stderr": 0.025880445559939215 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_te_challenge_llama-7B.json b/evals/arc-challenge/arc_te_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a20fb71e7ce5932ff220ab3a23466714b469cd51 --- /dev/null +++ b/evals/arc-challenge/arc_te_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_te_challenge": { + "acc": 0.24087591240875914, + "acc_stderr": 0.025880445559939215, + "acc_norm": 0.26277372262773724, + "acc_norm_stderr": 0.026638517193281797 + } + }, + "versions": { + "arc_te_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..72eee1e288b03359fecf649039ec7e1a796086ee --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.24579124579124578, 
+ "acc_stderr": 0.025025521384235305, + "acc_norm": 0.28619528619528617, + "acc_norm_stderr": 0.026270908298354635 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-560.json b/evals/arc-challenge/arc_uk_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5e9d5a99c327e81413b16eb715a91e70b6c5b3 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.265993265993266, + "acc_stderr": 0.02568262955665285, + "acc_norm": 0.2895622895622896, + "acc_norm_stderr": 0.026362594432681956 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2cc6b833fb7540bcca14af70e018d3eb236524 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893547, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-large.json b/evals/arc-challenge/arc_uk_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c03f6ddf265c02f0fc83f91f5c16d2586666d682 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.23232323232323232, + "acc_stderr": 0.02454650495612789, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..51083b7158f2de8700c8c253b7e5e98eba1626a9 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02416437978893546, + "acc_norm": 0.265993265993266, + "acc_norm_stderr": 0.02568262955665285 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + 
"no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2.json b/evals/arc-challenge/arc_uk_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..e32104934ab1fe23828d680bf766e04e93ea044a --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.21212121212121213, + "acc_stderr": 0.023761611918761662, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.02490893747050876 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_uk_challenge_llama-7B.json b/evals/arc-challenge/arc_uk_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a02491cf171678a4ddc940caa47d4c778b0e3cf5 --- /dev/null +++ b/evals/arc-challenge/arc_uk_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_uk_challenge": { + "acc": 0.30976430976430974, + "acc_stderr": 0.026876241779014095, + "acc_norm": 0.3367003367003367, + "acc_norm_stderr": 0.027468238412892212 + } + }, + "versions": { + "arc_uk_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..508c46f8cd77b71773ecc8623d362eae91a1dc3f --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.24496644295302014, + "acc_stderr": 0.024955035980898942, + "acc_norm": 0.28187919463087246, + "acc_norm_stderr": 0.026106703750007423 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-560.json b/evals/arc-challenge/arc_vi_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..70d9cffdbf7b3adea2bbded15e8a36d7f930b24b --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2483221476510067, + "acc_stderr": 0.025069483148037874, + "acc_norm": 0.25838926174496646, + "acc_norm_stderr": 0.025400777524610105 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json new file mode 100644 index 
0000000000000000000000000000000000000000..f1588613ea4565257bfb7f46328c5e696a1434de --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.3087248322147651, + "acc_stderr": 0.02680606307294056, + "acc_norm": 0.3288590604026846, + "acc_norm_stderr": 0.02726048303556786 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-large.json b/evals/arc-challenge/arc_vi_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..c071ea16496ed3627a0dc0840835a827894a8a61 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.18120805369127516, + "acc_stderr": 0.02235101779623446, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159628 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb1f34c59a21cb916520b7e956a1bd193ba1395 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.023553603370264103, + "acc_norm": 0.23825503355704697, + "acc_norm_stderr": 0.024719951493159628 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2.json b/evals/arc-challenge/arc_vi_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..6f912cfc57fb3d8efe3773d82b7a95532a6f69b0 --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.2080536912751678, + "acc_stderr": 0.0235536033702641, + "acc_norm": 0.2080536912751678, + "acc_norm_stderr": 0.023553603370264124 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_vi_challenge_llama-7B.json b/evals/arc-challenge/arc_vi_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8427c0ad1958ea7ad114255f020f43c5d50d076c --- /dev/null +++ b/evals/arc-challenge/arc_vi_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_vi_challenge": { + "acc": 0.1912751677852349, + "acc_stderr": 0.022821882255340997, + "acc_norm": 0.2516778523489933, + 
"acc_norm_stderr": 0.025181904610615855 + } + }, + "versions": { + "arc_vi_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json b/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json new file mode 100644 index 0000000000000000000000000000000000000000..4626e7c607b4dd4f9c82472abe983c30203c245c --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.25252525252525254, + "acc_stderr": 0.025252525252525356, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.025471492792791674 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-1b7", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-560.json b/evals/arc-challenge/arc_zh_challenge_bloom-560.json new file mode 100644 index 0000000000000000000000000000000000000000..127c0ce8f0b322902ecae312152c6905394bf82e --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-560.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.24242424242424243, + "acc_stderr": 0.024908937470508753, + "acc_norm": 0.26936026936026936, + "acc_norm_stderr": 0.025785321789052268 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-560m", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json b/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b488311a8cccbd9e611c8abe983c979453acd882 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.3400673400673401, + "acc_stderr": 0.027535084762190663, + "acc_norm": 0.367003367003367, + "acc_norm_stderr": 0.028014951100692458 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-large.json b/evals/arc-challenge/arc_zh_challenge_gpt2-large.json new file mode 100644 index 0000000000000000000000000000000000000000..b20ff9d4fb351205e7abdc821a99a7a9c62aa9c6 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2-large.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697, + "acc_norm": 0.24915824915824916, + "acc_norm_stderr": 0.025140041284626418 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-large", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + 
"bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json b/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9d9b64694a7c0355b5de8e14577532c3e16db0 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.21548821548821548, + "acc_stderr": 0.023898224834697005, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.02454650495612789 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2-medium", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2.json b/evals/arc-challenge/arc_zh_challenge_gpt2.json new file mode 100644 index 0000000000000000000000000000000000000000..d8da342e3dfff17d37f9f34a3f90753cb4850243 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_gpt2.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.20875420875420875, + "acc_stderr": 0.023622587756271476, + "acc_norm": 0.22895622895622897, + "acc_norm_stderr": 0.02442136264227106 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=gpt2", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/arc-challenge/arc_zh_challenge_llama-7B.json b/evals/arc-challenge/arc_zh_challenge_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..51e82fa68d852ff2bafe284c29d895d2422b66e9 --- /dev/null +++ b/evals/arc-challenge/arc_zh_challenge_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "arc_zh_challenge": { + "acc": 0.2558922558922559, + "acc_stderr": 0.02536300037580196, + "acc_norm": 0.27946127946127947, + "acc_norm_stderr": 0.026082164400369843 + } + }, + "versions": { + "arc_zh_challenge": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ar_bloom-7b1.json b/evals/hellaswag/hellaswag_ar_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..69248e00b845c50b1eb8379e9d0ec05aaffc075d --- /dev/null +++ b/evals/hellaswag/hellaswag_ar_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ar": { + "acc": 0.3561464690496949, + "acc_stderr": 0.004999249661771764, + "acc_norm": 0.43341325196163905, + "acc_norm_stderr": 0.005173461992734505 + } + }, + "versions": { + "hellaswag_ar": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ar_llama-7B.json b/evals/hellaswag/hellaswag_ar_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..53797549241b15b072b9f0ce5f8b12ea57bce437 --- /dev/null +++ 
b/evals/hellaswag/hellaswag_ar_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ar": { + "acc": 0.28040540540540543, + "acc_stderr": 0.004689581635445738, + "acc_norm": 0.3085222319093287, + "acc_norm_stderr": 0.004822023322058258 + } + }, + "versions": { + "hellaswag_ar": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_bn_bloom-7b1.json b/evals/hellaswag/hellaswag_bn_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7e6f1a343c04d236c977fa61b55e3bd8c74fa3f1 --- /dev/null +++ b/evals/hellaswag/hellaswag_bn_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_bn": { + "acc": 0.28381302748322873, + "acc_stderr": 0.004689968075947356, + "acc_norm": 0.3277429127894395, + "acc_norm_stderr": 0.004882866652334284 + } + }, + "versions": { + "hellaswag_bn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_bn_llama-7B.json b/evals/hellaswag/hellaswag_bn_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cb1676e09ecdce592c17a4ff25f63c87e2a2a971 --- /dev/null +++ b/evals/hellaswag/hellaswag_bn_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_bn": { + "acc": 0.26011685782298205, + "acc_stderr": 0.00456358696087763, + "acc_norm": 0.28251460722787275, + "acc_norm_stderr": 0.004683467388784859 + } + }, + "versions": { + "hellaswag_bn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ca_bloom-7b1.json b/evals/hellaswag/hellaswag_ca_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..fa322ff2eccfdf62925b1b79ced281791b64de0e --- /dev/null +++ b/evals/hellaswag/hellaswag_ca_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ca": { + "acc": 0.40186712983065564, + "acc_stderr": 0.005108421054557395, + "acc_norm": 0.5120495006513244, + "acc_norm_stderr": 0.005208233728494265 + } + }, + "versions": { + "hellaswag_ca": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ca_llama-7B.json b/evals/hellaswag/hellaswag_ca_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..4e0b22ebaf8ac031767a3f3ab1e4789d623a3c02 --- /dev/null +++ b/evals/hellaswag/hellaswag_ca_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ca": { + "acc": 0.38460703430308296, + "acc_stderr": 0.0050691072999641, + "acc_norm": 0.49565783760312637, + "acc_norm_stderr": 0.005209550302588167 + } + }, + "versions": { + "hellaswag_ca": 1 + }, + "config": { + "model": "hf-auto", + "model_args": 
"pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_da_bloom-7b1.json b/evals/hellaswag/hellaswag_da_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..248065e86f7721ea28ce5b176e014af8e2c365bf --- /dev/null +++ b/evals/hellaswag/hellaswag_da_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_da": { + "acc": 0.2806018269747448, + "acc_stderr": 0.00465795256586935, + "acc_norm": 0.31176786673831275, + "acc_norm_stderr": 0.004802289060894963 + } + }, + "versions": { + "hellaswag_da": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_da_llama-7B.json b/evals/hellaswag/hellaswag_da_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..158172ac8091f5c183cde64b120c8c32ef6b2da7 --- /dev/null +++ b/evals/hellaswag/hellaswag_da_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_da": { + "acc": 0.3730252552391188, + "acc_stderr": 0.005013710932255912, + "acc_norm": 0.46695325094035467, + "acc_norm_stderr": 0.005172309453152385 + } + }, + "versions": { + "hellaswag_da": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_de_bloom-7b1.json b/evals/hellaswag/hellaswag_de_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..1a42078cb7cf48cd71502713357d1faa121702cc --- /dev/null +++ b/evals/hellaswag/hellaswag_de_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_de": { + "acc": 0.2982493595217763, + "acc_stderr": 0.004726948912322779, + "acc_norm": 0.32418872758326217, + "acc_norm_stderr": 0.004836279708509382 + } + }, + "versions": { + "hellaswag_de": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_de_llama-7B.json b/evals/hellaswag/hellaswag_de_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a027e43f548e49b4fd7dd60cc606b68dc314cb9d --- /dev/null +++ b/evals/hellaswag/hellaswag_de_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_de": { + "acc": 0.39427900523001386, + "acc_stderr": 0.005049108443939032, + "acc_norm": 0.49855907780979825, + "acc_norm_stderr": 0.005165885308732062 + } + }, + "versions": { + "hellaswag_de": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_es_bloom-7b1.json b/evals/hellaswag/hellaswag_es_bloom-7b1.json new file mode 100644 index 
0000000000000000000000000000000000000000..7fd9710255ac60d17ca496eac2cdcfe416fd02be --- /dev/null +++ b/evals/hellaswag/hellaswag_es_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_es": { + "acc": 0.4372733091529763, + "acc_stderr": 0.0051237264293392815, + "acc_norm": 0.566567100490719, + "acc_norm_stderr": 0.005118554174253425 + } + }, + "versions": { + "hellaswag_es": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_es_llama-7B.json b/evals/hellaswag/hellaswag_es_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..571b2651d1c438f6d95ef828887f685893f506ff --- /dev/null +++ b/evals/hellaswag/hellaswag_es_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_es": { + "acc": 0.4311466666666667, + "acc_stderr": 0.005115053675969629, + "acc_norm": 0.5640533333333333, + "acc_norm_stderr": 0.0051217018246512425 + } + }, + "versions": { + "hellaswag_es": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_eu_bloom-7b1.json b/evals/hellaswag/hellaswag_eu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..aaa2bac442dd619e5e485a1c9bb7770c1aaad3e8 --- /dev/null +++ b/evals/hellaswag/hellaswag_eu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_eu": { + "acc": 0.27380695314187, + "acc_stderr": 0.004633608505053738, + "acc_norm": 0.31235154394299286, + "acc_norm_stderr": 0.00481588516396214 + } + }, + "versions": { + "hellaswag_eu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_eu_llama-7B.json b/evals/hellaswag/hellaswag_eu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f969135230558d6c41262c2194dbbe0e29c848f6 --- /dev/null +++ b/evals/hellaswag/hellaswag_eu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_eu": { + "acc": 0.25847549125458863, + "acc_stderr": 0.004549288692503547, + "acc_norm": 0.28719499028287626, + "acc_norm_stderr": 0.004701591142825526 + } + }, + "versions": { + "hellaswag_eu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_fr_bloom-7b1.json b/evals/hellaswag/hellaswag_fr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..737e5f885ea8810e330462182af605bac6f7338e --- /dev/null +++ b/evals/hellaswag/hellaswag_fr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_fr": { + "acc": 0.4255729278218034, + "acc_stderr": 0.005116827391881862, + "acc_norm": 0.5656457485542943, + "acc_norm_stderr": 0.005129684120180618 + } + }, + "versions": { + "hellaswag_fr": 1 + }, 
+ "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_fr_llama-7B.json b/evals/hellaswag/hellaswag_fr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3f0fd2446e8e689f67cfb568e162f7b4dba1a617 --- /dev/null +++ b/evals/hellaswag/hellaswag_fr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_fr": { + "acc": 0.4255729278218034, + "acc_stderr": 0.00511682739188186, + "acc_norm": 0.5566502463054187, + "acc_norm_stderr": 0.005141155729141772 + } + }, + "versions": { + "hellaswag_fr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_gu_bloom-7b1.json b/evals/hellaswag/hellaswag_gu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..0ef2b298131daf31fa9c77d37366818ba539e0bb --- /dev/null +++ b/evals/hellaswag/hellaswag_gu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_gu": { + "acc": 0.2683176189935249, + "acc_stderr": 0.004722752779022285, + "acc_norm": 0.30625922980802, + "acc_norm_stderr": 0.0049130651137809294 + } + }, + "versions": { + "hellaswag_gu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_gu_llama-7B.json b/evals/hellaswag/hellaswag_gu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a610259f2ef19c9db88847c04b399dfcbcc4a463 --- /dev/null +++ b/evals/hellaswag/hellaswag_gu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_gu": { + "acc": 0.2560490741792571, + "acc_stderr": 0.004652036002377334, + "acc_norm": 0.28899238895830964, + "acc_norm_stderr": 0.004831585233585411 + } + }, + "versions": { + "hellaswag_gu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hi_bloom-7b1.json b/evals/hellaswag/hellaswag_hi_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..63eeb2a2481895efb7ecade2660f0911184073b6 --- /dev/null +++ b/evals/hellaswag/hellaswag_hi_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hi": { + "acc": 0.31202209005947323, + "acc_stderr": 0.004774960194792877, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.004957653483174718 + } + }, + "versions": { + "hellaswag_hi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hi_llama-7B.json b/evals/hellaswag/hellaswag_hi_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..35969545033bac79e8237b539ab99ce740103734 --- /dev/null +++ b/evals/hellaswag/hellaswag_hi_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hi": { + "acc": 0.2729396771452846, + "acc_stderr": 0.0045910116736375154, + "acc_norm": 0.2917374681393373, + "acc_norm_stderr": 0.004684713934059222 + } + }, + "versions": { + "hellaswag_hi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hr_bloom-7b1.json b/evals/hellaswag/hellaswag_hr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2571f200efda69d65fed248bfa1462accaa0e80f --- /dev/null +++ b/evals/hellaswag/hellaswag_hr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hr": { + "acc": 0.27478095640240685, + "acc_stderr": 0.004586771132918674, + "acc_norm": 0.3000105563179563, + "acc_norm_stderr": 0.004708614858618206 + } + }, + "versions": { + "hellaswag_hr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hr_llama-7B.json b/evals/hellaswag/hellaswag_hr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8aa308a99a4d9917300d2b6bca88d4fbd44a07 --- /dev/null +++ b/evals/hellaswag/hellaswag_hr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hr": { + "acc": 0.3393856222949435, + "acc_stderr": 0.004865190903217322, + "acc_norm": 0.41148527393645096, + "acc_norm_stderr": 0.005056324888258699 + } + }, + "versions": { + "hellaswag_hr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hu_bloom-7b1.json b/evals/hellaswag/hellaswag_hu_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb0859d6479ddd7e6caa9ab28436da9061fafe0 --- /dev/null +++ b/evals/hellaswag/hellaswag_hu_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hu": { + "acc": 0.2749780893952673, + "acc_stderr": 0.004673697346652944, + "acc_norm": 0.30127081507449605, + "acc_norm_stderr": 0.004802517407348953 + } + }, + "versions": { + "hellaswag_hu": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hu_llama-7B.json b/evals/hellaswag/hellaswag_hu_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1300419e0e8727d8da4787e4074336d82c6d64 --- /dev/null +++ b/evals/hellaswag/hellaswag_hu_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hu": { + "acc": 0.31879929886064856, + "acc_stderr": 0.004877892181685683, + "acc_norm": 0.3785056967572305, + "acc_norm_stderr": 0.005076808255387223 + } + }, + "versions": { + "hellaswag_hu": 1 + }, 
+ "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hy_bloom-7b1.json b/evals/hellaswag/hellaswag_hy_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7aadfc69e7d37e7a69be9da8de7f6f479daa078 --- /dev/null +++ b/evals/hellaswag/hellaswag_hy_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hy": { + "acc": 0.2517377201112141, + "acc_stderr": 0.00467165233929534, + "acc_norm": 0.2761816496756256, + "acc_norm_stderr": 0.004812620824973181 + } + }, + "versions": { + "hellaswag_hy": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_hy_llama-7B.json b/evals/hellaswag/hellaswag_hy_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..85198baf9a0a8e2dcb229b74cd9c22b5421c95b3 --- /dev/null +++ b/evals/hellaswag/hellaswag_hy_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_hy": { + "acc": 0.2545180722891566, + "acc_stderr": 0.004688644596808388, + "acc_norm": 0.2849860982391103, + "acc_norm_stderr": 0.004858906279128767 + } + }, + "versions": { + "hellaswag_hy": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_id_bloom-7b1.json b/evals/hellaswag/hellaswag_id_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b4bcc31e157c6a9c8fc29d08fd6088001c2a4e2b --- /dev/null +++ b/evals/hellaswag/hellaswag_id_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_id": { + "acc": 0.3894849785407725, + "acc_stderr": 0.005051366474018924, + "acc_norm": 0.49484978540772534, + "acc_norm_stderr": 0.005179195541251435 + } + }, + "versions": { + "hellaswag_id": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_id_llama-7B.json b/evals/hellaswag/hellaswag_id_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..d408a6b8209abf2afa7b33e28f960ce7cf71596b --- /dev/null +++ b/evals/hellaswag/hellaswag_id_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_id": { + "acc": 0.3017167381974249, + "acc_stderr": 0.004754784760510309, + "acc_norm": 0.34431330472103006, + "acc_norm_stderr": 0.004921986658657097 + } + }, + "versions": { + "hellaswag_id": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_it_bloom-7b1.json b/evals/hellaswag/hellaswag_it_bloom-7b1.json new file mode 
100644 index 0000000000000000000000000000000000000000..f071bbb39cf2e6048f33a2ac1444d8d24657c9ab --- /dev/null +++ b/evals/hellaswag/hellaswag_it_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_it": { + "acc": 0.33380465520991953, + "acc_stderr": 0.004918337887582365, + "acc_norm": 0.40765716771807703, + "acc_norm_stderr": 0.005125137013353996 + } + }, + "versions": { + "hellaswag_it": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_it_llama-7B.json b/evals/hellaswag/hellaswag_it_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2698d8e1b02654e67b142631369916d337041789 --- /dev/null +++ b/evals/hellaswag/hellaswag_it_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_it": { + "acc": 0.3975851191123681, + "acc_stderr": 0.0051045551272873, + "acc_norm": 0.5201783966061133, + "acc_norm_stderr": 0.005210879697577827 + } + }, + "versions": { + "hellaswag_it": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_kn_bloom-7b1.json b/evals/hellaswag/hellaswag_kn_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ec110ed487575de37a4630739da2ee9264bd8d08 --- /dev/null +++ b/evals/hellaswag/hellaswag_kn_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_kn": { + "acc": 0.26337169939065674, + "acc_stderr": 0.004679154494054024, + "acc_norm": 0.30275332881967953, + "acc_norm_stderr": 0.004880859653925846 + } + }, + "versions": { + "hellaswag_kn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_kn_llama-7B.json b/evals/hellaswag/hellaswag_kn_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..219c76670fe5ee2040cfa43d6e6360e4684a6fe4 --- /dev/null +++ b/evals/hellaswag/hellaswag_kn_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_kn": { + "acc": 0.25603701196118256, + "acc_stderr": 0.004636450973386679, + "acc_norm": 0.2887610020311442, + "acc_norm_stderr": 0.0048143280788988845 + } + }, + "versions": { + "hellaswag_kn": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ml_bloom-7b1.json b/evals/hellaswag/hellaswag_ml_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a4de930d07f3cb8e48668e5b5f1b53560c0ff7f1 --- /dev/null +++ b/evals/hellaswag/hellaswag_ml_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ml": { + "acc": 0.25444979290272024, + "acc_stderr": 0.004608558887983242, + "acc_norm": 0.2878092466136796, + "acc_norm_stderr": 0.004790448543019756 + } + }, + "versions": { + 
"hellaswag_ml": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ml_llama-7B.json b/evals/hellaswag/hellaswag_ml_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..d0fff179c59dc9c44b1a6de207bcba30d72726a7 --- /dev/null +++ b/evals/hellaswag/hellaswag_ml_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ml": { + "acc": 0.2510914586365163, + "acc_stderr": 0.004588344357712618, + "acc_norm": 0.2890406358446211, + "acc_norm_stderr": 0.004796533523475371 + } + }, + "versions": { + "hellaswag_ml": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_mr_bloom-7b1.json b/evals/hellaswag/hellaswag_mr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5768dcee263277655dc8087f17858a884c937b53 --- /dev/null +++ b/evals/hellaswag/hellaswag_mr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_mr": { + "acc": 0.2701799762905486, + "acc_stderr": 0.004610067484763786, + "acc_norm": 0.3100549628192693, + "acc_norm_stderr": 0.004801748474056546 + } + }, + "versions": { + "hellaswag_mr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_mr_llama-7B.json b/evals/hellaswag/hellaswag_mr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6c3e2cc455a43fee3f289e2eab0831003b552a30 --- /dev/null +++ b/evals/hellaswag/hellaswag_mr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_mr": { + "acc": 0.2592951826705464, + "acc_stderr": 0.004549803334314971, + "acc_norm": 0.2879620648776808, + "acc_norm_stderr": 0.004701019162604622 + } + }, + "versions": { + "hellaswag_mr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ne_bloom-7b1.json b/evals/hellaswag/hellaswag_ne_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3b95e1d5f31b1e69f29c233339889469700c84bd --- /dev/null +++ b/evals/hellaswag/hellaswag_ne_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ne": { + "acc": 0.27441511053874224, + "acc_stderr": 0.004622852940386713, + "acc_norm": 0.30897188237819273, + "acc_norm_stderr": 0.004787064632332303 + } + }, + "versions": { + "hellaswag_ne": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ne_llama-7B.json b/evals/hellaswag/hellaswag_ne_llama-7B.json new file mode 100644 
index 0000000000000000000000000000000000000000..8c4989d19a23d592896ca0b4e6fded1f62cc01f3 --- /dev/null +++ b/evals/hellaswag/hellaswag_ne_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ne": { + "acc": 0.264112470487229, + "acc_stderr": 0.004567327225923831, + "acc_norm": 0.28171281390856406, + "acc_norm_stderr": 0.00466030469849661 + } + }, + "versions": { + "hellaswag_ne": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_nl_bloom-7b1.json b/evals/hellaswag/hellaswag_nl_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..599727af50ee8ce9a291e94ceb4f493ad958f009 --- /dev/null +++ b/evals/hellaswag/hellaswag_nl_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_nl": { + "acc": 0.28667026443604965, + "acc_stderr": 0.004698261813459453, + "acc_norm": 0.3172153264975715, + "acc_norm_stderr": 0.004835258421184045 + } + }, + "versions": { + "hellaswag_nl": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_nl_llama-7B.json b/evals/hellaswag/hellaswag_nl_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f41371bf4d5b2933a8b16a029b69b62675604b1f --- /dev/null +++ b/evals/hellaswag/hellaswag_nl_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_nl": { + "acc": 0.38117850205050724, + "acc_stderr": 0.0050457320519523, + "acc_norm": 0.48748111374919056, + "acc_norm_stderr": 0.00519291390537233 + } + }, + "versions": { + "hellaswag_nl": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_pt_bloom-7b1.json b/evals/hellaswag/hellaswag_pt_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5050ad2ec66e4750cc93be5c7e0c4c942051e7a9 --- /dev/null +++ b/evals/hellaswag/hellaswag_pt_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_pt": { + "acc": 0.4227977028930545, + "acc_stderr": 0.005142526543466809, + "acc_norm": 0.5511973128182902, + "acc_norm_stderr": 0.005177587858629525 + } + }, + "versions": { + "hellaswag_pt": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_pt_llama-7B.json b/evals/hellaswag/hellaswag_pt_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7ec9536f323c0aa592fcadb1d9e1333cd323941d --- /dev/null +++ b/evals/hellaswag/hellaswag_pt_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_pt": { + "acc": 0.4037273810813739, + "acc_stderr": 0.005107551363682552, + "acc_norm": 0.532343699209015, + "acc_norm_stderr": 0.005194044440586472 + } + }, + "versions": { + "hellaswag_pt": 1 + }, + 
"config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ro_bloom-7b1.json b/evals/hellaswag/hellaswag_ro_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..dafe7356bdb6ae258020ac1efcc6169d4f31dd20 --- /dev/null +++ b/evals/hellaswag/hellaswag_ro_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ro": { + "acc": 0.2795024337479719, + "acc_stderr": 0.00466744369483023, + "acc_norm": 0.3182260681449432, + "acc_norm_stderr": 0.004844601996973363 + } + }, + "versions": { + "hellaswag_ro": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ro_llama-7B.json b/evals/hellaswag/hellaswag_ro_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..03cce6eee60bd007c3835cca157c9f654b0774a7 --- /dev/null +++ b/evals/hellaswag/hellaswag_ro_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ro": { + "acc": 0.36041103299080585, + "acc_stderr": 0.004993666697380137, + "acc_norm": 0.4491076257436452, + "acc_norm_stderr": 0.005173430588992903 + } + }, + "versions": { + "hellaswag_ro": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ru_bloom-7b1.json b/evals/hellaswag/hellaswag_ru_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..a1114c4bc91539820ff9a813a92206eb0b0aaf89 --- /dev/null +++ b/evals/hellaswag/hellaswag_ru_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ru": { + "acc": 0.2975625539257981, + "acc_stderr": 0.004748207348707273, + "acc_norm": 0.32538826574633306, + "acc_norm_stderr": 0.004865915900810558 + } + }, + "versions": { + "hellaswag_ru": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ru_llama-7B.json b/evals/hellaswag/hellaswag_ru_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..9da4ad4e94c2effcad5429b563495f832b369727 --- /dev/null +++ b/evals/hellaswag/hellaswag_ru_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ru": { + "acc": 0.370685936151855, + "acc_stderr": 0.005016184279255606, + "acc_norm": 0.4568593615185505, + "acc_norm_stderr": 0.005173496063169706 + } + }, + "versions": { + "hellaswag_ru": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sk_bloom-7b1.json b/evals/hellaswag/hellaswag_sk_bloom-7b1.json new file mode 100644 
index 0000000000000000000000000000000000000000..a452682d669ca439c37ef65351b2482280cb6a25 --- /dev/null +++ b/evals/hellaswag/hellaswag_sk_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sk": { + "acc": 0.27053241960991037, + "acc_stderr": 0.004561596675422169, + "acc_norm": 0.2981549815498155, + "acc_norm_stderr": 0.004697273773957717 + } + }, + "versions": { + "hellaswag_sk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sk_llama-7B.json b/evals/hellaswag/hellaswag_sk_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7720fc7912fd16392b3c8ddc4e66fd5530405fce --- /dev/null +++ b/evals/hellaswag/hellaswag_sk_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sk": { + "acc": 0.30173958882445967, + "acc_stderr": 0.004713343422332119, + "acc_norm": 0.35888244596731683, + "acc_norm_stderr": 0.004925486913523139 + } + }, + "versions": { + "hellaswag_sk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sr_bloom-7b1.json b/evals/hellaswag/hellaswag_sr_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4dc8c27d8e4e36fa4c1a0c3d9a5431716e620d --- /dev/null +++ b/evals/hellaswag/hellaswag_sr_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sr": { + "acc": 0.27748968144777225, + "acc_stderr": 0.004606546970716383, + "acc_norm": 0.29855011112287017, + "acc_norm_stderr": 0.004708005935082949 + } + }, + "versions": { + "hellaswag_sr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sr_llama-7B.json b/evals/hellaswag/hellaswag_sr_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..05dc0fdc8921cb49fe2182f475f6d81e20eb5990 --- /dev/null +++ b/evals/hellaswag/hellaswag_sr_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sr": { + "acc": 0.3437400783151656, + "acc_stderr": 0.004886333271945336, + "acc_norm": 0.41147211345115886, + "acc_norm_stderr": 0.005062718548853834 + } + }, + "versions": { + "hellaswag_sr": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sv_bloom-7b1.json b/evals/hellaswag/hellaswag_sv_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4ebba6534a5e9e09a423a4422dbf2aae81a1bc02 --- /dev/null +++ b/evals/hellaswag/hellaswag_sv_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sv": { + "acc": 0.27647445735584303, + "acc_stderr": 0.0046830976447929905, + "acc_norm": 0.3101293575970182, + "acc_norm_stderr": 0.0048432182915872585 + } + }, + "versions": { + 
"hellaswag_sv": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_sv_llama-7B.json b/evals/hellaswag/hellaswag_sv_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..ee471bcb53bf2f1459136089da2f6e7ae0cdafd1 --- /dev/null +++ b/evals/hellaswag/hellaswag_sv_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_sv": { + "acc": 0.3857706643279982, + "acc_stderr": 0.005096929762325147, + "acc_norm": 0.5051523788642841, + "acc_norm_stderr": 0.005235108858635741 + } + }, + "versions": { + "hellaswag_sv": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ta_bloom-7b1.json b/evals/hellaswag/hellaswag_ta_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..584724a6119d2433aab6e11c1971faf29ca9ce8f --- /dev/null +++ b/evals/hellaswag/hellaswag_ta_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ta": { + "acc": 0.2588850588375134, + "acc_stderr": 0.004775805657688067, + "acc_norm": 0.29406870319743256, + "acc_norm_stderr": 0.0049677071891109335 + } + }, + "versions": { + "hellaswag_ta": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_ta_llama-7B.json b/evals/hellaswag/hellaswag_ta_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2d69d8dc8c743704b031d2ef3894db3a70bb4c9a --- /dev/null +++ b/evals/hellaswag/hellaswag_ta_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_ta": { + "acc": 0.25329846665874245, + "acc_stderr": 0.004741766564082548, + "acc_norm": 0.28313324616664687, + "acc_norm_stderr": 0.004912075369610396 + } + }, + "versions": { + "hellaswag_ta": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_te_bloom-7b1.json b/evals/hellaswag/hellaswag_te_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..5052ea04c62ee79955014621885295121b68fc76 --- /dev/null +++ b/evals/hellaswag/hellaswag_te_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_te": { + "acc": 0.26123337918386064, + "acc_stderr": 0.00470365034659896, + "acc_norm": 0.2922971114167813, + "acc_norm_stderr": 0.004869729181749992 + } + }, + "versions": { + "hellaswag_te": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_te_llama-7B.json b/evals/hellaswag/hellaswag_te_llama-7B.json new file mode 
100644 index 0000000000000000000000000000000000000000..7bce32700aa0c1c9e176ddae4994c8d3a2b22f3b --- /dev/null +++ b/evals/hellaswag/hellaswag_te_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_te": { + "acc": 0.25767996331957815, + "acc_stderr": 0.0046827716491321504, + "acc_norm": 0.28931682714351215, + "acc_norm_stderr": 0.004855030101325898 + } + }, + "versions": { + "hellaswag_te": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_uk_bloom-7b1.json b/evals/hellaswag/hellaswag_uk_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cd933afdab71857ec060d16116e0f044d23e7a50 --- /dev/null +++ b/evals/hellaswag/hellaswag_uk_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_uk": { + "acc": 0.2781379530237007, + "acc_stderr": 0.004619644722138738, + "acc_norm": 0.30035072802635776, + "acc_norm_stderr": 0.004726132393644123 + } + }, + "versions": { + "hellaswag_uk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_uk_llama-7B.json b/evals/hellaswag/hellaswag_uk_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..545af16e16507026332c8c8c7836ef6d20ccae00 --- /dev/null +++ b/evals/hellaswag/hellaswag_uk_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_uk": { + "acc": 0.3544720628850648, + "acc_stderr": 0.0049304266046324334, + "acc_norm": 0.4412577012959422, + "acc_norm_stderr": 0.005117854029524533 + } + }, + "versions": { + "hellaswag_uk": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_vi_bloom-7b1.json b/evals/hellaswag/hellaswag_vi_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..686132db373135123f1b9720642bfd294f99f328 --- /dev/null +++ b/evals/hellaswag/hellaswag_vi_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_vi": { + "acc": 0.3836498581095831, + "acc_stderr": 0.0050805394682356675, + "acc_norm": 0.4827548570181183, + "acc_norm_stderr": 0.005220836527919318 + } + }, + "versions": { + "hellaswag_vi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_vi_llama-7B.json b/evals/hellaswag/hellaswag_vi_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..816307d9258b275603ae30ffb36851a8b3475dd9 --- /dev/null +++ b/evals/hellaswag/hellaswag_vi_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_vi": { + "acc": 0.27865094957432873, + "acc_stderr": 0.004684158200782215, + "acc_norm": 0.31608819035145164, + "acc_norm_stderr": 0.0048577229826674215 + } + }, + "versions": { + 
"hellaswag_vi": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_zh_bloom-7b1.json b/evals/hellaswag/hellaswag_zh_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..30ac380919e1d6d2c44c46e941ae3dc9929982e1 --- /dev/null +++ b/evals/hellaswag/hellaswag_zh_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_zh": { + "acc": 0.38851715950787824, + "acc_stderr": 0.005063776486157121, + "acc_norm": 0.5115475933520397, + "acc_norm_stderr": 0.005193156826942953 + } + }, + "versions": { + "hellaswag_zh": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/hellaswag/hellaswag_zh_llama-7B.json b/evals/hellaswag/hellaswag_zh_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d393a5879535e46dfb92d3361d469fc71f97b7 --- /dev/null +++ b/evals/hellaswag/hellaswag_zh_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "hellaswag_zh": { + "acc": 0.32358653431160983, + "acc_stderr": 0.004859949552176753, + "acc_norm": 0.3945835131635736, + "acc_norm_stderr": 0.0050772319918162435 + } + }, + "versions": { + "hellaswag_zh": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ar-llama-7B.json b/evals/mmlu/mmlu_ar-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f601d0a0a213c652ffd5519a7454ba2a537af3fc --- /dev/null +++ b/evals/mmlu/mmlu_ar-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ar": { + "acc": 0.2589727722772277, + "acc_stderr": 0.0038529667515366556, + "acc_norm": 0.2797803217821782, + "acc_norm_stderr": 0.003948136869379606 + } + }, + "versions": { + "mmlu_ar": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_bn-llama-7B.json b/evals/mmlu/mmlu_bn-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..da3322aaf303ad70cf3667aba1a4d73764af5fdc --- /dev/null +++ b/evals/mmlu/mmlu_bn-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_bn": { + "acc": 0.2501022327635561, + "acc_stderr": 0.0039166757490002955, + "acc_norm": 0.28461601374008344, + "acc_norm_stderr": 0.0040809105667388166 + } + }, + "versions": { + "mmlu_bn": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ca-llama-7B.json b/evals/mmlu/mmlu_ca-llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..5183b4df5346ae0e0aa74c3166323602507c4598 --- /dev/null +++ b/evals/mmlu/mmlu_ca-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ca": { + "acc": 0.3038917604134995, + "acc_stderr": 0.004010074337091965, + "acc_norm": 0.3022955305564001, + "acc_norm_stderr": 0.004004111747979521 + } + }, + "versions": { + "mmlu_ca": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_da-llama-7B.json b/evals/mmlu/mmlu_da-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f4957b8b53a4880a0eac49ccabcab4a8c6a584c2 --- /dev/null +++ b/evals/mmlu/mmlu_da-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_da": { + "acc": 0.2997122520066636, + "acc_stderr": 0.003986771176689293, + "acc_norm": 0.2995608056943813, + "acc_norm_stderr": 0.003986194743561357 + } + }, + "versions": { + "mmlu_da": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_eu-llama-7B.json b/evals/mmlu/mmlu_eu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf5d4151c1d0c86b7232d6cbc1cc4623fafce36 --- /dev/null +++ b/evals/mmlu/mmlu_eu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_eu": { + "acc": 0.2668954809185258, + "acc_stderr": 0.003998838127920185, + "acc_norm": 0.27923510664378526, + "acc_norm_stderr": 0.00405566512057356 + } + }, + "versions": { + "mmlu_eu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_fr-llama-7B.json b/evals/mmlu/mmlu_fr-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..e22bb03037c1bf7eebd47d64ccb10e43eca00210 --- /dev/null +++ b/evals/mmlu/mmlu_fr-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_fr": { + "acc": 0.318997784737606, + "acc_stderr": 0.004073786574740586, + "acc_norm": 0.3054006569398824, + "acc_norm_stderr": 0.00402561598834305 + } + }, + "versions": { + "mmlu_fr": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_gu-llama-7B.json b/evals/mmlu/mmlu_gu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2236b1f5ac01a2de4772fb6fde41222398119985 --- /dev/null +++ b/evals/mmlu/mmlu_gu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_gu": { + "acc": 0.24391920928233776, + "acc_stderr": 0.003981461991912142, + "acc_norm": 0.27382896433175763, + "acc_norm_stderr": 0.0041342298983896774 + } + }, + "versions": { + "mmlu_gu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + 
"no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hi-llama-7B.json b/evals/mmlu/mmlu_hi-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c9d981a7d61e96d94d6c128b4ccfc3f3b0f0e6 --- /dev/null +++ b/evals/mmlu/mmlu_hi-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hi": { + "acc": 0.2549650237195465, + "acc_stderr": 0.003908303467263245, + "acc_norm": 0.27860416499155743, + "acc_norm_stderr": 0.0040201315154066415 + } + }, + "versions": { + "mmlu_hi": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hr-llama-7B.json b/evals/mmlu/mmlu_hr-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f5ca1c97a96e3d94fd3ae5c2603632e633b975 --- /dev/null +++ b/evals/mmlu/mmlu_hr-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hr": { + "acc": 0.294721630666261, + "acc_stderr": 0.003976243355939721, + "acc_norm": 0.2931244295710374, + "acc_norm_stderr": 0.003969942004520753 + } + }, + "versions": { + "mmlu_hr": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hu-llama-7B.json b/evals/mmlu/mmlu_hu-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b74a19de5e6654aef46cf40427dc362a330fa08e --- /dev/null +++ b/evals/mmlu/mmlu_hu-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hu": { + "acc": 0.27794840294840295, + "acc_stderr": 0.0039256419656824035, + "acc_norm": 0.29000307125307123, + "acc_norm_stderr": 0.0039762530331634354 + } + }, + "versions": { + "mmlu_hu": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_hy-llama-7B.json b/evals/mmlu/mmlu_hy-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c10ca85321ddad4c7be01b48cec4e49a1e214777 --- /dev/null +++ b/evals/mmlu/mmlu_hy-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_hy": { + "acc": 0.24800293820585806, + "acc_stderr": 0.004138305469907604, + "acc_norm": 0.2746304287944174, + "acc_norm_stderr": 0.004277007917763834 + } + }, + "versions": { + "mmlu_hy": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_id-llama-7B.json b/evals/mmlu/mmlu_id-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b6135824ebca4f3650da00511c33cc4a21bfb152 --- /dev/null +++ b/evals/mmlu/mmlu_id-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_id": { + "acc": 0.2795969773299748, + "acc_stderr": 0.003921194198043396, + "acc_norm": 0.2895962140294634, 
+ "acc_norm_stderr": 0.003962902849695825 + } + }, + "versions": { + "mmlu_id": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_kn-llama-7B.json b/evals/mmlu/mmlu_kn-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..606fb0050e37b38e833800c8c6787674d6157cca --- /dev/null +++ b/evals/mmlu/mmlu_kn-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_kn": { + "acc": 0.23933209647495363, + "acc_stderr": 0.004010635314254899, + "acc_norm": 0.27096033218482196, + "acc_norm_stderr": 0.004177761014860752 + } + }, + "versions": { + "mmlu_kn": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/mmlu/mmlu_ml-llama-7B.json b/evals/mmlu/mmlu_ml-llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1dc1ffa8a7a5300db7121f13be137d91ddd33088 --- /dev/null +++ b/evals/mmlu/mmlu_ml-llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "mmlu_ml": { + "acc": 0.24492201668480232, + "acc_stderr": 0.0040952567017621564, + "acc_norm": 0.27529923830250275, + "acc_norm_stderr": 0.004253566006101179 + } + }, + "versions": { + "mmlu_ml": 0 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": 1, + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..d572bf57654e75d51e028e16a79aa73942dadca1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ar_mc": { + "mc1": 0.2596899224806202, + "mc1_stderr": 0.01577046983489191, + "mc2": 0.4250856388236661, + "mc2_stderr": 0.01572683307613003 + } + }, + "versions": { + "truthfulqa_ar_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5f817545d204b5083023e5456ee8029ce2191005 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ar_mc": { + "mc1": 0.2777777777777778, + "mc1_stderr": 0.016109958670672858, + "mc2": 0.4504998624708924, + "mc2_stderr": 0.01620052408197046 + } + }, + "versions": { + "truthfulqa_ar_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9370c174001acd0fca0cddf24e9076e303b9a18d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_bn_mc": { + "mc1": 0.26548672566371684, + "mc1_stderr": 0.015711139487640472, + "mc2": 0.4852587344144857, + "mc2_stderr": 0.01612406516233488 + } + }, + "versions": { + "truthfulqa_bn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..16e9590be5e353f400674681f4f4e162bad08d5f --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_bn_mc": { + "mc1": 0.27939317319848295, + "mc1_stderr": 0.015964066769100945, + "mc2": 0.513392699496713, + "mc2_stderr": 0.016700880970144227 + } + }, + "versions": { + "truthfulqa_bn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..11285119043f95ac0d376ad5c3e9afaeb0e2d7e9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ca_mc": { + "mc1": 0.24261874197689345, + "mc1_stderr": 0.01536843525152329, + "mc2": 0.39989771937446994, + "mc2_stderr": 0.015246797370718152 + } + }, + "versions": { + "truthfulqa_ca_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6e11c0a02074e790f1099cbbeb59e13a69f2e1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ca_mc": { + "mc1": 0.2336328626444159, + "mc1_stderr": 0.015170350095728855, + "mc2": 0.388488309525287, + "mc2_stderr": 0.015026705835089502 + } + }, + "versions": { + "truthfulqa_ca_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e55ee209ca0f7da10707018a73476230d0beb314 --- /dev/null +++ 
b/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_da_mc": { + "mc1": 0.26248399487836105, + "mc1_stderr": 0.015753963575796108, + "mc2": 0.4375025988127948, + "mc2_stderr": 0.01662443223981383 + } + }, + "versions": { + "truthfulqa_da_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1b7cb2557be3886ead061adba89f89d50eefb9dd --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_da_mc": { + "mc1": 0.2573623559539053, + "mc1_stderr": 0.01565358047400349, + "mc2": 0.4161317873775416, + "mc2_stderr": 0.015138516880476799 + } + }, + "versions": { + "truthfulqa_da_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..f9009861966dc1cff1e1868b91e2bb41bfccd0f4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_de_mc": { + "mc1": 0.24746192893401014, + "mc1_stderr": 0.015382646812261827, + "mc2": 0.4351673407370902, + "mc2_stderr": 0.015914493454090475 + } + }, + "versions": { + "truthfulqa_de_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..37147ee36d47e8dd84509b2c477c0c4563f0a7c9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_de_mc": { + "mc1": 0.233502538071066, + "mc1_stderr": 0.015080432502225447, + "mc2": 0.383224305558326, + "mc2_stderr": 0.014662714095686993 + } + }, + "versions": { + "truthfulqa_de_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..c983b9fd981831059a19411e2f854761bb466743 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_es_mc": { + "mc1": 0.2468354430379747, + "mc1_stderr": 0.01535006418032032, + "mc2": 0.40446379335454147, + "mc2_stderr": 0.01462209461275691 + } + }, + 
"versions": { + "truthfulqa_es_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..ded6c86f6861c4d0dc091db262fe1d2a25208804 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_es_mc": { + "mc1": 0.22658227848101264, + "mc1_stderr": 0.014903268563982738, + "mc2": 0.37120532090630015, + "mc2_stderr": 0.014441690126415349 + } + }, + "versions": { + "truthfulqa_es_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..52f4939ac5fa964406f4eecce983e80178660657 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_eu_mc": { + "mc1": 0.26214833759590794, + "mc1_stderr": 0.015737384911607682, + "mc2": 0.4464332201206485, + "mc2_stderr": 0.01621754992783137 + } + }, + "versions": { + "truthfulqa_eu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2591b2575e316599868892fc6541e53cca27f1eb --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_eu_mc": { + "mc1": 0.22762148337595908, + "mc1_stderr": 0.01500362498587022, + "mc2": 0.4077400427662786, + "mc2_stderr": 0.01655029094183041 + } + }, + "versions": { + "truthfulqa_eu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..74d3041ce242f33429dfa1dec98c70a446ad3459 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_fr_mc": { + "mc1": 0.2598225602027883, + "mc1_stderr": 0.015622237721822354, + "mc2": 0.40857191925599595, + "mc2_stderr": 0.01474266494761903 + } + }, + "versions": { + "truthfulqa_fr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, 
+ "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..800ad2a78b80c2eb4974ba18bc90689969705247 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_fr_mc": { + "mc1": 0.23827629911280102, + "mc1_stderr": 0.015176654543722067, + "mc2": 0.39924075017495203, + "mc2_stderr": 0.014258162205908845 + } + }, + "versions": { + "truthfulqa_fr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..64f963ad419e8b93cc4134accc25685a3b6c7973 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_gu_mc": { + "mc1": 0.2572944297082228, + "mc1_stderr": 0.015930376662111265, + "mc2": 0.4550226506739247, + "mc2_stderr": 0.016990336661822224 + } + }, + "versions": { + "truthfulqa_gu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..c069c02eb514218d456bb1424dd8cfe77f48a1ab --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_gu_mc": { + "mc1": 0.2572944297082228, + "mc1_stderr": 0.015930376662111265, + "mc2": 0.42704504017782213, + "mc2_stderr": 0.017012444121235887 + } + }, + "versions": { + "truthfulqa_gu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..8962a71a352d9b104821eb68a25a8785186a6f80 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hi_mc": { + "mc1": 0.26153846153846155, + "mc1_stderr": 0.0157457370262172, + "mc2": 0.4459427734456273, + "mc2_stderr": 0.015816895972907637 + } + }, + "versions": { + "truthfulqa_hi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json new file mode 100644 index 
0000000000000000000000000000000000000000..2f7c57699fb99f36e65e991419808a451e65b58d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hi_mc": { + "mc1": 0.28076923076923077, + "mc1_stderr": 0.016100529409585174, + "mc2": 0.47439648196687334, + "mc2_stderr": 0.016645149126511907 + } + }, + "versions": { + "truthfulqa_hi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..314546568b9f50af4248c3961474c8f4e4d3b021 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hr_mc": { + "mc1": 0.2805194805194805, + "mc1_stderr": 0.01620047927370478, + "mc2": 0.4799867976765054, + "mc2_stderr": 0.016630823388575047 + } + }, + "versions": { + "truthfulqa_hr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..a89b4ca336f2e469df36faf9e3b8bae78e238226 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hr_mc": { + "mc1": 0.24285714285714285, + "mc1_stderr": 0.015463264535393416, + "mc2": 0.4178069276061212, + "mc2_stderr": 0.015457117904740929 + } + }, + "versions": { + "truthfulqa_hr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..f0063c59598d9ace87e37889c678b775e7685f4e --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hu_mc": { + "mc1": 0.2664941785252264, + "mc1_stderr": 0.01591244793052595, + "mc2": 0.5012245769743321, + "mc2_stderr": 0.017012659134722635 + } + }, + "versions": { + "truthfulqa_hu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..8186b5b669612791a673c1748562011f0fa91aec --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hu_mc": { + "mc1": 0.24579560155239327, + "mc1_stderr": 
0.01549611867708382, + "mc2": 0.432092949382587, + "mc2_stderr": 0.015533288486024798 + } + }, + "versions": { + "truthfulqa_hu_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ddde03654d791b6a3794476cfb89b83c5ef45e53 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hy_mc": { + "mc1": 0.2629032258064516, + "mc1_stderr": 0.017693546356249937, + "mc2": 0.4681902443615651, + "mc2_stderr": 0.019292338415181538 + } + }, + "versions": { + "truthfulqa_hy_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..f5ca203decb570b2e7314edadae5c00a4adfc62c --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_hy_mc": { + "mc1": 0.2564516129032258, + "mc1_stderr": 0.017551409976203195, + "mc2": 0.46436602760838236, + "mc2_stderr": 0.018999233967880117 + } + }, + "versions": { + "truthfulqa_hy_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5c70280232e984fefca1f1a8cfe4a29409de1c8 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_id_mc": { + "mc1": 0.25288831835686776, + "mc1_stderr": 0.015583584105316878, + "mc2": 0.4035395580966099, + "mc2_stderr": 0.015018121460072335 + } + }, + "versions": { + "truthfulqa_id_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6dfd743ea67805acd941cfb18f2d6362f9880f82 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_id_mc": { + "mc1": 0.25673940949935814, + "mc1_stderr": 0.015661271683095182, + "mc2": 0.39766031480749814, + "mc2_stderr": 0.015508891980724996 + } + }, + "versions": { + "truthfulqa_id_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + 
"batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..e83a75ef58f484e4f28d9b48fd6106931bcd7a26 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_it_mc": { + "mc1": 0.2697201017811705, + "mc1_stderr": 0.015840413061442026, + "mc2": 0.4389841648203799, + "mc2_stderr": 0.015926853851979495 + } + }, + "versions": { + "truthfulqa_it_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..b9f0f156188649c4a7542e6d6f2ba9b37c457655 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_it_mc": { + "mc1": 0.24427480916030533, + "mc1_stderr": 0.015335094706043257, + "mc2": 0.39785622787135533, + "mc2_stderr": 0.014810294602470058 + } + }, + "versions": { + "truthfulqa_it_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe9dd96dcdc93bdacfb696ab94a78b3f7f7a246 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_kn_mc": { + "mc1": 0.28792134831460675, + "mc1_stderr": 0.0169811116006733, + "mc2": 0.4971377207989088, + "mc2_stderr": 0.0171981853340177 + } + }, + "versions": { + "truthfulqa_kn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..993e1c25914137bc34c8316cf67a25cc17ab83c4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_kn_mc": { + "mc1": 0.27808988764044945, + "mc1_stderr": 0.01680348492221316, + "mc2": 0.46974001502290064, + "mc2_stderr": 0.017840960060966953 + } + }, + "versions": { + "truthfulqa_kn_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json 
b/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..24914faf5345d35faa0a1b782d6c784b3edd07d6 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ml_mc": { + "mc1": 0.25831202046035806, + "mc1_stderr": 0.01566236755478916, + "mc2": 0.4909574719052267, + "mc2_stderr": 0.016823307128975565 + } + }, + "versions": { + "truthfulqa_ml_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0a3806514d04876fc28bdf3370af03eacd615826 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ml_mc": { + "mc1": 0.2749360613810742, + "mc1_stderr": 0.015976383961112832, + "mc2": 0.5095091855665959, + "mc2_stderr": 0.016954647599861927 + } + }, + "versions": { + "truthfulqa_ml_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf87faf0ec1093a1588fff90b3e24e1f69710c9d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_mr_mc": { + "mc1": 0.2753807106598985, + "mc1_stderr": 0.015923346195889237, + "mc2": 0.47635177057868366, + "mc2_stderr": 0.016517346765693778 + } + }, + "versions": { + "truthfulqa_mr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..0ddcccf963e412f25a53de4c98f439abf7a25388 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_mr_mc": { + "mc1": 0.28553299492385786, + "mc1_stderr": 0.01610022231189975, + "mc2": 0.4895379243686521, + "mc2_stderr": 0.016741018968357894 + } + }, + "versions": { + "truthfulqa_mr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..90e378f71e4638bf2da69d765a15b05858d6e2b9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + 
"results": { + "truthfulqa_ne_mc": { + "mc1": 0.2880710659898477, + "mc1_stderr": 0.016142870973426694, + "mc2": 0.467435004054711, + "mc2_stderr": 0.016544742019032287 + } + }, + "versions": { + "truthfulqa_ne_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..547c3b78ee0caef9b096972901f0b3d40c939029 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ne_mc": { + "mc1": 0.2906091370558376, + "mc1_stderr": 0.016184901529011933, + "mc2": 0.466774725144191, + "mc2_stderr": 0.01677791483100084 + } + }, + "versions": { + "truthfulqa_ne_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce8ddbd63d98848d347aa0302b9cbaccb48cbd3 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_nl_mc": { + "mc1": 0.25477707006369427, + "mc1_stderr": 0.015561993973145626, + "mc2": 0.4267767591847509, + "mc2_stderr": 0.016186878668566853 + } + }, + "versions": { + "truthfulqa_nl_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..22e9d2c488076c5884e9224d8636a092fac4fe96 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_nl_mc": { + "mc1": 0.24331210191082803, + "mc1_stderr": 0.015324355488601159, + "mc2": 0.40023342153314706, + "mc2_stderr": 0.014679036703865578 + } + }, + "versions": { + "truthfulqa_nl_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..b684b021a3805f5bd343cabdea341eecc0435e00 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_pt_mc": { + "mc1": 0.23857868020304568, + "mc1_stderr": 0.015192910034567015, + "mc2": 0.38894722340741383, + "mc2_stderr": 0.014531269277587647 + } + }, + "versions": { + "truthfulqa_pt_mc": 1 + }, + "config": { + "model": "hf-auto", 
+ "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..7084df35e971145794041e3080344faabab95729 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_pt_mc": { + "mc1": 0.22842639593908629, + "mc1_stderr": 0.014964922033138024, + "mc2": 0.3823261607330551, + "mc2_stderr": 0.01463319398314419 + } + }, + "versions": { + "truthfulqa_pt_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..af2110ac326a3065b94fa267cded253cc069b3e0 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ro_mc": { + "mc1": 0.2608695652173913, + "mc1_stderr": 0.015712552179082358, + "mc2": 0.46132785760214634, + "mc2_stderr": 0.016284566824666485 + } + }, + "versions": { + "truthfulqa_ro_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7ed655b7f61f10c4accbd13f5b9fc293536300 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ro_mc": { + "mc1": 0.22762148337595908, + "mc1_stderr": 0.015003624985870205, + "mc2": 0.37160168017693795, + "mc2_stderr": 0.015014785650167688 + } + }, + "versions": { + "truthfulqa_ro_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..d15e5341b01a6e2876ffb863286387d4dcc69456 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ru_mc": { + "mc1": 0.30632911392405066, + "mc1_stderr": 0.016410898874958186, + "mc2": 0.49751656068823824, + "mc2_stderr": 0.016150279946055047 + } + }, + "versions": { + "truthfulqa_ru_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..2036782896e35aee07acce858c408720bcb3b9b9 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ru_mc": { + "mc1": 0.24556962025316456, + "mc1_stderr": 0.015323515145952671, + "mc2": 0.40851860840920967, + "mc2_stderr": 0.015225752517489843 + } + }, + "versions": { + "truthfulqa_ru_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb50aa50589d9959d2accbb09d2d099246f74e5 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sk_mc": { + "mc1": 0.23846153846153847, + "mc1_stderr": 0.015268148070057835, + "mc2": 0.4379856829317774, + "mc2_stderr": 0.016560323561497736 + } + }, + "versions": { + "truthfulqa_sk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..13785fc105b2964d3bcf70bb68daf0ddc0ccdbfd --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sk_mc": { + "mc1": 0.22692307692307692, + "mc1_stderr": 0.01500658794494848, + "mc2": 0.40846796746265707, + "mc2_stderr": 0.015828756550364212 + } + }, + "versions": { + "truthfulqa_sk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..7a9be337308c1b4de36187d0139341115ab5acc1 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sr_mc": { + "mc1": 0.2875318066157761, + "mc1_stderr": 0.016154400981864346, + "mc2": 0.4611856949025646, + "mc2_stderr": 0.01648960635223338 + } + }, + "versions": { + "truthfulqa_sr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=bigscience/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..3a70158ad0bf874c11233369e2b8b2fbd08bb508 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json @@ -0,0 +1,23 @@ 
+{ + "results": { + "truthfulqa_sr_mc": { + "mc1": 0.2684478371501272, + "mc1_stderr": 0.015816769133859612, + "mc2": 0.42343608663478216, + "mc2_stderr": 0.015372831241353751 + } + }, + "versions": { + "truthfulqa_sr_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..9885cf6375b817aa059b00ca8a5df86a2f6bbce4 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sv_mc": { + "mc1": 0.2622739018087855, + "mc1_stderr": 0.015821052272364522, + "mc2": 0.4457248931967088, + "mc2_stderr": 0.016517364176123605 + } + }, + "versions": { + "truthfulqa_sv_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..1665d4f2e88a870557fd94395d1d54f58919d85c --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_sv_mc": { + "mc1": 0.2596899224806202, + "mc1_stderr": 0.015770469834891904, + "mc2": 0.40528913702963154, + "mc2_stderr": 0.015006798915735541 + } + }, + "versions": { + "truthfulqa_sv_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..4d2164cc879ad161ad6563ad86aee4884b45ea32 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ta_mc": { + "mc1": 0.26015228426395937, + "mc1_stderr": 0.015638591095633272, + "mc2": 0.4828328722219756, + "mc2_stderr": 0.01641270817636116 + } + }, + "versions": { + "truthfulqa_ta_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..fee0b1146f0fd8e72ac72b5e05a85a9d0c18afcb --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_ta_mc": { + "mc1": 0.27411167512690354, + "mc1_stderr": 0.015900519226497174, + "mc2": 0.5027478455482438, + "mc2_stderr": 0.016693455124890125 + } + }, + "versions": { + "truthfulqa_ta_mc": 1 + }, + "config": { + "model": 
"hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..cb186a4cf39dc7c369f4adcb4c21742a3bb8d875 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_te_mc": { + "mc1": 0.2646276595744681, + "mc1_stderr": 0.016097235388949582, + "mc2": 0.4761751419934964, + "mc2_stderr": 0.01699481972514669 + } + }, + "versions": { + "truthfulqa_te_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..6a27e1784964b5486d2d2aeb7d5418ef3fbc892d --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_te_mc": { + "mc1": 0.2898936170212766, + "mc1_stderr": 0.016556215331027437, + "mc2": 0.4950446673992078, + "mc2_stderr": 0.017314129921675917 + } + }, + "versions": { + "truthfulqa_te_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..2a55f54ab6ab50194bfa1058aacbecc18b36d6e7 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_uk_mc": { + "mc1": 0.3082901554404145, + "mc1_stderr": 0.016630856554976103, + "mc2": 0.5156453949784039, + "mc2_stderr": 0.01673540498425732 + } + }, + "versions": { + "truthfulqa_uk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..87ffa1a02265b9ec13f193b53fba9b06f985e7a2 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_uk_mc": { + "mc1": 0.23575129533678757, + "mc1_stderr": 0.015286822062573322, + "mc2": 0.41551850845167937, + "mc2_stderr": 0.01559551532730194 + } + }, + "versions": { + "truthfulqa_uk_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git 
a/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..641e07a270f97ae74adc933fcaaf2f17f0cc2720 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_vi_mc": { + "mc1": 0.2969543147208122, + "mc1_stderr": 0.01628730493420265, + "mc2": 0.44687544361363724, + "mc2_stderr": 0.015032707389451902 + } + }, + "versions": { + "truthfulqa_vi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..281dd4ecf9b86e311de0e817f9acf01943305b44 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_vi_mc": { + "mc1": 0.2436548223350254, + "mc1_stderr": 0.015302421509379252, + "mc2": 0.42906776165158894, + "mc2_stderr": 0.016213220197264143 + } + }, + "versions": { + "truthfulqa_vi_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc762b26a77cd8c55bbb320f1c81e3b51e30910 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_zh_mc": { + "mc1": 0.22727272727272727, + "mc1_stderr": 0.014900421035751319, + "mc2": 0.3872774224063368, + "mc2_stderr": 0.01489618179042084 + } + }, + "versions": { + "truthfulqa_zh_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json new file mode 100644 index 0000000000000000000000000000000000000000..5e49b170e61cb016ddf2105ccf2469c2fd884a24 --- /dev/null +++ b/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json @@ -0,0 +1,23 @@ +{ + "results": { + "truthfulqa_zh_mc": { + "mc1": 0.26515151515151514, + "mc1_stderr": 0.015694869766795665, + "mc2": 0.43429601246293487, + "mc2_stderr": 0.015796890327346987 + } + }, + "versions": { + "truthfulqa_zh_mc": 1 + }, + "config": { + "model": "hf-auto", + "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B", + "batch_size": "1", + "device": "cuda", + "no_cache": false, + "limit": null, + "bootstrap_iters": 100000, + "description_dict": {} + } +} \ No newline at end of file