diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json
deleted file mode 100644
index f11ea3c48ac461ea8df812ba639e5871955a3481..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.22818791946308725,
-      "acc_stderr": 0.02435139725761051,
-      "acc_norm": 0.2516778523489933,
-      "acc_norm_stderr": 0.025181904610615872
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-560.json b/evals/arc-challenge/arc_ar_challenge_bloom-560.json
deleted file mode 100644
index 49fe745a2caa93a57a99f2a5d13b829f8544cd13..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.2550335570469799,
-      "acc_stderr": 0.025292327380712708,
-      "acc_norm": 0.2550335570469799,
-      "acc_norm_stderr": 0.025292327380712708
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json
deleted file mode 100644
index b79172a73e91dbbf21909686c17e2c23c1f18bef..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.28187919463087246,
-      "acc_stderr": 0.026106703750007426,
-      "acc_norm": 0.3087248322147651,
-      "acc_norm_stderr": 0.026806063072940547
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-large.json b/evals/arc-challenge/arc_ar_challenge_gpt2-large.json
deleted file mode 100644
index f1aadc6691007c31ca76e985257d9ebfbffa04c5..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.20134228187919462,
-      "acc_stderr": 0.023268565767685306,
-      "acc_norm": 0.21476510067114093,
-      "acc_norm_stderr": 0.023828868848284352
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json
deleted file mode 100644
index db628063ccf012f4301410acf74c6449499d4a18..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.19463087248322147,
-      "acc_stderr": 0.022973392306598162,
-      "acc_norm": 0.21140939597315436,
-      "acc_norm_stderr": 0.02369243605357901
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_gpt2.json b/evals/arc-challenge/arc_ar_challenge_gpt2.json
deleted file mode 100644
index 5deb8a5f49f36a08688564ca109ad5160192b56e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.20134228187919462,
-      "acc_stderr": 0.023268565767685313,
-      "acc_norm": 0.22483221476510068,
-      "acc_norm_stderr": 0.024224169829650755
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ar_challenge_llama-7B.json b/evals/arc-challenge/arc_ar_challenge_llama-7B.json
deleted file mode 100644
index e1b5a76fae32ffadeb87c9a634cef2c6de55e923..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ar_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ar_challenge": {
-      "acc": 0.22483221476510068,
-      "acc_stderr": 0.02422416982965075,
-      "acc_norm": 0.24161073825503357,
-      "acc_norm_stderr": 0.024838535108028477
-    }
-  },
-  "versions": {
-    "arc_ar_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json
deleted file mode 100644
index fa55573d46ebd614a4feb5a1aac46df0effefe2f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.20945945945945946,
-      "acc_stderr": 0.023691963473475724,
-      "acc_norm": 0.2533783783783784,
-      "acc_norm_stderr": 0.025323518629100008
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-560.json b/evals/arc-challenge/arc_bn_challenge_bloom-560.json
deleted file mode 100644
index 389eeb09c0a92f6b7861501b6a3e0b9caff08e3e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.22972972972972974,
-      "acc_stderr": 0.024491712953916975,
-      "acc_norm": 0.24662162162162163,
-      "acc_norm_stderr": 0.025096383517594287
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json
deleted file mode 100644
index 7cf6ca71cd6f8268d0ed709fbff3ff9aa1aa20f9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.23986486486486486,
-      "acc_stderr": 0.02486094967084638,
-      "acc_norm": 0.28040540540540543,
-      "acc_norm_stderr": 0.026153277917823237
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json
deleted file mode 100644
index 69dd44fcae67f0511715af28d9a6762dc0732634..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.20608108108108109,
-      "acc_stderr": 0.02355028295929425,
-      "acc_norm": 0.24662162162162163,
-      "acc_norm_stderr": 0.02509638351759427
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2.json b/evals/arc-challenge/arc_bn_challenge_gpt2.json
deleted file mode 100644
index 2de0f9a7b900ac9accabd3ade0c8a4d14d7fda03..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.22635135135135134,
-      "acc_stderr": 0.024364215012920555,
-      "acc_norm": 0.2668918918918919,
-      "acc_norm_stderr": 0.025753762926257917
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_llama-7B.json b/evals/arc-challenge/arc_bn_challenge_llama-7B.json
deleted file mode 100644
index a3dbec93edb13b0fdf7c70d9a22d0f709e0a25b2..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_bn_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_bn_challenge": {
-      "acc": 0.22635135135135134,
-      "acc_stderr": 0.024364215012920565,
-      "acc_norm": 0.26013513513513514,
-      "acc_norm_stderr": 0.02554257639364025
-    }
-  },
-  "versions": {
-    "arc_bn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json
deleted file mode 100644
index 80c6381676cf5f4508fe26a2e71b75de9f5857f5..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.2356902356902357,
-      "acc_stderr": 0.02466946003490763,
-      "acc_norm": 0.27946127946127947,
-      "acc_norm_stderr": 0.026082164400369843
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-560.json b/evals/arc-challenge/arc_ca_challenge_bloom-560.json
deleted file mode 100644
index 74ea721d64eabef94a72533148cf4d15946ea667..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.2053872053872054,
-      "acc_stderr": 0.02348110951859932,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json
deleted file mode 100644
index 828e5442ee5f197e68f640cec0d3f5a4d2190a86..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.3164983164983165,
-      "acc_stderr": 0.02703395838420779,
-      "acc_norm": 0.3434343434343434,
-      "acc_norm_stderr": 0.0276003816062635
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-large.json b/evals/arc-challenge/arc_ca_challenge_gpt2-large.json
deleted file mode 100644
index 1d1333c44929e8c397db2c9c89aa32f6c849e02f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.02362258775627148,
-      "acc_norm": 0.22895622895622897,
-      "acc_norm_stderr": 0.02442136264227106
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json
deleted file mode 100644
index b9427197beac9ba8529aa3e8014b5dee0307e089..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.023622587756271473,
-      "acc_norm": 0.21212121212121213,
-      "acc_norm_stderr": 0.023761611918761673
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_gpt2.json b/evals/arc-challenge/arc_ca_challenge_gpt2.json
deleted file mode 100644
index a9ebfd334ce3c7fa9305ddb2650d0c9ed8d727ac..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.21885521885521886,
-      "acc_stderr": 0.024032467624412215,
-      "acc_norm": 0.21885521885521886,
-      "acc_norm_stderr": 0.02403246762441221
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ca_challenge_llama-7B.json b/evals/arc-challenge/arc_ca_challenge_llama-7B.json
deleted file mode 100644
index 5b79736bea0e6806983af2b1d26982bb71d2169c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ca_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ca_challenge": {
-      "acc": 0.29292929292929293,
-      "acc_stderr": 0.026452514969665927,
-      "acc_norm": 0.29292929292929293,
-      "acc_norm_stderr": 0.02645251496966592
-    }
-  },
-  "versions": {
-    "arc_ca_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_bloom-1b7.json b/evals/arc-challenge/arc_da_challenge_bloom-1b7.json
deleted file mode 100644
index ad507f37ee73db4c175fcd2ff76b2949c5186f12..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.02429399929295737,
-      "acc_norm": 0.26262626262626265,
-      "acc_norm_stderr": 0.02557802773320011
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_bloom-560.json b/evals/arc-challenge/arc_da_challenge_bloom-560.json
deleted file mode 100644
index 76c97cf086a3d4eb479d7ea19745c4f301127a2e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.25925925925925924,
-      "acc_stderr": 0.025471492792791667,
-      "acc_norm": 0.24579124579124578,
-      "acc_norm_stderr": 0.025025521384235284
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_bloom-7b1.json b/evals/arc-challenge/arc_da_challenge_bloom-7b1.json
deleted file mode 100644
index 38cbbb63a1aa857301e47a632ca28cb48df2b26a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.24242424242424243,
-      "acc_stderr": 0.02490893747050877,
-      "acc_norm": 0.24915824915824916,
-      "acc_norm_stderr": 0.025140041284626418
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-large.json b/evals/arc-challenge/arc_da_challenge_gpt2-large.json
deleted file mode 100644
index c8ee21dc7b9e87604443ebe5bc43e5cd6006ac8a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.23232323232323232,
-      "acc_stderr": 0.02454650495612789,
-      "acc_norm": 0.24242424242424243,
-      "acc_norm_stderr": 0.024908937470508753
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_gpt2-medium.json b/evals/arc-challenge/arc_da_challenge_gpt2-medium.json
deleted file mode 100644
index df7aa6d8d8bffd69ae15219bdb1f31971d2146b7..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.24579124579124578,
-      "acc_stderr": 0.0250255213842353,
-      "acc_norm": 0.2727272727272727,
-      "acc_norm_stderr": 0.025886127156886297
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_gpt2.json b/evals/arc-challenge/arc_da_challenge_gpt2.json
deleted file mode 100644
index e06d761ac718567edd82446e7cab3db268352caf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.02416437978893547,
-      "acc_norm": 0.23905723905723905,
-      "acc_norm_stderr": 0.024790260423468984
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_da_challenge_llama-7B.json b/evals/arc-challenge/arc_da_challenge_llama-7B.json
deleted file mode 100644
index 0669687f3d0755614d71660a1b71b9c1d16c99af..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_da_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_da_challenge": {
-      "acc": 0.3063973063973064,
-      "acc_stderr": 0.026794891419479452,
-      "acc_norm": 0.3367003367003367,
-      "acc_norm_stderr": 0.02746823841289221
-    }
-  },
-  "versions": {
-    "arc_da_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_bloom-1b7.json b/evals/arc-challenge/arc_de_challenge_bloom-1b7.json
deleted file mode 100644
index 2c10bc700c0ecb2dfc8bde73b2f3f18879be1571..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.24496644295302014,
-      "acc_stderr": 0.024955035980898946,
-      "acc_norm": 0.2953020134228188,
-      "acc_norm_stderr": 0.026470155629081085
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_bloom-560.json b/evals/arc-challenge/arc_de_challenge_bloom-560.json
deleted file mode 100644
index 0c23e9b1eaef780d6a824e7c0f623556d950ca89..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.2348993288590604,
-      "acc_stderr": 0.024599255015999244,
-      "acc_norm": 0.28187919463087246,
-      "acc_norm_stderr": 0.026106703750007426
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_bloom-7b1.json b/evals/arc-challenge/arc_de_challenge_bloom-7b1.json
deleted file mode 100644
index 477d702b1bc9eee6d2f6b2ada459a35f84ed90e2..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.2684563758389262,
-      "acc_stderr": 0.0257145395148175,
-      "acc_norm": 0.2684563758389262,
-      "acc_norm_stderr": 0.0257145395148175
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-large.json b/evals/arc-challenge/arc_de_challenge_gpt2-large.json
deleted file mode 100644
index 2bc523b2a951a72b3cd9a3ca1f364c1880010ab0..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.23825503355704697,
-      "acc_stderr": 0.024719951493159625,
-      "acc_norm": 0.27181208053691275,
-      "acc_norm_stderr": 0.025815342279487567
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_gpt2-medium.json b/evals/arc-challenge/arc_de_challenge_gpt2-medium.json
deleted file mode 100644
index 45b24780309957f9064133758d7f8cccdb182f96..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.23825503355704697,
-      "acc_stderr": 0.024719951493159625,
-      "acc_norm": 0.28859060402684567,
-      "acc_norm_stderr": 0.026291942108676806
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_gpt2.json b/evals/arc-challenge/arc_de_challenge_gpt2.json
deleted file mode 100644
index dcac4b017ab401c82005ea115725c223d14f4bbb..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.22483221476510068,
-      "acc_stderr": 0.02422416982965075,
-      "acc_norm": 0.21140939597315436,
-      "acc_norm_stderr": 0.02369243605357901
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_de_challenge_llama-7B.json b/evals/arc-challenge/arc_de_challenge_llama-7B.json
deleted file mode 100644
index 8cb6300f14d8c556143f550509be7862841dc7c6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_de_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_de_challenge": {
-      "acc": 0.2785234899328859,
-      "acc_stderr": 0.0260114035784859,
-      "acc_norm": 0.348993288590604,
-      "acc_norm_stderr": 0.027658144793750224
-    }
-  },
-  "versions": {
-    "arc_de_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_bloom-1b7.json b/evals/arc-challenge/arc_es_challenge_bloom-1b7.json
deleted file mode 100644
index 74eba78a722fcedb488ec904b2f0d58171c8a749..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.2356902356902357,
-      "acc_stderr": 0.02466946003490763,
-      "acc_norm": 0.2895622895622896,
-      "acc_norm_stderr": 0.026362594432681956
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_bloom-560.json b/evals/arc-challenge/arc_es_challenge_bloom-560.json
deleted file mode 100644
index f03023ac512f6466bc05adcbbd4b74fafdb0701e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.024293999292957367,
-      "acc_norm": 0.2356902356902357,
-      "acc_norm_stderr": 0.02466946003490764
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_bloom-7b1.json b/evals/arc-challenge/arc_es_challenge_bloom-7b1.json
deleted file mode 100644
index 42cce52cd279c31092e728aadcc63cb1e0a04b59..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.3265993265993266,
-      "acc_stderr": 0.027258287015652305,
-      "acc_norm": 0.3602693602693603,
-      "acc_norm_stderr": 0.02790399493827167
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-large.json b/evals/arc-challenge/arc_es_challenge_gpt2-large.json
deleted file mode 100644
index 8889a96dc89f373c32d03d03beba715496d3c5cf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.024164379788935483,
-      "acc_norm": 0.26262626262626265,
-      "acc_norm_stderr": 0.02557802773320012
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_gpt2-medium.json b/evals/arc-challenge/arc_es_challenge_gpt2-medium.json
deleted file mode 100644
index 292e3ed1cc0e8b1b1063554055397c13de7ff5f7..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.1919191919191919,
-      "acc_stderr": 0.022889733897083934,
-      "acc_norm": 0.25252525252525254,
-      "acc_norm_stderr": 0.02525252525252536
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_gpt2.json b/evals/arc-challenge/arc_es_challenge_gpt2.json
deleted file mode 100644
index e71f05e3b44a477a0c85e997c61776163460f160..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.19865319865319866,
-      "acc_stderr": 0.023190610381322127,
-      "acc_norm": 0.24579124579124578,
-      "acc_norm_stderr": 0.0250255213842353
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_es_challenge_llama-7B.json b/evals/arc-challenge/arc_es_challenge_llama-7B.json
deleted file mode 100644
index 0fab72d1a1f2e4fd24095bb5ec61c4a1d8f08aee..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_es_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_es_challenge": {
-      "acc": 0.3501683501683502,
-      "acc_stderr": 0.027726370308831506,
-      "acc_norm": 0.3602693602693603,
-      "acc_norm_stderr": 0.02790399493827167
-    }
-  },
-  "versions": {
-    "arc_es_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json
deleted file mode 100644
index ec1113a347e63807533e24faa9f8f1133a725ba3..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.22377622377622378,
-      "acc_stderr": 0.02468755105337312,
-      "acc_norm": 0.2517482517482518,
-      "acc_norm_stderr": 0.02570896966075011
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-560.json b/evals/arc-challenge/arc_eu_challenge_bloom-560.json
deleted file mode 100644
index d21d146ef31af9e17f56082cab45ffcd1938858f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.24475524475524477,
-      "acc_stderr": 0.02546756553847068,
-      "acc_norm": 0.19230769230769232,
-      "acc_norm_stderr": 0.023345268410264786
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json
deleted file mode 100644
index a5c3fd12b9223764b5f572dbfa37a6903f058c5e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.23076923076923078,
-      "acc_stderr": 0.024957141712425013,
-      "acc_norm": 0.24125874125874125,
-      "acc_norm_stderr": 0.025343462496583764
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-large.json b/evals/arc-challenge/arc_eu_challenge_gpt2-large.json
deleted file mode 100644
index 1ca1581ef49b197cacfd25186739d7697494240c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.25874125874125875,
-      "acc_stderr": 0.02594151450124707,
-      "acc_norm": 0.24125874125874125,
-      "acc_norm_stderr": 0.025343462496583737
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json
deleted file mode 100644
index 9fcb0f103e4f8b17826dc742c5e2fd7760677501..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.2762237762237762,
-      "acc_stderr": 0.026485626798716442,
-      "acc_norm": 0.25874125874125875,
-      "acc_norm_stderr": 0.025941514501247064
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_gpt2.json b/evals/arc-challenge/arc_eu_challenge_gpt2.json
deleted file mode 100644
index 7a6f7747e337535ab8fba538b1b3e6292e596be8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.2762237762237762,
-      "acc_stderr": 0.026485626798716456,
-      "acc_norm": 0.24825174825174826,
-      "acc_norm_stderr": 0.025589390464738234
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_eu_challenge_llama-7B.json b/evals/arc-challenge/arc_eu_challenge_llama-7B.json
deleted file mode 100644
index 748beb769c74d6f45c8e93c5a0151df8949243d5..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_eu_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_eu_challenge": {
-      "acc": 0.26223776223776224,
-      "acc_stderr": 0.026054539173797044,
-      "acc_norm": 0.23426573426573427,
-      "acc_norm_stderr": 0.02508828621716978
-    }
-  },
-  "versions": {
-    "arc_eu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json
deleted file mode 100644
index e45f16627cad6e7f9c00c5e957f834e5d38c0364..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.2550335570469799,
-      "acc_stderr": 0.025292327380712687,
-      "acc_norm": 0.2953020134228188,
-      "acc_norm_stderr": 0.026470155629081078
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-560.json b/evals/arc-challenge/arc_fr_challenge_bloom-560.json
deleted file mode 100644
index c6a22e37448b26cc7b45d56b9eb1cb9358ea8a34..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.2348993288590604,
-      "acc_stderr": 0.024599255015999244,
-      "acc_norm": 0.25838926174496646,
-      "acc_norm_stderr": 0.025400777524610105
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json
deleted file mode 100644
index e7fc02c83acce1c27f68cacb276ebf9d1038459b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.36577181208053694,
-      "acc_stderr": 0.027947930997299652,
-      "acc_norm": 0.3825503355704698,
-      "acc_norm_stderr": 0.02820115194087938
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-large.json b/evals/arc-challenge/arc_fr_challenge_gpt2-large.json
deleted file mode 100644
index 9aae5d2ce6adfb2eb44ca3f0cdc1108895cd0a83..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.1912751677852349,
-      "acc_stderr": 0.02282188225534101,
-      "acc_norm": 0.2684563758389262,
-      "acc_norm_stderr": 0.025714539514817496
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json
deleted file mode 100644
index 465234e97d674cd00fa45996ea2f08a2d3e81dff..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.2181208053691275,
-      "acc_stderr": 0.023962942745646792,
-      "acc_norm": 0.2785234899328859,
-      "acc_norm_stderr": 0.026011403578485918
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_gpt2.json b/evals/arc-challenge/arc_fr_challenge_gpt2.json
deleted file mode 100644
index 4e91d18eac5ed9bf7def9d899e70e9280a10d994..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.2080536912751678,
-      "acc_stderr": 0.023553603370264107,
-      "acc_norm": 0.2751677852348993,
-      "acc_norm_stderr": 0.025914289910427518
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_fr_challenge_llama-7B.json b/evals/arc-challenge/arc_fr_challenge_llama-7B.json
deleted file mode 100644
index 289f9e2b1689351de784a6a0a22e47ebaa0bcc28..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_fr_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_fr_challenge": {
-      "acc": 0.3523489932885906,
-      "acc_stderr": 0.027719080218117063,
-      "acc_norm": 0.3422818791946309,
-      "acc_norm_stderr": 0.027531738303985358
-    }
-  },
-  "versions": {
-    "arc_fr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json
deleted file mode 100644
index a68c6f6a88aaab21388ac0f6f47a96fcad831091..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.23693379790940766,
-      "acc_stderr": 0.02514268188080883,
-      "acc_norm": 0.2613240418118467,
-      "acc_norm_stderr": 0.025979671112800046
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-560.json b/evals/arc-challenge/arc_gu_challenge_bloom-560.json
deleted file mode 100644
index 8e1e6a4854fc92fa9250450b250a4769a4c3586d..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.21951219512195122,
-      "acc_stderr": 0.0244753759026465,
-      "acc_norm": 0.25435540069686413,
-      "acc_norm_stderr": 0.025751551710541783
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json
deleted file mode 100644
index 920acb43e2275592dbf6351e0ee175bbb1a322c1..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.23693379790940766,
-      "acc_stderr": 0.02514268188080883,
-      "acc_norm": 0.23693379790940766,
-      "acc_norm_stderr": 0.025142681880808825
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-large.json b/evals/arc-challenge/arc_gu_challenge_gpt2-large.json
deleted file mode 100644
index c441954523c6d4bea5cc1b2cba0305b6c41fee49..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.22996515679442509,
-      "acc_stderr": 0.02488302588342452,
-      "acc_norm": 0.23693379790940766,
-      "acc_norm_stderr": 0.025142681880808832
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json
deleted file mode 100644
index 7aaeca4ab77d4bf203d3bf29e50b2c3f50320f78..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.2229965156794425,
-      "acc_stderr": 0.02461373413263406,
-      "acc_norm": 0.2508710801393728,
-      "acc_norm_stderr": 0.02563424701238326
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_gpt2.json b/evals/arc-challenge/arc_gu_challenge_gpt2.json
deleted file mode 100644
index a988ac9706a7406299e0de78b92c41a2151d0204..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.22996515679442509,
-      "acc_stderr": 0.024883025883424517,
-      "acc_norm": 0.24390243902439024,
-      "acc_norm_stderr": 0.025392997717581856
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_gu_challenge_llama-7B.json b/evals/arc-challenge/arc_gu_challenge_llama-7B.json
deleted file mode 100644
index 12e906c731a45f8bd9b92a525fa2d3edc9a6f62e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_gu_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_gu_challenge": {
-      "acc": 0.20557491289198607,
-      "acc_stderr": 0.023896181928798988,
-      "acc_norm": 0.26480836236933797,
-      "acc_norm_stderr": 0.026090542561414385
-    }
-  },
-  "versions": {
-    "arc_gu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json
deleted file mode 100644
index 474da43c63438f6e87405fb3780c9b001241b895..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.21140939597315436,
-      "acc_stderr": 0.02369243605357901,
-      "acc_norm": 0.23825503355704697,
-      "acc_norm_stderr": 0.024719951493159625
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-560.json b/evals/arc-challenge/arc_hi_challenge_bloom-560.json
deleted file mode 100644
index 1606ed0007915536346cb01b3395ab2cb67b09a9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.19798657718120805,
-      "acc_stderr": 0.023122269968056355,
-      "acc_norm": 0.2181208053691275,
-      "acc_norm_stderr": 0.023962942745646806
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json
deleted file mode 100644
index b5660d5853f1219cfdbd0d886a4fccd9e6a3ab2b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.25838926174496646,
-      "acc_stderr": 0.025400777524610105,
-      "acc_norm": 0.29194630872483224,
-      "acc_norm_stderr": 0.026381917944561784
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-large.json b/evals/arc-challenge/arc_hi_challenge_gpt2-large.json
deleted file mode 100644
index e6870360e984b19d105ccc86592d36a7564ff98a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.22818791946308725,
-      "acc_stderr": 0.024351397257610513,
-      "acc_norm": 0.25838926174496646,
-      "acc_norm_stderr": 0.025400777524610105
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json
deleted file mode 100644
index f64cba429b30075841311a50303cbff1487551af..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.24161073825503357,
-      "acc_stderr": 0.02483853510802848,
-      "acc_norm": 0.27181208053691275,
-      "acc_norm_stderr": 0.025815342279487567
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_gpt2.json b/evals/arc-challenge/arc_hi_challenge_gpt2.json
deleted file mode 100644
index 9ccb8fb7bd3bc4c523ed703b76c3d2526c010107..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.2181208053691275,
-      "acc_stderr": 0.023962942745646785,
-      "acc_norm": 0.2785234899328859,
-      "acc_norm_stderr": 0.026011403578485925
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hi_challenge_llama-7B.json b/evals/arc-challenge/arc_hi_challenge_llama-7B.json
deleted file mode 100644
index 90d5c1ec99c8e977e4997800431e69a1dc078659..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hi_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hi_challenge": {
-      "acc": 0.20469798657718122,
-      "acc_stderr": 0.02341232810510543,
-      "acc_norm": 0.2751677852348993,
-      "acc_norm_stderr": 0.025914289910427518
-    }
-  },
-  "versions": {
-    "arc_hi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json
deleted file mode 100644
index c4ea79c0ffc6047bb74b51d401771a577f7b2a2e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.24579124579124578,
-      "acc_stderr": 0.025025521384235302,
-      "acc_norm": 0.25925925925925924,
-      "acc_norm_stderr": 0.025471492792791692
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-560.json b/evals/arc-challenge/arc_hr_challenge_bloom-560.json
deleted file mode 100644
index d0388389e9fdfe66978f0bb663af6b9c14905b74..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.19865319865319866,
-      "acc_stderr": 0.023190610381322117,
-      "acc_norm": 0.2558922558922559,
-      "acc_norm_stderr": 0.025363000375801963
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json
deleted file mode 100644
index 27a6b5e7862ae33a52b4fcee86a333d1819e8514..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.23905723905723905,
-      "acc_stderr": 0.02479026042346899,
-      "acc_norm": 0.2962962962962963,
-      "acc_norm_stderr": 0.026540687854980666
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-large.json b/evals/arc-challenge/arc_hr_challenge_gpt2-large.json
deleted file mode 100644
index daac6d38e4cc4974c0a8b524053297e0971694a9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.18855218855218855,
-      "acc_stderr": 0.0227352759557704,
-      "acc_norm": 0.2255892255892256,
-      "acc_norm_stderr": 0.02429399929295737
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json
deleted file mode 100644
index b69e7a89e1d024529a1ccfa184f0ed211ab024e6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.18855218855218855,
-      "acc_stderr": 0.0227352759557704,
-      "acc_norm": 0.2255892255892256,
-      "acc_norm_stderr": 0.024293999292957367
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_gpt2.json b/evals/arc-challenge/arc_hr_challenge_gpt2.json
deleted file mode 100644
index d27da666a194a216383a01fe3c520895dbaada29..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.19528619528619529,
-      "acc_stderr": 0.02304149438665811,
-      "acc_norm": 0.24242424242424243,
-      "acc_norm_stderr": 0.02490893747050875
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hr_challenge_llama-7B.json b/evals/arc-challenge/arc_hr_challenge_llama-7B.json
deleted file mode 100644
index cc0a77d97f36393c01b3325f7f341ed832c808cb..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hr_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hr_challenge": {
-      "acc": 0.2996632996632997,
-      "acc_stderr": 0.026627130450114996,
-      "acc_norm": 0.3468013468013468,
-      "acc_norm_stderr": 0.027664139917201607
-    }
-  },
-  "versions": {
-    "arc_hr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json
deleted file mode 100644
index d6ee518fa194a5cab2b0fcc73ab71cfa9a4c7938..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.023622587756271476,
-      "acc_norm": 0.21212121212121213,
-      "acc_norm_stderr": 0.023761611918761676
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-560.json b/evals/arc-challenge/arc_hu_challenge_bloom-560.json
deleted file mode 100644
index 4326e9a449bfff5b4bffcb01ae73902068b16858..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.20202020202020202,
-      "acc_stderr": 0.023337132573282595,
-      "acc_norm": 0.23905723905723905,
-      "acc_norm_stderr": 0.024790260423468987
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json
deleted file mode 100644
index 7638b2f77f7140b0c0af0df71d4b9e1fd457bfb3..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.02416437978893547,
-      "acc_norm": 0.265993265993266,
-      "acc_norm_stderr": 0.025682629556652854
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-large.json b/evals/arc-challenge/arc_hu_challenge_gpt2-large.json
deleted file mode 100644
index 9a7113da6667b32d4460a28d91f71e3e716239d0..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.21212121212121213,
-      "acc_stderr": 0.023761611918761655,
-      "acc_norm": 0.24242424242424243,
-      "acc_norm_stderr": 0.02490893747050876
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json
deleted file mode 100644
index 9f05d0f663b1d94cfc4087ba1aae889603546e4a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.2356902356902357,
-      "acc_stderr": 0.02466946003490763,
-      "acc_norm": 0.2828282828282828,
-      "acc_norm_stderr": 0.026177438014745417
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_gpt2.json b/evals/arc-challenge/arc_hu_challenge_gpt2.json
deleted file mode 100644
index 3cdc244f3a355351f2b2e8826aed014e23f29fab..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.2053872053872054,
-      "acc_stderr": 0.023481109518599295,
-      "acc_norm": 0.25252525252525254,
-      "acc_norm_stderr": 0.025252525252525353
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hu_challenge_llama-7B.json b/evals/arc-challenge/arc_hu_challenge_llama-7B.json
deleted file mode 100644
index d0add74575f51f34aaed4497cfc6e42d0d8d9bc9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hu_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hu_challenge": {
-      "acc": 0.24915824915824916,
-      "acc_stderr": 0.025140041284626418,
-      "acc_norm": 0.30976430976430974,
-      "acc_norm_stderr": 0.0268762417790141
-    }
-  },
-  "versions": {
-    "arc_hu_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json b/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json
deleted file mode 100644
index c569232cfdeeffa2b9c398fa8102342e55669d6d..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.2206896551724138,
-      "acc_stderr": 0.024394801425351647,
-      "acc_norm": 0.27241379310344827,
-      "acc_norm_stderr": 0.026188332965202905
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json b/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json
deleted file mode 100644
index 6c5bcfbaa2c0570aa97441fc418e71f242460803..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.18620689655172415,
-      "acc_stderr": 0.022898443475326664,
-      "acc_norm": 0.2689655172413793,
-      "acc_norm_stderr": 0.02608364690576629
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-large.json b/evals/arc-challenge/arc_hy_challenge_gpt2-large.json
deleted file mode 100644
index d3fa3d404e18049ccef76e50f8abe3deed88b1e6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.19310344827586207,
-      "acc_stderr": 0.02321961545031108,
-      "acc_norm": 0.23793103448275862,
-      "acc_norm_stderr": 0.025048040852790374
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json b/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json
deleted file mode 100644
index a8f1fd794a777a25dca5bd3d54b52082a503039d..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.20689655172413793,
-      "acc_stderr": 0.02382827611454507,
-      "acc_norm": 0.25862068965517243,
-      "acc_norm_stderr": 0.025757454562272446
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_gpt2.json b/evals/arc-challenge/arc_hy_challenge_gpt2.json
deleted file mode 100644
index a6b0c05a8a5c5112ef3326264ffa348cbe02c2ff..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.1793103448275862,
-      "acc_stderr": 0.022565410117928373,
-      "acc_norm": 0.27241379310344827,
-      "acc_norm_stderr": 0.026188332965202905
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_llama-7B.json b/evals/arc-challenge/arc_hy_challenge_llama-7B.json
deleted file mode 100644
index 76c60ed9c16ffa50256b3420a3d1c544d27d0f8a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_hy_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_hy_challenge": {
-      "acc": 0.2206896551724138,
-      "acc_stderr": 0.024394801425351637,
-      "acc_norm": 0.30344827586206896,
-      "acc_norm_stderr": 0.02704394858012006
-    }
-  },
-  "versions": {
-    "arc_hy_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_bloom-1b7.json b/evals/arc-challenge/arc_id_challenge_bloom-1b7.json
deleted file mode 100644
index 8edb6191b5ef4693fcf7dfc5cfad9800d7044c56..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.2986577181208054,
-      "acc_stderr": 0.026556672487880535,
-      "acc_norm": 0.2751677852348993,
-      "acc_norm_stderr": 0.025914289910427518
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_bloom-560.json b/evals/arc-challenge/arc_id_challenge_bloom-560.json
deleted file mode 100644
index 1d88eb711d44c2d77c4554d4f4d6e553aa1209eb..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.24496644295302014,
-      "acc_stderr": 0.024955035980898963,
-      "acc_norm": 0.28187919463087246,
-      "acc_norm_stderr": 0.026106703750007423
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_bloom-7b1.json b/evals/arc-challenge/arc_id_challenge_bloom-7b1.json
deleted file mode 100644
index 9d6908c8177308068c88e133ad1287687c46dcce..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.3187919463087248,
-      "acc_stderr": 0.027040538296634997,
-      "acc_norm": 0.3825503355704698,
-      "acc_norm_stderr": 0.028201151940879375
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-large.json b/evals/arc-challenge/arc_id_challenge_gpt2-large.json
deleted file mode 100644
index ab5432ed0c027006e5940d1dbd8e9231eccd5ab0..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.23825503355704697,
-      "acc_stderr": 0.02471995149315962,
-      "acc_norm": 0.2684563758389262,
-      "acc_norm_stderr": 0.025714539514817496
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_gpt2-medium.json b/evals/arc-challenge/arc_id_challenge_gpt2-medium.json
deleted file mode 100644
index 156b2294f71673c6950d132b56805c5e36900b92..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.2080536912751678,
-      "acc_stderr": 0.023553603370264114,
-      "acc_norm": 0.2483221476510067,
-      "acc_norm_stderr": 0.025069483148037884
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_gpt2.json b/evals/arc-challenge/arc_id_challenge_gpt2.json
deleted file mode 100644
index ef1ed97c321fe9cc50de905c218517b2d6bb812d..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.23825503355704697,
-      "acc_stderr": 0.024719951493159628,
-      "acc_norm": 0.2785234899328859,
-      "acc_norm_stderr": 0.026011403578485907
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_id_challenge_llama-7B.json b/evals/arc-challenge/arc_id_challenge_llama-7B.json
deleted file mode 100644
index 531f6f81397ca5506b0f36d1291417201eb9b72e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_id_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_id_challenge": {
-      "acc": 0.23154362416107382,
-      "acc_stderr": 0.024476414420146617,
-      "acc_norm": 0.28523489932885904,
-      "acc_norm_stderr": 0.02620021021413825
-    }
-  },
-  "versions": {
-    "arc_id_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_bloom-1b7.json b/evals/arc-challenge/arc_it_challenge_bloom-1b7.json
deleted file mode 100644
index c38c75e09195bcf94e26d180f17837747473c6f7..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.2558922558922559,
-      "acc_stderr": 0.025363000375801963,
-      "acc_norm": 0.24579124579124578,
-      "acc_norm_stderr": 0.025025521384235284
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_bloom-560.json b/evals/arc-challenge/arc_it_challenge_bloom-560.json
deleted file mode 100644
index a1001fcc2f2df8d064ae2cefca3cbcf0212ed670..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.20202020202020202,
-      "acc_stderr": 0.023337132573282612,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_bloom-7b1.json b/evals/arc-challenge/arc_it_challenge_bloom-7b1.json
deleted file mode 100644
index fe8c476fe99201a63e06353589f9b571026510a6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.24242424242424243,
-      "acc_stderr": 0.02490893747050875,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-large.json b/evals/arc-challenge/arc_it_challenge_gpt2-large.json
deleted file mode 100644
index 2508d33a6975391a9665c19ebb10213e84bd23da..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.02429399929295737,
-      "acc_norm": 0.25252525252525254,
-      "acc_norm_stderr": 0.025252525252525342
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_gpt2.json b/evals/arc-challenge/arc_it_challenge_gpt2.json
deleted file mode 100644
index 611874b61c1374b902d583cf5cefbc4492ed6ac6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.22895622895622897,
-      "acc_stderr": 0.024421362642271068,
-      "acc_norm": 0.24579124579124578,
-      "acc_norm_stderr": 0.025025521384235284
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_llama-7B.json b/evals/arc-challenge/arc_it_challenge_llama-7B.json
deleted file mode 100644
index 026bc2c2a59b0b1e397e34c3f50a439cc3237e6c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_it_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_it_challenge": {
-      "acc": 0.3164983164983165,
-      "acc_stderr": 0.02703395838420781,
-      "acc_norm": 0.3367003367003367,
-      "acc_norm_stderr": 0.02746823841289221
-    }
-  },
-  "versions": {
-    "arc_it_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json b/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json
deleted file mode 100644
index d30129acdd6c23d97224155d05ff525778afc39a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.2097902097902098,
-      "acc_stderr": 0.024118005042923673,
-      "acc_norm": 0.25874125874125875,
-      "acc_norm_stderr": 0.025941514501247074
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-560.json b/evals/arc-challenge/arc_kn_challenge_bloom-560.json
deleted file mode 100644
index 9061ffd18bb78ef2415b46937475b366aaba5e70..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.2097902097902098,
-      "acc_stderr": 0.024118005042923676,
-      "acc_norm": 0.2727272727272727,
-      "acc_norm_stderr": 0.026380954549454924
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json b/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json
deleted file mode 100644
index 083303db0d99abb50df9664e66431757fcbc34cf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.2062937062937063,
-      "acc_stderr": 0.023969030679396822,
-      "acc_norm": 0.27972027972027974,
-      "acc_norm_stderr": 0.02658827368712313
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-large.json b/evals/arc-challenge/arc_kn_challenge_gpt2-large.json
deleted file mode 100644
index cc1d0795f8679f5f353a8fe04a823ce8944d6180..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.24125874125874125,
-      "acc_stderr": 0.02534346249658375,
-      "acc_norm": 0.2062937062937063,
-      "acc_norm_stderr": 0.02396903067939682
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json b/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json
deleted file mode 100644
index 3272316d0c0fa316ff58bd4f0a3c248c27457501..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.23076923076923078,
-      "acc_stderr": 0.02495714171242502,
-      "acc_norm": 0.23426573426573427,
-      "acc_norm_stderr": 0.025088286217169773
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_gpt2.json b/evals/arc-challenge/arc_kn_challenge_gpt2.json
deleted file mode 100644
index 06e41e33136f376ee8441914155f63301d2b3150..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.21678321678321677,
-      "acc_stderr": 0.02440795482238759,
-      "acc_norm": 0.1993006993006993,
-      "acc_norm_stderr": 0.023662831210753306
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_kn_challenge_llama-7B.json b/evals/arc-challenge/arc_kn_challenge_llama-7B.json
deleted file mode 100644
index 54ade592ef4b8faca4ac733019e8a288ffcd7080..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_kn_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_kn_challenge": {
-      "acc": 0.25524475524475526,
-      "acc_stderr": 0.025826334320570847,
-      "acc_norm": 0.2762237762237762,
-      "acc_norm_stderr": 0.026485626798716456
-    }
-  },
-  "versions": {
-    "arc_kn_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json
deleted file mode 100644
index 237a4de001e4d03d3a5da1bd85ff383ee5ed3641..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.20270270270270271,
-      "acc_stderr": 0.023406091994174035,
-      "acc_norm": 0.20945945945945946,
-      "acc_norm_stderr": 0.023691963473475734
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-560.json b/evals/arc-challenge/arc_ml_challenge_bloom-560.json
deleted file mode 100644
index b276b36482cf0a1c5ed243c8a17297e981587426..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.19932432432432431,
-      "acc_stderr": 0.02325934388926828,
-      "acc_norm": 0.23310810810810811,
-      "acc_norm_stderr": 0.024616978985669728
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json
deleted file mode 100644
index 57e340993dc80aab56386e3c1ade388f4d786241..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.22635135135135134,
-      "acc_stderr": 0.024364215012920545,
-      "acc_norm": 0.22297297297297297,
-      "acc_norm_stderr": 0.02423444993634421
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-large.json b/evals/arc-challenge/arc_ml_challenge_gpt2-large.json
deleted file mode 100644
index a23148b0cf58ef04dc9ab3bb8d26aedadda9296f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.22972972972972974,
-      "acc_stderr": 0.024491712953916972,
-      "acc_norm": 0.22297297297297297,
-      "acc_norm_stderr": 0.024234449936344216
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json
deleted file mode 100644
index 9aa842f5ce9d59030c7aae3de538f9b3ea816580..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.2533783783783784,
-      "acc_stderr": 0.0253235186291,
-      "acc_norm": 0.21283783783783783,
-      "acc_norm_stderr": 0.0238311783119674
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_llama-7B.json b/evals/arc-challenge/arc_ml_challenge_llama-7B.json
deleted file mode 100644
index 3f4555f5009cd795dea8981be98bec45e2ed9369..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ml_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ml_challenge": {
-      "acc": 0.21621621621621623,
-      "acc_stderr": 0.023967970439477224,
-      "acc_norm": 0.20270270270270271,
-      "acc_norm_stderr": 0.023406091994174035
-    }
-  },
-  "versions": {
-    "arc_ml_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json
deleted file mode 100644
index c8b3bb6a26b22a95c0a8de8ae3221f476963428f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.24067796610169492,
-      "acc_stderr": 0.02493202205172924,
-      "acc_norm": 0.2440677966101695,
-      "acc_norm_stderr": 0.02505088069031971
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-560.json b/evals/arc-challenge/arc_mr_challenge_bloom-560.json
deleted file mode 100644
index 213f904f45633d7bdef01eef045a28ec2636faf5..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.2440677966101695,
-      "acc_stderr": 0.025050880690319716,
-      "acc_norm": 0.22372881355932203,
-      "acc_norm_stderr": 0.02430491058853199
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json
deleted file mode 100644
index 4a6cfb61ab6cccf8da1ad0ec46c1bde46e11be82..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.23389830508474577,
-      "acc_stderr": 0.024687839412166384,
-      "acc_norm": 0.2440677966101695,
-      "acc_norm_stderr": 0.025050880690319702
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-large.json b/evals/arc-challenge/arc_mr_challenge_gpt2-large.json
deleted file mode 100644
index 380f5aee1d555e85568122130af494663cb3123f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.2,
-      "acc_stderr": 0.023328473740792135,
-      "acc_norm": 0.2440677966101695,
-      "acc_norm_stderr": 0.025050880690319702
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json
deleted file mode 100644
index 7df5889da7e82e2529e4532947c4e0e8507ba94c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.2,
-      "acc_stderr": 0.023328473740792135,
-      "acc_norm": 0.22372881355932203,
-      "acc_norm_stderr": 0.024304910588531993
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_gpt2.json b/evals/arc-challenge/arc_mr_challenge_gpt2.json
deleted file mode 100644
index 8344c19a2efa7d7c252e94ea149ef5b421b34214..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.18305084745762712,
-      "acc_stderr": 0.02255328043040195,
-      "acc_norm": 0.2033898305084746,
-      "acc_norm_stderr": 0.023475447251410726
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_mr_challenge_llama-7B.json b/evals/arc-challenge/arc_mr_challenge_llama-7B.json
deleted file mode 100644
index f1cf03e6c1c130bd7352dd7963fe03ae5f4303fe..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_mr_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_mr_challenge": {
-      "acc": 0.2271186440677966,
-      "acc_stderr": 0.024434819973932945,
-      "acc_norm": 0.2711864406779661,
-      "acc_norm_stderr": 0.025927971596786177
-    }
-  },
-  "versions": {
-    "arc_mr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json
deleted file mode 100644
index 9ef6fea604fc9172e63676717b7455a756bbbd4e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.024164379788935486,
-      "acc_norm": 0.30303030303030304,
-      "acc_norm_stderr": 0.026711859553317677
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-560.json b/evals/arc-challenge/arc_ne_challenge_bloom-560.json
deleted file mode 100644
index 490a9ae38f7edf0f013f898d0c075db2184dc99b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.25925925925925924,
-      "acc_stderr": 0.02547149279279167,
-      "acc_norm": 0.28619528619528617,
-      "acc_norm_stderr": 0.02627090829835463
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json
deleted file mode 100644
index 0b1c6c30b759cb29ce78c358d0d709a7b53f16f3..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.24242424242424243,
-      "acc_stderr": 0.024908937470508766,
-      "acc_norm": 0.2996632996632997,
-      "acc_norm_stderr": 0.02662713045011499
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-large.json b/evals/arc-challenge/arc_ne_challenge_gpt2-large.json
deleted file mode 100644
index 82b4b764b3fb7ef15563ca6d2c27830e3aef8d51..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.23905723905723905,
-      "acc_stderr": 0.024790260423468984,
-      "acc_norm": 0.23905723905723905,
-      "acc_norm_stderr": 0.02479026042346898
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json
deleted file mode 100644
index 18464b4f845260d9e4122a7c74c4fc758519296a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.23905723905723905,
-      "acc_stderr": 0.024790260423468984,
-      "acc_norm": 0.24579124579124578,
-      "acc_norm_stderr": 0.025025521384235295
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_gpt2.json b/evals/arc-challenge/arc_ne_challenge_gpt2.json
deleted file mode 100644
index 669e0661f7894b2bdc02512e274ab12a340e6f2c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ne_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ne_challenge": {
-      "acc": 0.2356902356902357,
-      "acc_stderr": 0.024669460034907637,
-      "acc_norm": 0.2255892255892256,
-      "acc_norm_stderr": 0.02429399929295737
-    }
-  },
-  "versions": {
-    "arc_ne_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json b/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json
deleted file mode 100644
index de6df0fa84c07702ad9d3005757f4412e835e175..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.20469798657718122,
-      "acc_stderr": 0.02341232810510543,
-      "acc_norm": 0.24161073825503357,
-      "acc_norm_stderr": 0.024838535108028484
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-560.json b/evals/arc-challenge/arc_nl_challenge_bloom-560.json
deleted file mode 100644
index 4bd9dec46927eea8709a44925f7f7f5e4d35c055..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.22483221476510068,
-      "acc_stderr": 0.024224169829650748,
-      "acc_norm": 0.2651006711409396,
-      "acc_norm_stderr": 0.025611859712206003
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json b/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json
deleted file mode 100644
index 5360e3ed9ed9f43f4cbddc65166e1d83d89a29e6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.20134228187919462,
-      "acc_stderr": 0.0232685657676853,
-      "acc_norm": 0.2684563758389262,
-      "acc_norm_stderr": 0.025714539514817496
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-large.json b/evals/arc-challenge/arc_nl_challenge_gpt2-large.json
deleted file mode 100644
index 432863c5e4840c2d01bdac986765c61050413f9f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.2080536912751678,
-      "acc_stderr": 0.023553603370264114,
-      "acc_norm": 0.2516778523489933,
-      "acc_norm_stderr": 0.025181904610615855
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json b/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json
deleted file mode 100644
index 65d7c05ced99e1bd53aa3110a033d9c0975025fa..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.23154362416107382,
-      "acc_stderr": 0.024476414420146628,
-      "acc_norm": 0.2550335570469799,
-      "acc_norm_stderr": 0.025292327380712687
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_gpt2.json b/evals/arc-challenge/arc_nl_challenge_gpt2.json
deleted file mode 100644
index bce39d9e1424be6bf01a0c15447e59c3348a08d6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.21476510067114093,
-      "acc_stderr": 0.023828868848284373,
-      "acc_norm": 0.24496644295302014,
-      "acc_norm_stderr": 0.024955035980898956
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_nl_challenge_llama-7B.json b/evals/arc-challenge/arc_nl_challenge_llama-7B.json
deleted file mode 100644
index a9b3e1e927abac3aba0720a5085b3a1b041af85b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_nl_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_nl_challenge": {
-      "acc": 0.2953020134228188,
-      "acc_stderr": 0.026470155629081078,
-      "acc_norm": 0.32550335570469796,
-      "acc_norm_stderr": 0.027188760373954457
-    }
-  },
-  "versions": {
-    "arc_nl_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json b/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json
deleted file mode 100644
index 86206aa4c02654dee089146263800252a9280415..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.22483221476510068,
-      "acc_stderr": 0.024224169829650755,
-      "acc_norm": 0.28187919463087246,
-      "acc_norm_stderr": 0.026106703750007426
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-560.json b/evals/arc-challenge/arc_pt_challenge_bloom-560.json
deleted file mode 100644
index 11021802d7ffa732fc84739fd8ec1d531dc637b6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.22483221476510068,
-      "acc_stderr": 0.02422416982965075,
-      "acc_norm": 0.23154362416107382,
-      "acc_norm_stderr": 0.02447641442014662
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json b/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json
deleted file mode 100644
index e9f27045095eca6ce035e90605bdff561f37a5a8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.348993288590604,
-      "acc_stderr": 0.02765814479375022,
-      "acc_norm": 0.3724832214765101,
-      "acc_norm_stderr": 0.02805354855477509
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-large.json b/evals/arc-challenge/arc_pt_challenge_gpt2-large.json
deleted file mode 100644
index fd1a4b8d1948d7ebf686b68f03b68fae0c5e41de..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.18791946308724833,
-      "acc_stderr": 0.022667687029933926,
-      "acc_norm": 0.24161073825503357,
-      "acc_norm_stderr": 0.024838535108028477
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json b/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json
deleted file mode 100644
index 0380aff06ff37610aa48dddf5d15f62376f1d08b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.18120805369127516,
-      "acc_stderr": 0.02235101779623449,
-      "acc_norm": 0.2348993288590604,
-      "acc_norm_stderr": 0.024599255015999244
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_gpt2.json b/evals/arc-challenge/arc_pt_challenge_gpt2.json
deleted file mode 100644
index 6a1952ed53a80de06750b3d6155487089a0672bd..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.19463087248322147,
-      "acc_stderr": 0.022973392306598166,
-      "acc_norm": 0.2483221476510067,
-      "acc_norm_stderr": 0.025069483148037884
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_pt_challenge_llama-7B.json b/evals/arc-challenge/arc_pt_challenge_llama-7B.json
deleted file mode 100644
index e49526aa9a3f1e1f7fda72f9bf9b3a58227a95ce..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_pt_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_pt_challenge": {
-      "acc": 0.32550335570469796,
-      "acc_stderr": 0.027188760373954457,
-      "acc_norm": 0.33557046979865773,
-      "acc_norm_stderr": 0.027399214125091453
-    }
-  },
-  "versions": {
-    "arc_pt_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json
deleted file mode 100644
index bd189e9050be188d43e3bac19cd42c400c5df7c8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.24915824915824916,
-      "acc_stderr": 0.025140041284626418,
-      "acc_norm": 0.28619528619528617,
-      "acc_norm_stderr": 0.026270908298354635
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-560.json b/evals/arc-challenge/arc_ro_challenge_bloom-560.json
deleted file mode 100644
index a797f1ebfa7d92e0c78e624b99da52e77c92822c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.023622587756271473,
-      "acc_norm": 0.26936026936026936,
-      "acc_norm_stderr": 0.025785321789052268
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json
deleted file mode 100644
index 7e63a3d72b4f1a770523a9859787818e4e1ed26e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.25252525252525254,
-      "acc_stderr": 0.025252525252525346,
-      "acc_norm": 0.30303030303030304,
-      "acc_norm_stderr": 0.02671185955331767
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-large.json b/evals/arc-challenge/arc_ro_challenge_gpt2-large.json
deleted file mode 100644
index 68f4f45196bec82ad2ec165f33cae93bfbedbe44..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.18855218855218855,
-      "acc_stderr": 0.022735275955770386,
-      "acc_norm": 0.2828282828282828,
-      "acc_norm_stderr": 0.026177438014745407
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json
deleted file mode 100644
index 5df0a11438afe98b491a6e5528d70eacb48652cf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.18855218855218855,
-      "acc_stderr": 0.022735275955770375,
-      "acc_norm": 0.2558922558922559,
-      "acc_norm_stderr": 0.025363000375801976
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_gpt2.json b/evals/arc-challenge/arc_ro_challenge_gpt2.json
deleted file mode 100644
index 37203889a39601337bd2d8ffcd85a3e4693013ad..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.02362258775627147,
-      "acc_norm": 0.2962962962962963,
-      "acc_norm_stderr": 0.026540687854980673
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ro_challenge_llama-7B.json b/evals/arc-challenge/arc_ro_challenge_llama-7B.json
deleted file mode 100644
index 37d943e737472a25d2c879425d478f6dd746e1f4..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ro_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ro_challenge": {
-      "acc": 0.2828282828282828,
-      "acc_stderr": 0.02617743801474542,
-      "acc_norm": 0.3164983164983165,
-      "acc_norm_stderr": 0.027033958384207805
-    }
-  },
-  "versions": {
-    "arc_ro_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json
deleted file mode 100644
index fc9a3f783edc283ec79c7906da73bc8a27f80a9d..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.25252525252525254,
-      "acc_stderr": 0.02525252525252537,
-      "acc_norm": 0.3569023569023569,
-      "acc_norm_stderr": 0.027846288057490554
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-560.json b/evals/arc-challenge/arc_ru_challenge_bloom-560.json
deleted file mode 100644
index 863c94dcc4459d25ef7faec70a11d6199434c8af..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.24915824915824916,
-      "acc_stderr": 0.025140041284626418,
-      "acc_norm": 0.3333333333333333,
-      "acc_norm_stderr": 0.027399831217559588
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json
deleted file mode 100644
index 5b61e526e728d5523f1e61b4fe49307c1c872c4c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.25925925925925924,
-      "acc_stderr": 0.025471492792791674,
-      "acc_norm": 0.32996632996632996,
-      "acc_norm_stderr": 0.02732985145570343
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-large.json b/evals/arc-challenge/arc_ru_challenge_gpt2-large.json
deleted file mode 100644
index fd367513e4157fb1556348f212a5c6e94922beee..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.24579124579124578,
-      "acc_stderr": 0.02502552138423529,
-      "acc_norm": 0.29292929292929293,
-      "acc_norm_stderr": 0.026452514969665924
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json
deleted file mode 100644
index 8a7b6aee643ab931ddd7a2528c36075699604170..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.21548821548821548,
-      "acc_stderr": 0.023898224834697,
-      "acc_norm": 0.2558922558922559,
-      "acc_norm_stderr": 0.025363000375801963
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_gpt2.json b/evals/arc-challenge/arc_ru_challenge_gpt2.json
deleted file mode 100644
index 6c01167509035c09b2ab40ba64c6f23d0d3b61c6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.19865319865319866,
-      "acc_stderr": 0.023190610381322137,
-      "acc_norm": 0.26936026936026936,
-      "acc_norm_stderr": 0.025785321789052268
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ru_challenge_llama-7B.json b/evals/arc-challenge/arc_ru_challenge_llama-7B.json
deleted file mode 100644
index c6af8bacc84e8232e587af0b1b62f0360595f5b8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ru_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ru_challenge": {
-      "acc": 0.2895622895622896,
-      "acc_stderr": 0.026362594432681956,
-      "acc_norm": 0.3333333333333333,
-      "acc_norm_stderr": 0.027399831217559577
-    }
-  },
-  "versions": {
-    "arc_ru_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json
deleted file mode 100644
index 5c061cbf7e912082f72face7e42633294acb46b4..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.2516778523489933,
-      "acc_stderr": 0.02518190461061586,
-      "acc_norm": 0.2516778523489933,
-      "acc_norm_stderr": 0.025181904610615865
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-560.json b/evals/arc-challenge/arc_sk_challenge_bloom-560.json
deleted file mode 100644
index 77221ca57be5ff0cc96e73fc774d0670d7c7208c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.24161073825503357,
-      "acc_stderr": 0.02483853510802848,
-      "acc_norm": 0.22483221476510068,
-      "acc_norm_stderr": 0.02422416982965075
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json
deleted file mode 100644
index 2d78271208e5af3f6496e645f8b79b3b7394aa34..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.2348993288590604,
-      "acc_stderr": 0.024599255015999244,
-      "acc_norm": 0.25838926174496646,
-      "acc_norm_stderr": 0.025400777524610105
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-large.json b/evals/arc-challenge/arc_sk_challenge_gpt2-large.json
deleted file mode 100644
index 128f662c32c44780afb9fd950815540a151364d6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.24161073825503357,
-      "acc_stderr": 0.02483853510802848,
-      "acc_norm": 0.2516778523489933,
-      "acc_norm_stderr": 0.025181904610615858
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json
deleted file mode 100644
index 75bc31afba2a470fbe33869562f865ae458240c8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.23825503355704697,
-      "acc_stderr": 0.02471995149315962,
-      "acc_norm": 0.24496644295302014,
-      "acc_norm_stderr": 0.02495503598089895
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_gpt2.json b/evals/arc-challenge/arc_sk_challenge_gpt2.json
deleted file mode 100644
index 28459f8e1e1dc32e8d92343933fa438b717eb85b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.2348993288590604,
-      "acc_stderr": 0.024599255015999244,
-      "acc_norm": 0.23154362416107382,
-      "acc_norm_stderr": 0.02447641442014662
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sk_challenge_llama-7B.json b/evals/arc-challenge/arc_sk_challenge_llama-7B.json
deleted file mode 100644
index 3701c2f5034fd64259683639da7b904f8bf0d1d1..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sk_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sk_challenge": {
-      "acc": 0.2348993288590604,
-      "acc_stderr": 0.024599255015999244,
-      "acc_norm": 0.2550335570469799,
-      "acc_norm_stderr": 0.025292327380712683
-    }
-  },
-  "versions": {
-    "arc_sk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json
deleted file mode 100644
index dbdcdb6f40e4a2a2d630ac6967d84266a19ee386..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.23986486486486486,
-      "acc_stderr": 0.024860949670846393,
-      "acc_norm": 0.2635135135135135,
-      "acc_norm_stderr": 0.025649141242391035
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-560.json b/evals/arc-challenge/arc_sr_challenge_bloom-560.json
deleted file mode 100644
index f4e4aafa24a952d05d4ff3efde104237233e2747..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.22972972972972974,
-      "acc_stderr": 0.02449171295391697,
-      "acc_norm": 0.27702702702702703,
-      "acc_norm_stderr": 0.02605620088360472
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json
deleted file mode 100644
index e70cc59ff97ac76e9506b0a8c29249c91543af45..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.26013513513513514,
-      "acc_stderr": 0.025542576393640232,
-      "acc_norm": 0.30067567567567566,
-      "acc_norm_stderr": 0.026697921821786215
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-large.json b/evals/arc-challenge/arc_sr_challenge_gpt2-large.json
deleted file mode 100644
index 381e33947c532c85c78a23c4986d737ed19bc7e1..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.1891891891891892,
-      "acc_stderr": 0.022803258753373676,
-      "acc_norm": 0.24324324324324326,
-      "acc_norm_stderr": 0.024979718407699757
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json
deleted file mode 100644
index d59206fddbda1dfd8cd1e6514ca6cba7f09dd45b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.20608108108108109,
-      "acc_stderr": 0.023550282959294247,
-      "acc_norm": 0.24662162162162163,
-      "acc_norm_stderr": 0.02509638351759426
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_gpt2.json b/evals/arc-challenge/arc_sr_challenge_gpt2.json
deleted file mode 100644
index ed4d03dcbbbdb78f9e36972c6c09ea65f958accf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.18243243243243243,
-      "acc_stderr": 0.0224854634796718,
-      "acc_norm": 0.22972972972972974,
-      "acc_norm_stderr": 0.024491712953916972
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sr_challenge_llama-7B.json b/evals/arc-challenge/arc_sr_challenge_llama-7B.json
deleted file mode 100644
index 9a1c5c3f8986ce3acbf704e6d2fbd4d82fbcc724..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sr_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sr_challenge": {
-      "acc": 0.2905405405405405,
-      "acc_stderr": 0.026433590266607382,
-      "acc_norm": 0.2972972972972973,
-      "acc_norm_stderr": 0.02661155695908287
-    }
-  },
-  "versions": {
-    "arc_sr_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json b/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json
deleted file mode 100644
index 962c6f1d023be86a6fa7adf0d018a08eda14f1b8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.20202020202020202,
-      "acc_stderr": 0.023337132573282605,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-560.json b/evals/arc-challenge/arc_sv_challenge_bloom-560.json
deleted file mode 100644
index 9477cbe0f42a6cdde99f9a0af2293c4b1c23cf00..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.21212121212121213,
-      "acc_stderr": 0.02376161191876168,
-      "acc_norm": 0.2053872053872054,
-      "acc_norm_stderr": 0.023481109518599313
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json b/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json
deleted file mode 100644
index c89c1d01bfea674f9f7d9549f8abf2abe32192f8..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.024293999292957367,
-      "acc_norm": 0.265993265993266,
-      "acc_norm_stderr": 0.02568262955665285
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2-large.json b/evals/arc-challenge/arc_sv_challenge_gpt2-large.json
deleted file mode 100644
index c090b83981933a41b620746123d08d4ba90f53a2..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.22895622895622897,
-      "acc_stderr": 0.02442136264227106,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json b/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json
deleted file mode 100644
index 31f537c4fb8157ec63b8cbcb4d2001cfd08e1533..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.024293999292957367,
-      "acc_norm": 0.24242424242424243,
-      "acc_norm_stderr": 0.02490893747050876
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_llama-7B.json b/evals/arc-challenge/arc_sv_challenge_llama-7B.json
deleted file mode 100644
index c2c4e7550c402c4d3dbaf7d6ea56dbf864c439ce..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_sv_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_sv_challenge": {
-      "acc": 0.2962962962962963,
-      "acc_stderr": 0.026540687854980646,
-      "acc_norm": 0.30303030303030304,
-      "acc_norm_stderr": 0.02671185955331767
-    }
-  },
-  "versions": {
-    "arc_sv_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json b/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json
deleted file mode 100644
index a937aa6dd9066efa74a5b88515612f7dc4ba6691..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.21283783783783783,
-      "acc_stderr": 0.02383117831196738,
-      "acc_norm": 0.25675675675675674,
-      "acc_norm_stderr": 0.025434043955304575
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-560.json b/evals/arc-challenge/arc_ta_challenge_bloom-560.json
deleted file mode 100644
index 6b1c389d448803dd7a2c483cec6aa7ff1876c4a6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.19932432432432431,
-      "acc_stderr": 0.02325934388926828,
-      "acc_norm": 0.2533783783783784,
-      "acc_norm_stderr": 0.025323518629100025
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json b/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json
deleted file mode 100644
index a5da07219683283eaafbda47b1ed0957be400dda..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.23310810810810811,
-      "acc_stderr": 0.024616978985669728,
-      "acc_norm": 0.24324324324324326,
-      "acc_norm_stderr": 0.02497971840769973
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-large.json b/evals/arc-challenge/arc_ta_challenge_gpt2-large.json
deleted file mode 100644
index 918cb1c7f6be3a7693ecf8713714c664843cfc38..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.21283783783783783,
-      "acc_stderr": 0.02383117831196738,
-      "acc_norm": 0.23310810810810811,
-      "acc_norm_stderr": 0.024616978985669724
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json b/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json
deleted file mode 100644
index 6af3ab31fdcf16311ec8594bad8ee052c05b16bc..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.2195945945945946,
-      "acc_stderr": 0.02410238110604679,
-      "acc_norm": 0.2668918918918919,
-      "acc_norm_stderr": 0.025753762926257903
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_gpt2.json b/evals/arc-challenge/arc_ta_challenge_gpt2.json
deleted file mode 100644
index 5245a03aac201f65f42e53dcabf6d1f7c0717d52..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.23986486486486486,
-      "acc_stderr": 0.024860949670846396,
-      "acc_norm": 0.26013513513513514,
-      "acc_norm_stderr": 0.025542576393640246
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ta_challenge_llama-7B.json b/evals/arc-challenge/arc_ta_challenge_llama-7B.json
deleted file mode 100644
index 241feef032d750202d858fbc9162e3549a178160..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_ta_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_ta_challenge": {
-      "acc": 0.20270270270270271,
-      "acc_stderr": 0.02340609199417405,
-      "acc_norm": 0.22297297297297297,
-      "acc_norm_stderr": 0.02423444993634422
-    }
-  },
-  "versions": {
-    "arc_ta_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_bloom-1b7.json b/evals/arc-challenge/arc_te_challenge_bloom-1b7.json
deleted file mode 100644
index ce9a2c9841dcb9e494770a8c9199b82c8ab4c9f7..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.21897810218978103,
-      "acc_stderr": 0.02502941075517834,
-      "acc_norm": 0.2591240875912409,
-      "acc_norm_stderr": 0.026518277256436896
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_bloom-560.json b/evals/arc-challenge/arc_te_challenge_bloom-560.json
deleted file mode 100644
index 0d326f4a1b5d45a12a085af0588dc48da1242b19..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.22627737226277372,
-      "acc_stderr": 0.02532397574413385,
-      "acc_norm": 0.24087591240875914,
-      "acc_norm_stderr": 0.025880445559939208
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_bloom-7b1.json b/evals/arc-challenge/arc_te_challenge_bloom-7b1.json
deleted file mode 100644
index 1c6d34bb9da6f86f1a4494caba49a2d1bab46bcf..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.20072992700729927,
-      "acc_stderr": 0.024242171306158907,
-      "acc_norm": 0.25547445255474455,
-      "acc_norm_stderr": 0.026395641265678074
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-large.json b/evals/arc-challenge/arc_te_challenge_gpt2-large.json
deleted file mode 100644
index 226ed83458102ea0a3f4161159558d6ae8875357..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.22627737226277372,
-      "acc_stderr": 0.02532397574413385,
-      "acc_norm": 0.24087591240875914,
-      "acc_norm_stderr": 0.025880445559939208
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_gpt2-medium.json b/evals/arc-challenge/arc_te_challenge_gpt2-medium.json
deleted file mode 100644
index a5bd92092ab22f31db2d36d69626c32b485ab331..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.2116788321167883,
-      "acc_stderr": 0.02472344500978517,
-      "acc_norm": 0.22992700729927007,
-      "acc_norm_stderr": 0.025467107178386465
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_gpt2.json b/evals/arc-challenge/arc_te_challenge_gpt2.json
deleted file mode 100644
index c6b5f06c5f92b644a3c4ac037330810277460f0a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.22627737226277372,
-      "acc_stderr": 0.02532397574413385,
-      "acc_norm": 0.24087591240875914,
-      "acc_norm_stderr": 0.025880445559939215
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_te_challenge_llama-7B.json b/evals/arc-challenge/arc_te_challenge_llama-7B.json
deleted file mode 100644
index a20fb71e7ce5932ff220ab3a23466714b469cd51..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_te_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_te_challenge": {
-      "acc": 0.24087591240875914,
-      "acc_stderr": 0.025880445559939215,
-      "acc_norm": 0.26277372262773724,
-      "acc_norm_stderr": 0.026638517193281797
-    }
-  },
-  "versions": {
-    "arc_te_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json b/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json
deleted file mode 100644
index 72eee1e288b03359fecf649039ec7e1a796086ee..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.24579124579124578,
-      "acc_stderr": 0.025025521384235305,
-      "acc_norm": 0.28619528619528617,
-      "acc_norm_stderr": 0.026270908298354635
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-560.json b/evals/arc-challenge/arc_uk_challenge_bloom-560.json
deleted file mode 100644
index ef5e9d5a99c327e81413b16eb715a91e70b6c5b3..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.265993265993266,
-      "acc_stderr": 0.02568262955665285,
-      "acc_norm": 0.2895622895622896,
-      "acc_norm_stderr": 0.026362594432681956
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json b/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json
deleted file mode 100644
index 3c2cc6b833fb7540bcca14af70e018d3eb236524..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.02416437978893547,
-      "acc_norm": 0.265993265993266,
-      "acc_norm_stderr": 0.02568262955665285
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-large.json b/evals/arc-challenge/arc_uk_challenge_gpt2-large.json
deleted file mode 100644
index c03f6ddf265c02f0fc83f91f5c16d2586666d682..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.23232323232323232,
-      "acc_stderr": 0.02454650495612789,
-      "acc_norm": 0.27946127946127947,
-      "acc_norm_stderr": 0.026082164400369843
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json b/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json
deleted file mode 100644
index 51083b7158f2de8700c8c253b7e5e98eba1626a9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.2222222222222222,
-      "acc_stderr": 0.02416437978893546,
-      "acc_norm": 0.265993265993266,
-      "acc_norm_stderr": 0.02568262955665285
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_gpt2.json b/evals/arc-challenge/arc_uk_challenge_gpt2.json
deleted file mode 100644
index e32104934ab1fe23828d680bf766e04e93ea044a..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.21212121212121213,
-      "acc_stderr": 0.023761611918761662,
-      "acc_norm": 0.24242424242424243,
-      "acc_norm_stderr": 0.02490893747050876
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_uk_challenge_llama-7B.json b/evals/arc-challenge/arc_uk_challenge_llama-7B.json
deleted file mode 100644
index a02491cf171678a4ddc940caa47d4c778b0e3cf5..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_uk_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_uk_challenge": {
-      "acc": 0.30976430976430974,
-      "acc_stderr": 0.026876241779014095,
-      "acc_norm": 0.3367003367003367,
-      "acc_norm_stderr": 0.027468238412892212
-    }
-  },
-  "versions": {
-    "arc_uk_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json b/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json
deleted file mode 100644
index 508c46f8cd77b71773ecc8623d362eae91a1dc3f..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.24496644295302014,
-      "acc_stderr": 0.024955035980898942,
-      "acc_norm": 0.28187919463087246,
-      "acc_norm_stderr": 0.026106703750007423
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-560.json b/evals/arc-challenge/arc_vi_challenge_bloom-560.json
deleted file mode 100644
index 70d9cffdbf7b3adea2bbded15e8a36d7f930b24b..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.2483221476510067,
-      "acc_stderr": 0.025069483148037874,
-      "acc_norm": 0.25838926174496646,
-      "acc_norm_stderr": 0.025400777524610105
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json b/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json
deleted file mode 100644
index f1588613ea4565257bfb7f46328c5e696a1434de..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.3087248322147651,
-      "acc_stderr": 0.02680606307294056,
-      "acc_norm": 0.3288590604026846,
-      "acc_norm_stderr": 0.02726048303556786
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-large.json b/evals/arc-challenge/arc_vi_challenge_gpt2-large.json
deleted file mode 100644
index c071ea16496ed3627a0dc0840835a827894a8a61..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.18120805369127516,
-      "acc_stderr": 0.02235101779623446,
-      "acc_norm": 0.23825503355704697,
-      "acc_norm_stderr": 0.024719951493159628
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json b/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json
deleted file mode 100644
index 0cb1f34c59a21cb916520b7e956a1bd193ba1395..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.2080536912751678,
-      "acc_stderr": 0.023553603370264103,
-      "acc_norm": 0.23825503355704697,
-      "acc_norm_stderr": 0.024719951493159628
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_gpt2.json b/evals/arc-challenge/arc_vi_challenge_gpt2.json
deleted file mode 100644
index 6f912cfc57fb3d8efe3773d82b7a95532a6f69b0..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.2080536912751678,
-      "acc_stderr": 0.0235536033702641,
-      "acc_norm": 0.2080536912751678,
-      "acc_norm_stderr": 0.023553603370264124
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_vi_challenge_llama-7B.json b/evals/arc-challenge/arc_vi_challenge_llama-7B.json
deleted file mode 100644
index 8427c0ad1958ea7ad114255f020f43c5d50d076c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_vi_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_vi_challenge": {
-      "acc": 0.1912751677852349,
-      "acc_stderr": 0.022821882255340997,
-      "acc_norm": 0.2516778523489933,
-      "acc_norm_stderr": 0.025181904610615855
-    }
-  },
-  "versions": {
-    "arc_vi_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json b/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json
deleted file mode 100644
index 4626e7c607b4dd4f9c82472abe983c30203c245c..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_bloom-1b7.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.25252525252525254,
-      "acc_stderr": 0.025252525252525356,
-      "acc_norm": 0.25925925925925924,
-      "acc_norm_stderr": 0.025471492792791674
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-560.json b/evals/arc-challenge/arc_zh_challenge_bloom-560.json
deleted file mode 100644
index 127c0ce8f0b322902ecae312152c6905394bf82e..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_bloom-560.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.24242424242424243,
-      "acc_stderr": 0.024908937470508753,
-      "acc_norm": 0.26936026936026936,
-      "acc_norm_stderr": 0.025785321789052268
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json b/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json
deleted file mode 100644
index b488311a8cccbd9e611c8abe983c979453acd882..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.3400673400673401,
-      "acc_stderr": 0.027535084762190663,
-      "acc_norm": 0.367003367003367,
-      "acc_norm_stderr": 0.028014951100692458
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-large.json b/evals/arc-challenge/arc_zh_challenge_gpt2-large.json
deleted file mode 100644
index b20ff9d4fb351205e7abdc821a99a7a9c62aa9c6..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_gpt2-large.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.21548821548821548,
-      "acc_stderr": 0.023898224834697,
-      "acc_norm": 0.24915824915824916,
-      "acc_norm_stderr": 0.025140041284626418
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json b/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json
deleted file mode 100644
index fe9d9b64694a7c0355b5de8e14577532c3e16db0..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_gpt2-medium.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.21548821548821548,
-      "acc_stderr": 0.023898224834697005,
-      "acc_norm": 0.23232323232323232,
-      "acc_norm_stderr": 0.02454650495612789
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_gpt2.json b/evals/arc-challenge/arc_zh_challenge_gpt2.json
deleted file mode 100644
index d8da342e3dfff17d37f9f34a3f90753cb4850243..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_gpt2.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.20875420875420875,
-      "acc_stderr": 0.023622587756271476,
-      "acc_norm": 0.22895622895622897,
-      "acc_norm_stderr": 0.02442136264227106
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_zh_challenge_llama-7B.json b/evals/arc-challenge/arc_zh_challenge_llama-7B.json
deleted file mode 100644
index 51e82fa68d852ff2bafe284c29d895d2422b66e9..0000000000000000000000000000000000000000
--- a/evals/arc-challenge/arc_zh_challenge_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "arc_zh_challenge": {
-      "acc": 0.2558922558922559,
-      "acc_stderr": 0.02536300037580196,
-      "acc_norm": 0.27946127946127947,
-      "acc_norm_stderr": 0.026082164400369843
-    }
-  },
-  "versions": {
-    "arc_zh_challenge": 0
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json b/evals/arc/arc_ar-bloom-7b1.json
similarity index 54%
rename from evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json
rename to evals/arc/arc_ar-bloom-7b1.json
index 7a9be337308c1b4de36187d0139341115ab5acc1..66c115459f73a74be6bd4b1b3933509010a82342 100644
--- a/evals/truthfulqa-mc/truthfulqa_sr_mc_bloom-7b1.json
+++ b/evals/arc/arc_ar-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_sr_mc": {
-      "mc1": 0.2875318066157761,
-      "mc1_stderr": 0.016154400981864346,
-      "mc2": 0.4611856949025646,
-      "mc2_stderr": 0.01648960635223338
+    "arc_ar": {
+      "acc": 0.2634730538922156,
+      "acc_stderr": 0.012889646336321774,
+      "acc_norm": 0.31394354148845166,
+      "acc_norm_stderr": 0.013579515768185788
     }
   },
   "versions": {
-    "truthfulqa_sr_mc": 1
+    "arc_ar": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json b/evals/arc/arc_ar-llama-7B.json
similarity index 56%
rename from evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json
rename to evals/arc/arc_ar-llama-7B.json
index 547c3b78ee0caef9b096972901f0b3d40c939029..31293a19637055f69dbf3fb11cadfd2fde391402 100644
--- a/evals/truthfulqa-mc/truthfulqa_ne_mc_llama-7B.json
+++ b/evals/arc/arc_ar-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_ne_mc": {
-      "mc1": 0.2906091370558376,
-      "mc1_stderr": 0.016184901529011933,
-      "mc2": 0.466774725144191,
-      "mc2_stderr": 0.01677791483100084
+    "arc_ar": {
+      "acc": 0.19760479041916168,
+      "acc_stderr": 0.011651221980953499,
+      "acc_norm": 0.24636441402908468,
+      "acc_norm_stderr": 0.012608059960468694
     }
   },
   "versions": {
-    "truthfulqa_ne_mc": 1
+    "arc_ar": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json b/evals/arc/arc_bn-bloom-7b1.json
similarity index 54%
rename from evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json
rename to evals/arc/arc_bn-bloom-7b1.json
index 9bb50aa50589d9959d2accbb09d2d099246f74e5..b7b877a4a649f59197b24de7b3ec917785979683 100644
--- a/evals/truthfulqa-mc/truthfulqa_sk_mc_bloom-7b1.json
+++ b/evals/arc/arc_bn-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_sk_mc": {
-      "mc1": 0.23846153846153847,
-      "mc1_stderr": 0.015268148070057835,
-      "mc2": 0.4379856829317774,
-      "mc2_stderr": 0.016560323561497736
+    "arc_bn": {
+      "acc": 0.22412318220701455,
+      "acc_stderr": 0.012201644195165715,
+      "acc_norm": 0.2617621899059025,
+      "acc_norm_stderr": 0.012862641889254466
     }
   },
   "versions": {
-    "truthfulqa_sk_mc": 1
+    "arc_bn": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=bigscience/bloom-7b1",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json b/evals/arc/arc_bn-llama-7B.json
similarity index 56%
rename from evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json
rename to evals/arc/arc_bn-llama-7B.json
index 5f817545d204b5083023e5456ee8029ce2191005..1dafcad0f0dbcae9d42395e2697e1ddc5c1ba0c2 100644
--- a/evals/truthfulqa-mc/truthfulqa_ar_mc_llama-7B.json
+++ b/evals/arc/arc_bn-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_ar_mc": {
-      "mc1": 0.2777777777777778,
-      "mc1_stderr": 0.016109958670672858,
-      "mc2": 0.4504998624708924,
-      "mc2_stderr": 0.01620052408197046
+    "arc_bn": {
+      "acc": 0.1899059024807528,
+      "acc_stderr": 0.011476660752315397,
+      "acc_norm": 0.2583404619332763,
+      "acc_norm_stderr": 0.012807875214816267
     }
   },
   "versions": {
-    "truthfulqa_ar_mc": 1
+    "arc_bn": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/arc/arc_ca-bloom-7b1.json b/evals/arc/arc_ca-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..f0a15e06750a49e5570198c619957cce3e35cf0c
--- /dev/null
+++ b/evals/arc/arc_ca-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ca": {
+      "acc": 0.31989708404802747,
+      "acc_stderr": 0.01366562491926326,
+      "acc_norm": 0.34734133790737565,
+      "acc_norm_stderr": 0.013949489903701517
+    }
+  },
+  "versions": {
+    "arc_ca": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json b/evals/arc/arc_ca-llama-7B.json
similarity index 56%
rename from evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json
rename to evals/arc/arc_ca-llama-7B.json
index dd6e11c0a02074e790f1099cbbeb59e13a69f2e1..f0e3b53912555842b913d4cc78b61de1b70a2380 100644
--- a/evals/truthfulqa-mc/truthfulqa_ca_mc_llama-7B.json
+++ b/evals/arc/arc_ca-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_ca_mc": {
-      "mc1": 0.2336328626444159,
-      "mc1_stderr": 0.015170350095728855,
-      "mc2": 0.388488309525287,
-      "mc2_stderr": 0.015026705835089502
+    "arc_ca": {
+      "acc": 0.3276157804459691,
+      "acc_stderr": 0.01375080741597368,
+      "acc_norm": 0.3507718696397942,
+      "acc_norm_stderr": 0.013981316936172217
     }
   },
   "versions": {
-    "truthfulqa_ca_mc": 1
+    "arc_ca": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/arc/arc_da-bloom-7b1.json b/evals/arc/arc_da-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1f4e588f7cac0716c4285f186e6d2aa122ee795d
--- /dev/null
+++ b/evals/arc/arc_da-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_da": {
+      "acc": 0.20137103684661525,
+      "acc_stderr": 0.011744154502532795,
+      "acc_norm": 0.24592973436161097,
+      "acc_norm_stderr": 0.012611366681285752
+    }
+  },
+  "versions": {
+    "arc_da": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_da-llama-7B.json b/evals/arc/arc_da-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..814a2fb017691ccd12afbf034c490e10a646843e
--- /dev/null
+++ b/evals/arc/arc_da-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_da": {
+      "acc": 0.286203941730934,
+      "acc_stderr": 0.013236574332463879,
+      "acc_norm": 0.3273350471293916,
+      "acc_norm_stderr": 0.013741887176251822
+    }
+  },
+  "versions": {
+    "arc_da": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_de-bloom-7b1.json b/evals/arc/arc_de-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..205cbe1e5a60177701994fa2eca97338da50bd02
--- /dev/null
+++ b/evals/arc/arc_de-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_de": {
+      "acc": 0.22241231822070145,
+      "acc_stderr": 0.012168377742629776,
+      "acc_norm": 0.262617621899059,
+      "acc_norm_stderr": 0.01287617552045283
+    }
+  },
+  "versions": {
+    "arc_de": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_de-llama-7B.json b/evals/arc/arc_de-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..f13cfc00bfd0ac6e8b6e48a5c0bc3b99c3140b69
--- /dev/null
+++ b/evals/arc/arc_de-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_de": {
+      "acc": 0.2951240376390077,
+      "acc_stderr": 0.013345572865502645,
+      "acc_norm": 0.35072711719418304,
+      "acc_norm_stderr": 0.013962940383743043
+    }
+  },
+  "versions": {
+    "arc_de": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_es-bloom-7b1.json b/evals/arc/arc_es-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..39a5c5211ff20ef49014baa232a8ea2a9d8884be
--- /dev/null
+++ b/evals/arc/arc_es-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_es": {
+      "acc": 0.3316239316239316,
+      "acc_stderr": 0.013769752111910177,
+      "acc_norm": 0.3811965811965812,
+      "acc_norm_stderr": 0.01420507709573084
+    }
+  },
+  "versions": {
+    "arc_es": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_es-llama-7B.json b/evals/arc/arc_es-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..11544ff8942a30c3fb128aa473ea30d88443b0e6
--- /dev/null
+++ b/evals/arc/arc_es-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_es": {
+      "acc": 0.3606837606837607,
+      "acc_stderr": 0.014044746572948867,
+      "acc_norm": 0.3683760683760684,
+      "acc_norm_stderr": 0.014108074259155369
+    }
+  },
+  "versions": {
+    "arc_es": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_eu-bloom-7b1.json b/evals/arc/arc_eu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..156fd60ab449125d255226262654e5337e4cb697
--- /dev/null
+++ b/evals/arc/arc_eu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_eu": {
+      "acc": 0.22056239015817222,
+      "acc_stderr": 0.01229634886589257,
+      "acc_norm": 0.2521968365553603,
+      "acc_norm_stderr": 0.012879032347922939
+    }
+  },
+  "versions": {
+    "arc_eu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_eu-llama-7B.json b/evals/arc/arc_eu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..10a039f055cb172c7978f840a54bec6cc724948c
--- /dev/null
+++ b/evals/arc/arc_eu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_eu": {
+      "acc": 0.20738137082601055,
+      "acc_stderr": 0.012023662461166562,
+      "acc_norm": 0.2451669595782074,
+      "acc_norm_stderr": 0.012757811738008544
+    }
+  },
+  "versions": {
+    "arc_eu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_sv_challenge_gpt2.json b/evals/arc/arc_fr-bloom-7b1.json
similarity index 50%
rename from evals/arc-challenge/arc_sv_challenge_gpt2.json
rename to evals/arc/arc_fr-bloom-7b1.json
index 718a97a6d9df935c9f0818257fda43ef3bfc7996..78cbf1e3cfc337f169be33735f919ab397b8d085 100644
--- a/evals/arc-challenge/arc_sv_challenge_gpt2.json
+++ b/evals/arc/arc_fr-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_sv_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.024293999292957367,
-      "acc_norm": 0.2356902356902357,
-      "acc_norm_stderr": 0.024669460034907637
+    "arc_fr": {
+      "acc": 0.32677502138579984,
+      "acc_stderr": 0.01372407602199982,
+      "acc_norm": 0.3669803250641574,
+      "acc_norm_stderr": 0.014102904772197396
     }
   },
   "versions": {
-    "arc_sv_challenge": 0
+    "arc_fr": 0
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=gpt2",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/arc/arc_fr-llama-7B.json b/evals/arc/arc_fr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..c79866a45e043e6b6e5e139f5ac63dfb8b522f27
--- /dev/null
+++ b/evals/arc/arc_fr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_fr": {
+      "acc": 0.3473053892215569,
+      "acc_stderr": 0.013931226499492353,
+      "acc_norm": 0.3729683490162532,
+      "acc_norm_stderr": 0.014150093168782438
+    }
+  },
+  "versions": {
+    "arc_fr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_gu-bloom-7b1.json b/evals/arc/arc_gu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c78878020cb8341b5adb388627ffa309dde3ad3a
--- /dev/null
+++ b/evals/arc/arc_gu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_gu": {
+      "acc": 0.2206896551724138,
+      "acc_stderr": 0.012181604374453973,
+      "acc_norm": 0.2336206896551724,
+      "acc_norm_stderr": 0.012428989430945793
+    }
+  },
+  "versions": {
+    "arc_gu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_gu-llama-7B.json b/evals/arc/arc_gu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..afadd880b353d2482c13ab85d24811ac5ea5fd57
--- /dev/null
+++ b/evals/arc/arc_gu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_gu": {
+      "acc": 0.2120689655172414,
+      "acc_stderr": 0.012007177871292825,
+      "acc_norm": 0.23189655172413792,
+      "acc_norm_stderr": 0.012396962423413033
+    }
+  },
+  "versions": {
+    "arc_gu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hi-bloom-7b1.json b/evals/arc/arc_hi-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..70136df6c1f9731ab888c323fa0128c0beb43524
--- /dev/null
+++ b/evals/arc/arc_hi-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hi": {
+      "acc": 0.2363013698630137,
+      "acc_stderr": 0.012435369590403731,
+      "acc_norm": 0.2919520547945205,
+      "acc_norm_stderr": 0.013309191484613488
+    }
+  },
+  "versions": {
+    "arc_hi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hi-llama-7B.json b/evals/arc/arc_hi-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddcd58ade570221ad656710d0944a241789b1d8b
--- /dev/null
+++ b/evals/arc/arc_hi-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hi": {
+      "acc": 0.21232876712328766,
+      "acc_stderr": 0.011971304657273123,
+      "acc_norm": 0.25,
+      "acc_norm_stderr": 0.012675503164084846
+    }
+  },
+  "versions": {
+    "arc_hi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hr-bloom-7b1.json b/evals/arc/arc_hr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..80efc06ef94471b0b04935089a967e72d9e2095e
--- /dev/null
+++ b/evals/arc/arc_hr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hr": {
+      "acc": 0.19332763045337895,
+      "acc_stderr": 0.011555111310342437,
+      "acc_norm": 0.2369546621043627,
+      "acc_norm_stderr": 0.012441890624187792
+    }
+  },
+  "versions": {
+    "arc_hr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hr-llama-7B.json b/evals/arc/arc_hr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..9c50fa3252a0133486190ed9d5cbc497e1a17fe9
--- /dev/null
+++ b/evals/arc/arc_hr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hr": {
+      "acc": 0.2754491017964072,
+      "acc_stderr": 0.01307174925264165,
+      "acc_norm": 0.330196749358426,
+      "acc_norm_stderr": 0.013760638974726852
+    }
+  },
+  "versions": {
+    "arc_hr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hu-bloom-7b1.json b/evals/arc/arc_hu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c7e8773a07af63cf8522b314bbd0611c37c7b98
--- /dev/null
+++ b/evals/arc/arc_hu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hu": {
+      "acc": 0.1969178082191781,
+      "acc_stderr": 0.011640913614197496,
+      "acc_norm": 0.2585616438356164,
+      "acc_norm_stderr": 0.0128169339627777
+    }
+  },
+  "versions": {
+    "arc_hu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hu-llama-7B.json b/evals/arc/arc_hu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac3191180768a88cd6c937d51bf005adb11c7ccf
--- /dev/null
+++ b/evals/arc/arc_hu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hu": {
+      "acc": 0.2517123287671233,
+      "acc_stderr": 0.012704310825494622,
+      "acc_norm": 0.2979452054794521,
+      "acc_norm_stderr": 0.013388079339102703
+    }
+  },
+  "versions": {
+    "arc_hu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hy-bloom-7b1.json b/evals/arc/arc_hy-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d138545e18f6bb49f13d11bd9cd3b515db23815b
--- /dev/null
+++ b/evals/arc/arc_hy-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hy": {
+      "acc": 0.21181818181818182,
+      "acc_stderr": 0.01232525683396216,
+      "acc_norm": 0.26181818181818184,
+      "acc_norm_stderr": 0.013261197012809796
+    }
+  },
+  "versions": {
+    "arc_hy": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_hy-llama-7B.json b/evals/arc/arc_hy-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..35e46c981f8bc3bf9374fdf6ad4b483f4c65762b
--- /dev/null
+++ b/evals/arc/arc_hy-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_hy": {
+      "acc": 0.19454545454545455,
+      "acc_stderr": 0.011940766785664334,
+      "acc_norm": 0.2718181818181818,
+      "acc_norm_stderr": 0.013420241182110736
+    }
+  },
+  "versions": {
+    "arc_hy": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_id-bloom-7b1.json b/evals/arc/arc_id-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a2cc8cf230eda88935959ff54b9ded1986940b84
--- /dev/null
+++ b/evals/arc/arc_id-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_id": {
+      "acc": 0.3128205128205128,
+      "acc_stderr": 0.013560492090917607,
+      "acc_norm": 0.3598290598290598,
+      "acc_norm_stderr": 0.014037469945597791
+    }
+  },
+  "versions": {
+    "arc_id": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_id-llama-7B.json b/evals/arc/arc_id-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..59fcc7ff10a29c0f82833ce5df7a260a8d4bbd42
--- /dev/null
+++ b/evals/arc/arc_id-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_id": {
+      "acc": 0.19316239316239317,
+      "acc_stderr": 0.011546413314069014,
+      "acc_norm": 0.26666666666666666,
+      "acc_norm_stderr": 0.012933850109759573
+    }
+  },
+  "versions": {
+    "arc_id": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_it-bloom-7b1.json b/evals/arc/arc_it-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..7eda117416da15b68b1713aa6ef9ff77e69fd826
--- /dev/null
+++ b/evals/arc/arc_it-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_it": {
+      "acc": 0.24037639007698888,
+      "acc_stderr": 0.01250327289928353,
+      "acc_norm": 0.28999144568006846,
+      "acc_norm_stderr": 0.01327709194338097
+    }
+  },
+  "versions": {
+    "arc_it": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_it-llama-7B.json b/evals/arc/arc_it-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..76b8875276c1b0078d3d087c16397df3b3ea9200
--- /dev/null
+++ b/evals/arc/arc_it-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_it": {
+      "acc": 0.31736526946107785,
+      "acc_stderr": 0.013619227292898307,
+      "acc_norm": 0.3575705731394354,
+      "acc_norm_stderr": 0.014024008839912006
+    }
+  },
+  "versions": {
+    "arc_it": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_kn-bloom-7b1.json b/evals/arc/arc_kn-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e92b7d0d555bc117110f34dbbc68d327f5092f5f
--- /dev/null
+++ b/evals/arc/arc_kn-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_kn": {
+      "acc": 0.2221254355400697,
+      "acc_stderr": 0.012273607270054452,
+      "acc_norm": 0.24738675958188153,
+      "acc_norm_stderr": 0.012740675198098838
+    }
+  },
+  "versions": {
+    "arc_kn": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_kn-llama-7B.json b/evals/arc/arc_kn-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..39ae5661b6403f677d4427689194c417f1f2f8b5
--- /dev/null
+++ b/evals/arc/arc_kn-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_kn": {
+      "acc": 0.20470383275261325,
+      "acc_stderr": 0.011913674295957856,
+      "acc_norm": 0.24738675958188153,
+      "acc_norm_stderr": 0.012740675198098834
+    }
+  },
+  "versions": {
+    "arc_kn": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ml-bloom-7b1.json b/evals/arc/arc_ml-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7c83104b2f7701b8a7af344179886c58a0e89a0
--- /dev/null
+++ b/evals/arc/arc_ml-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ml": {
+      "acc": 0.2075306479859895,
+      "acc_stderr": 0.01200575665793095,
+      "acc_norm": 0.2635726795096322,
+      "acc_norm_stderr": 0.013042844591075362
+    }
+  },
+  "versions": {
+    "arc_ml": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ml-llama-7B.json b/evals/arc/arc_ml-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..fc465c13860754471e99430d5e6c5e1df5046b2e
--- /dev/null
+++ b/evals/arc/arc_ml-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ml": {
+      "acc": 0.21628721541155868,
+      "acc_stderr": 0.012188522634632977,
+      "acc_norm": 0.27845884413309985,
+      "acc_norm_stderr": 0.013269918016014967
+    }
+  },
+  "versions": {
+    "arc_ml": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_it_challenge_gpt2-medium.json b/evals/arc/arc_mr-bloom-7b1.json
similarity index 51%
rename from evals/arc-challenge/arc_it_challenge_gpt2-medium.json
rename to evals/arc/arc_mr-bloom-7b1.json
index 2663af9d466539843f48e70d58dd9a236db69c79..cb854d6690652622f9f24d8c241c70b1cab749f9 100644
--- a/evals/arc-challenge/arc_it_challenge_gpt2-medium.json
+++ b/evals/arc/arc_mr-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_it_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.02429399929295737,
+    "arc_mr": {
+      "acc": 0.23376623376623376,
+      "acc_stderr": 0.012458582396003653,
       "acc_norm": 0.2727272727272727,
-      "acc_norm_stderr": 0.025886127156886297
+      "acc_norm_stderr": 0.013110221561502926
     }
   },
   "versions": {
-    "arc_it_challenge": 0
+    "arc_mr": 0
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=gpt2-medium",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/arc/arc_mr-llama-7B.json b/evals/arc/arc_mr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..0755f8ce24bf655025ef6eb6414570573beb9858
--- /dev/null
+++ b/evals/arc/arc_mr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_mr": {
+      "acc": 0.2051948051948052,
+      "acc_stderr": 0.011888050053276677,
+      "acc_norm": 0.2545454545454545,
+      "acc_norm_stderr": 0.012823020964319998
+    }
+  },
+  "versions": {
+    "arc_mr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ne-bloom-7b1.json b/evals/arc/arc_ne-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..8642b825a874e720a4bb8c0f92ff6fc304357c9f
--- /dev/null
+++ b/evals/arc/arc_ne-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ne": {
+      "acc": 0.21300256629597947,
+      "acc_stderr": 0.01198002307808546,
+      "acc_norm": 0.223267750213858,
+      "acc_norm_stderr": 0.012185048029719049
+    }
+  },
+  "versions": {
+    "arc_ne": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ne-llama-7B.json b/evals/arc/arc_ne-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e20341882d82d53d339ccb9e726250d842765069
--- /dev/null
+++ b/evals/arc/arc_ne-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ne": {
+      "acc": 0.2172797262617622,
+      "acc_stderr": 0.012066782166932105,
+      "acc_norm": 0.24294268605645852,
+      "acc_norm_stderr": 0.012548588352773893
+    }
+  },
+  "versions": {
+    "arc_ne": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_nl-bloom-7b1.json b/evals/arc/arc_nl-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..70e6704be7426e916dc20cd9645eb2e99bb6b03a
--- /dev/null
+++ b/evals/arc/arc_nl-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_nl": {
+      "acc": 0.1881950384944397,
+      "acc_stderr": 0.011436905010368727,
+      "acc_norm": 0.2309666381522669,
+      "acc_norm_stderr": 0.012331780770152612
+    }
+  },
+  "versions": {
+    "arc_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ne_challenge_llama-7B.json b/evals/arc/arc_nl-llama-7B.json
similarity index 61%
rename from evals/arc-challenge/arc_ne_challenge_llama-7B.json
rename to evals/arc/arc_nl-llama-7B.json
index a22c844ed32434eb2d404f76e104c502e7218625..6258adbcef55b321a04101c65d1c369f53c6cc09 100644
--- a/evals/arc-challenge/arc_ne_challenge_llama-7B.json
+++ b/evals/arc/arc_nl-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_ne_challenge": {
-      "acc": 0.2255892255892256,
-      "acc_stderr": 0.024293999292957367,
-      "acc_norm": 0.265993265993266,
-      "acc_norm_stderr": 0.025682629556652858
+    "arc_nl": {
+      "acc": 0.32677502138579984,
+      "acc_stderr": 0.013724076021999824,
+      "acc_norm": 0.3361847733105218,
+      "acc_norm_stderr": 0.013822646555385164
     }
   },
   "versions": {
-    "arc_ne_challenge": 0
+    "arc_nl": 0
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/arc/arc_pt-bloom-7b1.json b/evals/arc/arc_pt-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..880d8570463408853523eec06407b3c8ed9e5b11
--- /dev/null
+++ b/evals/arc/arc_pt-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_pt": {
+      "acc": 0.3401709401709402,
+      "acc_stderr": 0.013856612397310694,
+      "acc_norm": 0.4,
+      "acc_norm_stderr": 0.014328422047021531
+    }
+  },
+  "versions": {
+    "arc_pt": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_pt-llama-7B.json b/evals/arc/arc_pt-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a856face8fef0cab72d3cda7305f6949d011ce3
--- /dev/null
+++ b/evals/arc/arc_pt-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_pt": {
+      "acc": 0.3367521367521368,
+      "acc_stderr": 0.01382247630777062,
+      "acc_norm": 0.37777777777777777,
+      "acc_norm_stderr": 0.014180244103534094
+    }
+  },
+  "versions": {
+    "arc_pt": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ro-bloom-7b1.json b/evals/arc/arc_ro-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..083766c1f50d79393939908a8f8837dcc7cb697d
--- /dev/null
+++ b/evals/arc/arc_ro-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ro": {
+      "acc": 0.2099400171379606,
+      "acc_stderr": 0.011926921791273557,
+      "acc_norm": 0.26906598114824337,
+      "acc_norm_stderr": 0.012987310039914976
+    }
+  },
+  "versions": {
+    "arc_ro": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ro-llama-7B.json b/evals/arc/arc_ro-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..eab2e4a70b967696417355b0d11bd69cabf3ddc5
--- /dev/null
+++ b/evals/arc/arc_ro-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ro": {
+      "acc": 0.30077120822622105,
+      "acc_stderr": 0.013430077114209907,
+      "acc_norm": 0.32390745501285345,
+      "acc_norm_stderr": 0.013704533924425027
+    }
+  },
+  "versions": {
+    "arc_ro": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ru-bloom-7b1.json b/evals/arc/arc_ru-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9ed6089fca2642658a8e6f9f74471739e87e6
--- /dev/null
+++ b/evals/arc/arc_ru-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ru": {
+      "acc": 0.21043627031650983,
+      "acc_stderr": 0.01192703439080346,
+      "acc_norm": 0.2754491017964072,
+      "acc_norm_stderr": 0.01307174925264165
+    }
+  },
+  "versions": {
+    "arc_ru": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ru-llama-7B.json b/evals/arc/arc_ru-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..f62854eef188594fdc60a93341410fac7a49fa14
--- /dev/null
+++ b/evals/arc/arc_ru-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ru": {
+      "acc": 0.2934131736526946,
+      "acc_stderr": 0.013322973103306575,
+      "acc_norm": 0.32078699743370404,
+      "acc_norm_stderr": 0.013658089444975752
+    }
+  },
+  "versions": {
+    "arc_ru": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sk-bloom-7b1.json b/evals/arc/arc_sk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..4404e57e2290a69cce8029b89f0939593bbe7d8e
--- /dev/null
+++ b/evals/arc/arc_sk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sk": {
+      "acc": 0.20359281437125748,
+      "acc_stderr": 0.011782227020010716,
+      "acc_norm": 0.24893071000855432,
+      "acc_norm_stderr": 0.012651960282598879
+    }
+  },
+  "versions": {
+    "arc_sk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sk-llama-7B.json b/evals/arc/arc_sk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..b018df9a5453495bb3ff51f8908c88c064d888a4
--- /dev/null
+++ b/evals/arc/arc_sk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sk": {
+      "acc": 0.23609923011120615,
+      "acc_stderr": 0.012426371635795894,
+      "acc_norm": 0.28999144568006846,
+      "acc_norm_stderr": 0.013277091943380979
+    }
+  },
+  "versions": {
+    "arc_sk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sr-bloom-7b1.json b/evals/arc/arc_sr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca68a7fae3c2920f66e9f6948396528ea7efe421
--- /dev/null
+++ b/evals/arc/arc_sr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sr": {
+      "acc": 0.2172797262617622,
+      "acc_stderr": 0.012066782166932079,
+      "acc_norm": 0.25149700598802394,
+      "acc_norm_stderr": 0.01269526466186626
+    }
+  },
+  "versions": {
+    "arc_sr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sr-llama-7B.json b/evals/arc/arc_sr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..dbe0e415ecd651a7afbe25423df0f79ddbf30b59
--- /dev/null
+++ b/evals/arc/arc_sr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sr": {
+      "acc": 0.25748502994011974,
+      "acc_stderr": 0.012794024494042348,
+      "acc_norm": 0.30795551753635586,
+      "acc_norm_stderr": 0.013507954174822524
+    }
+  },
+  "versions": {
+    "arc_sr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sv-bloom-7b1.json b/evals/arc/arc_sv-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e602b4d12926dbb93b567be032a836cb50b2ff51
--- /dev/null
+++ b/evals/arc/arc_sv-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sv": {
+      "acc": 0.20515021459227467,
+      "acc_stderr": 0.011835920197074948,
+      "acc_norm": 0.2515021459227468,
+      "acc_norm_stderr": 0.012717145410329311
+    }
+  },
+  "versions": {
+    "arc_sv": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_sv-llama-7B.json b/evals/arc/arc_sv-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..3cacd9bbf330b2d6be85b2903f5d124c0045cc94
--- /dev/null
+++ b/evals/arc/arc_sv-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_sv": {
+      "acc": 0.303862660944206,
+      "acc_stderr": 0.013480613043590443,
+      "acc_norm": 0.34935622317596565,
+      "acc_norm_stderr": 0.013974278424227307
+    }
+  },
+  "versions": {
+    "arc_sv": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ta-bloom-7b1.json b/evals/arc/arc_ta-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..68a6f4875888d86505752626ba4a52fd12cc3c84
--- /dev/null
+++ b/evals/arc/arc_ta-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ta": {
+      "acc": 0.22942206654991243,
+      "acc_stderr": 0.01244752638770244,
+      "acc_norm": 0.24168126094570927,
+      "acc_norm_stderr": 0.012673733216040754
+    }
+  },
+  "versions": {
+    "arc_ta": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_ta-llama-7B.json b/evals/arc/arc_ta-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7c697739212d1bec5e84f1a4e6f0017d500ecc7
--- /dev/null
+++ b/evals/arc/arc_ta-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_ta": {
+      "acc": 0.2075306479859895,
+      "acc_stderr": 0.012005756657930957,
+      "acc_norm": 0.27495621716287216,
+      "acc_norm_stderr": 0.013218161880960047
+    }
+  },
+  "versions": {
+    "arc_ta": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_te-bloom-7b1.json b/evals/arc/arc_te-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1be31afe5307f0b3c626e305437b1932d4457b68
--- /dev/null
+++ b/evals/arc/arc_te-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_te": {
+      "acc": 0.20175438596491227,
+      "acc_stderr": 0.01189098690363561,
+      "acc_norm": 0.24298245614035088,
+      "acc_norm_stderr": 0.01270803987901337
+    }
+  },
+  "versions": {
+    "arc_te": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_te-llama-7B.json b/evals/arc/arc_te-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..f84a1b907c92965f5829cbd68e89759d2d1ef9d7
--- /dev/null
+++ b/evals/arc/arc_te-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_te": {
+      "acc": 0.2026315789473684,
+      "acc_stderr": 0.011910259341316062,
+      "acc_norm": 0.2517543859649123,
+      "acc_norm_stderr": 0.012860230436368953
+    }
+  },
+  "versions": {
+    "arc_te": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_uk-bloom-7b1.json b/evals/arc/arc_uk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..05233ff08727d5cac7dd74429dbc024eb5fd5f4f
--- /dev/null
+++ b/evals/arc/arc_uk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_uk": {
+      "acc": 0.1958939264328486,
+      "acc_stderr": 0.011613035012800898,
+      "acc_norm": 0.2275449101796407,
+      "acc_norm_stderr": 0.012267293637033645
+    }
+  },
+  "versions": {
+    "arc_uk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_uk-llama-7B.json b/evals/arc/arc_uk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..717afd73b3550c42e809f9bdb7fac834e805b5ee
--- /dev/null
+++ b/evals/arc/arc_uk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_uk": {
+      "acc": 0.28999144568006846,
+      "acc_stderr": 0.013277091943380968,
+      "acc_norm": 0.32934131736526945,
+      "acc_norm_stderr": 0.013751575689336035
+    }
+  },
+  "versions": {
+    "arc_uk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_vi-bloom-7b1.json b/evals/arc/arc_vi-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..4bc8e4783cc71214d4ba57feef30a0bfee5774c2
--- /dev/null
+++ b/evals/arc/arc_vi-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_vi": {
+      "acc": 0.28974358974358977,
+      "acc_stderr": 0.013268054405378885,
+      "acc_norm": 0.3367521367521368,
+      "acc_norm_stderr": 0.01382247630777062
+    }
+  },
+  "versions": {
+    "arc_vi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_vi-llama-7B.json b/evals/arc/arc_vi-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c14775b05df6587593cb1cbb921ee6ac86a8370
--- /dev/null
+++ b/evals/arc/arc_vi-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_vi": {
+      "acc": 0.20256410256410257,
+      "acc_stderr": 0.011754979539893694,
+      "acc_norm": 0.23675213675213674,
+      "acc_norm_stderr": 0.01243290160581911
+    }
+  },
+  "versions": {
+    "arc_vi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_zh-bloom-7b1.json b/evals/arc/arc_zh-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c4deb085367a11032bec8e265cc4cb91fe75a0f5
--- /dev/null
+++ b/evals/arc/arc_zh-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_zh": {
+      "acc": 0.3076923076923077,
+      "acc_stderr": 0.013498970320941413,
+      "acc_norm": 0.37264957264957266,
+      "acc_norm_stderr": 0.014141587247061969
+    }
+  },
+  "versions": {
+    "arc_zh": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc/arc_zh-llama-7B.json b/evals/arc/arc_zh-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..9cca2a2335f34f3b9eb36c125304f260fc3f8cd9
--- /dev/null
+++ b/evals/arc/arc_zh-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "arc_zh": {
+      "acc": 0.2564102564102564,
+      "acc_stderr": 0.012771065618749024,
+      "acc_norm": 0.2982905982905983,
+      "acc_norm_stderr": 0.013381080232166387
+    }
+  },
+  "versions": {
+    "arc_zh": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ar-bloom-7b1.json b/evals/mmlu/mmlu_ar-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b6e593af4922000fb94fdaab7a48477f593319ba
--- /dev/null
+++ b/evals/mmlu/mmlu_ar-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ar": {
+      "acc": 0.26531559405940597,
+      "acc_stderr": 0.0038831388933726414,
+      "acc_norm": 0.2754486386138614,
+      "acc_norm_stderr": 0.003929217133330591
+    }
+  },
+  "versions": {
+    "mmlu_ar": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_bn-bloom-7b1.json b/evals/mmlu/mmlu_bn-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..89c8ade0841c9df16a86355a7b703e726726acfa
--- /dev/null
+++ b/evals/mmlu/mmlu_bn-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_bn": {
+      "acc": 0.2671137646192852,
+      "acc_stderr": 0.004001512896559074,
+      "acc_norm": 0.28150813772797906,
+      "acc_norm_stderr": 0.004067374934957544
+    }
+  },
+  "versions": {
+    "mmlu_bn": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_bn_challenge_gpt2-large.json b/evals/mmlu/mmlu_ca-bloom-7b1.json
similarity index 52%
rename from evals/arc-challenge/arc_bn_challenge_gpt2-large.json
rename to evals/mmlu/mmlu_ca-bloom-7b1.json
index 6b36e33e7bf7866400a4c7d058836627255b75a8..b760f91f32565b551455d9bf715837b34540ec24 100644
--- a/evals/arc-challenge/arc_bn_challenge_gpt2-large.json
+++ b/evals/mmlu/mmlu_ca-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_bn_challenge": {
-      "acc": 0.2195945945945946,
-      "acc_stderr": 0.024102381106046785,
-      "acc_norm": 0.2668918918918919,
-      "acc_norm_stderr": 0.025753762926257924
+    "mmlu_ca": {
+      "acc": 0.2785041045910611,
+      "acc_stderr": 0.003908294722890792,
+      "acc_norm": 0.28785345089692915,
+      "acc_norm_stderr": 0.003947525835346328
     }
   },
   "versions": {
-    "arc_bn_challenge": 0
+    "mmlu_ca": 0
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=gpt2-large",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/mmlu/mmlu_da-bloom-7b1.json b/evals/mmlu/mmlu_da-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b81f4f5ab7529c0d7efd0c3b2c040d9e4643cc2
--- /dev/null
+++ b/evals/mmlu/mmlu_da-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_da": {
+      "acc": 0.2557170982886567,
+      "acc_stderr": 0.0037964676375075402,
+      "acc_norm": 0.2705588368923217,
+      "acc_norm_stderr": 0.003865954982495375
+    }
+  },
+  "versions": {
+    "mmlu_da": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_de-bloom-7b1.json b/evals/mmlu/mmlu_de-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..40c8412a571fbf0d4f63f6290e66bfbbab5fa943
--- /dev/null
+++ b/evals/mmlu/mmlu_de-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_de": {
+      "acc": 0.2670085985819882,
+      "acc_stderr": 0.0038422837632401587,
+      "acc_norm": 0.2812641424045859,
+      "acc_norm_stderr": 0.003904983582450586
+    }
+  },
+  "versions": {
+    "mmlu_de": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_de-llama-7B.json b/evals/mmlu/mmlu_de-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..48403f057f5a6bffdb9e4cb2644c286f80b5ccf0
--- /dev/null
+++ b/evals/mmlu/mmlu_de-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_de": {
+      "acc": 0.3045708251621662,
+      "acc_stderr": 0.003997127255569371,
+      "acc_norm": 0.2988384371700106,
+      "acc_norm_stderr": 0.003975618018830569
+    }
+  },
+  "versions": {
+    "mmlu_de": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_es-bloom-7b1.json b/evals/mmlu/mmlu_es-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ca552b581fe950c76b7e801b8922438a03f50b6
--- /dev/null
+++ b/evals/mmlu/mmlu_es-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_es": {
+      "acc": 0.2846857657117144,
+      "acc_stderr": 0.00390811532232558,
+      "acc_norm": 0.28926053697315135,
+      "acc_norm_stderr": 0.003926773662056655
+    }
+  },
+  "versions": {
+    "mmlu_es": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_es-llama-7B.json b/evals/mmlu/mmlu_es-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..6c5c8136a88729662690739c773310e7e60685c7
--- /dev/null
+++ b/evals/mmlu/mmlu_es-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_es": {
+      "acc": 0.30808459577021147,
+      "acc_stderr": 0.00399850416060033,
+      "acc_norm": 0.30268486575671216,
+      "acc_norm_stderr": 0.0039787436578546075
+    }
+  },
+  "versions": {
+    "mmlu_es": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_eu-bloom-7b1.json b/evals/mmlu/mmlu_eu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd26e106ebaee3484061fd6d78bd4e9d52579fcd
--- /dev/null
+++ b/evals/mmlu/mmlu_eu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_eu": {
+      "acc": 0.2576611914684972,
+      "acc_stderr": 0.003953719493412054,
+      "acc_norm": 0.2735147503473073,
+      "acc_norm_stderr": 0.0040298051028790725
+    }
+  },
+  "versions": {
+    "mmlu_eu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_fr-bloom-7b1.json b/evals/mmlu/mmlu_fr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..518cf70d5d420bdf6c38c7dc1d83ad8289360cb0
--- /dev/null
+++ b/evals/mmlu/mmlu_fr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_fr": {
+      "acc": 0.2887479948055916,
+      "acc_stderr": 0.0039609687595635185,
+      "acc_norm": 0.29860209304102053,
+      "acc_norm_stderr": 0.003999989334139082
+    }
+  },
+  "versions": {
+    "mmlu_fr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_gu-bloom-7b1.json b/evals/mmlu/mmlu_gu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..08db474bfffcd53c11f37cca5a5523de19ab27b2
--- /dev/null
+++ b/evals/mmlu/mmlu_gu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_gu": {
+      "acc": 0.24933390631714655,
+      "acc_stderr": 0.004010971174274014,
+      "acc_norm": 0.26566394499355395,
+      "acc_norm_stderr": 0.004094955673385403
+    }
+  },
+  "versions": {
+    "mmlu_gu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hi-bloom-7b1.json b/evals/mmlu/mmlu_hi-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..8402e114c7f1914a4c05f4a1f91ecb4aad9df2d8
--- /dev/null
+++ b/evals/mmlu/mmlu_hi-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_hi": {
+      "acc": 0.2666237838707084,
+      "acc_stderr": 0.00396526756671177,
+      "acc_norm": 0.2751467395674198,
+      "acc_norm_stderr": 0.004004671316183439
+    }
+  },
+  "versions": {
+    "mmlu_hi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hr-bloom-7b1.json b/evals/mmlu/mmlu_hr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..11c2e3822a0ada199f63dd7adb04e6c604d3151e
--- /dev/null
+++ b/evals/mmlu/mmlu_hr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_hr": {
+      "acc": 0.25448737450562825,
+      "acc_stderr": 0.0037988075329188904,
+      "acc_norm": 0.26954669911773654,
+      "acc_norm_stderr": 0.0038699014491549413
+    }
+  },
+  "versions": {
+    "mmlu_hr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hu-bloom-7b1.json b/evals/mmlu/mmlu_hu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b5cd6a42f13e7a2790a24766a0455177825ac001
--- /dev/null
+++ b/evals/mmlu/mmlu_hu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_hu": {
+      "acc": 0.25,
+      "acc_stderr": 0.0037944175097970817,
+      "acc_norm": 0.269041769041769,
+      "acc_norm_stderr": 0.0038859804834747223
+    }
+  },
+  "versions": {
+    "mmlu_hu": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_hy-bloom-7b1.json b/evals/mmlu/mmlu_hy-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..5b33b978463855a30343b21fc48c4d5eeefe9ed4
--- /dev/null
+++ b/evals/mmlu/mmlu_hy-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_hy": {
+      "acc": 0.24754384354053807,
+      "acc_stderr": 0.004135735206626923,
+      "acc_norm": 0.2570930125791938,
+      "acc_norm_stderr": 0.004187920399106458
+    }
+  },
+  "versions": {
+    "mmlu_hy": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_id-bloom-7b1.json b/evals/mmlu/mmlu_id-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..eab2b6f207224be214da56e0b7642b6e08ab6522
--- /dev/null
+++ b/evals/mmlu/mmlu_id-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_id": {
+      "acc": 0.26631554843141747,
+      "acc_stderr": 0.0038620444798720234,
+      "acc_norm": 0.28058926799480954,
+      "acc_norm_stderr": 0.003925439934317792
+    }
+  },
+  "versions": {
+    "mmlu_id": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_it-bloom-7b1.json b/evals/mmlu/mmlu_it-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..f1fd4d72695bef88e7d84fea1cef3fe7a204b1d4
--- /dev/null
+++ b/evals/mmlu/mmlu_it-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_it": {
+      "acc": 0.26161516960036263,
+      "acc_stderr": 0.0038202735800333108,
+      "acc_norm": 0.2760444209413009,
+      "acc_norm_stderr": 0.0038856803174993136
+    }
+  },
+  "versions": {
+    "mmlu_it": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_it-llama-7B.json b/evals/mmlu/mmlu_it-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..4911cc10b24667a5ceebaa64adfc01511364c093
--- /dev/null
+++ b/evals/mmlu/mmlu_it-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_it": {
+      "acc": 0.29848152904736724,
+      "acc_stderr": 0.003977405833855968,
+      "acc_norm": 0.29901034977713986,
+      "acc_norm_stderr": 0.003979426926074157
+    }
+  },
+  "versions": {
+    "mmlu_it": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_kn-bloom-7b1.json b/evals/mmlu/mmlu_kn-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..cdc6e7a6340ce902630293fdf1c6020b92559efd
--- /dev/null
+++ b/evals/mmlu/mmlu_kn-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_kn": {
+      "acc": 0.24622316459051152,
+      "acc_stderr": 0.0040494962676919264,
+      "acc_norm": 0.26716141001855287,
+      "acc_norm_stderr": 0.004159165326445932
+    }
+  },
+  "versions": {
+    "mmlu_kn": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ml-bloom-7b1.json b/evals/mmlu/mmlu_ml-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dfd9c349dd00e3ccd1fece3fcf4c414525835bb
--- /dev/null
+++ b/evals/mmlu/mmlu_ml-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ml": {
+      "acc": 0.24646354733405876,
+      "acc_stderr": 0.0041039285720239,
+      "acc_norm": 0.26414581066376497,
+      "acc_norm_stderr": 0.0041984507173371734
+    }
+  },
+  "versions": {
+    "mmlu_ml": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_mr-bloom-7b1.json b/evals/mmlu/mmlu_mr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..de6dc10fd113d66213dca64afc3849f020f6285e
--- /dev/null
+++ b/evals/mmlu/mmlu_mr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_mr": {
+      "acc": 0.2495736213757817,
+      "acc_stderr": 0.003900219801135433,
+      "acc_norm": 0.26289287744660117,
+      "acc_norm_stderr": 0.003967257688070526
+    }
+  },
+  "versions": {
+    "mmlu_mr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_mr-llama-7B.json b/evals/mmlu/mmlu_mr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..a68274469ffcdac51ed2534e328a082e752259d5
--- /dev/null
+++ b/evals/mmlu/mmlu_mr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_mr": {
+      "acc": 0.24941119142369853,
+      "acc_stderr": 0.0038993723464080766,
+      "acc_norm": 0.2784861528465849,
+      "acc_norm_stderr": 0.004039799718714403
+    }
+  },
+  "versions": {
+    "mmlu_mr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ne-bloom-7b1.json b/evals/mmlu/mmlu_ne-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..63db04e7a0d9e7387ac032f7c649cd67f1996ea4
--- /dev/null
+++ b/evals/mmlu/mmlu_ne-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ne": {
+      "acc": 0.2568858909499719,
+      "acc_stderr": 0.003915419717331052,
+      "acc_norm": 0.2658797077009556,
+      "acc_norm_stderr": 0.0039591928340292366
+    }
+  },
+  "versions": {
+    "mmlu_ne": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ne-llama-7B.json b/evals/mmlu/mmlu_ne-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f6048f4b5b7f57e7bc90c0226fb4fb987b1f1b5
--- /dev/null
+++ b/evals/mmlu/mmlu_ne-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ne": {
+      "acc": 0.245483016140689,
+      "acc_stderr": 0.0038567872193795804,
+      "acc_norm": 0.2774431863807918,
+      "acc_norm_stderr": 0.004012393111736023
+    }
+  },
+  "versions": {
+    "mmlu_ne": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_nl-bloom-7b1.json b/evals/mmlu/mmlu_nl-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..11f031c8a30795485e92c546f4b04d94df9c7e32
--- /dev/null
+++ b/evals/mmlu/mmlu_nl-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_nl": {
+      "acc": 0.25931547393185095,
+      "acc_stderr": 0.0038180275621108187,
+      "acc_norm": 0.2749487743796008,
+      "acc_norm_stderr": 0.003889720954246996
+    }
+  },
+  "versions": {
+    "mmlu_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_nl-llama-7B.json b/evals/mmlu/mmlu_nl-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..db5993204885ee62dd922204656d17bcc53a0869
--- /dev/null
+++ b/evals/mmlu/mmlu_nl-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_nl": {
+      "acc": 0.3053046975791151,
+      "acc_stderr": 0.004012103530956046,
+      "acc_norm": 0.2983987250512256,
+      "acc_norm_stderr": 0.003986133809323066
+    }
+  },
+  "versions": {
+    "mmlu_nl": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_pt-bloom-7b1.json b/evals/mmlu/mmlu_pt-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3887b3366a9810116b594c74c02905628ee78fcf
--- /dev/null
+++ b/evals/mmlu/mmlu_pt-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_pt": {
+      "acc": 0.2809216451516061,
+      "acc_stderr": 0.0038938542873620118,
+      "acc_norm": 0.287676373461423,
+      "acc_norm_stderr": 0.0039218389764563225
+    }
+  },
+  "versions": {
+    "mmlu_pt": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_pt-llama-7B.json b/evals/mmlu/mmlu_pt-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..d5ff15ab450754ca303e55e1503611a1b7fd3d44
--- /dev/null
+++ b/evals/mmlu/mmlu_pt-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_pt": {
+      "acc": 0.3016361453017112,
+      "acc_stderr": 0.003976322071656026,
+      "acc_norm": 0.3007355148604023,
+      "acc_norm_stderr": 0.003972940683152965
+    }
+  },
+  "versions": {
+    "mmlu_pt": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ro-bloom-7b1.json b/evals/mmlu/mmlu_ro-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b9ced8c74d8ae4d628e7fe9168ff402ce98cd279
--- /dev/null
+++ b/evals/mmlu/mmlu_ro-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ro": {
+      "acc": 0.2555891238670695,
+      "acc_stderr": 0.003790966515146354,
+      "acc_norm": 0.2737160120845921,
+      "acc_norm_stderr": 0.0038750360364507622
+    }
+  },
+  "versions": {
+    "mmlu_ro": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ro-llama-7B.json b/evals/mmlu/mmlu_ro-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..7474e610db1236709be35a3a648960d8b40a838e
--- /dev/null
+++ b/evals/mmlu/mmlu_ro-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ro": {
+      "acc": 0.29342900302114805,
+      "acc_stderr": 0.003957326026204448,
+      "acc_norm": 0.2965256797583082,
+      "acc_norm_stderr": 0.003969425800928827
+    }
+  },
+  "versions": {
+    "mmlu_ro": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ru-bloom-7b1.json b/evals/mmlu/mmlu_ru-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..597b21a215ebd9c9d442c41b7c7577008553e896
--- /dev/null
+++ b/evals/mmlu/mmlu_ru-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ru": {
+      "acc": 0.2525563158299377,
+      "acc_stderr": 0.0038097500220131194,
+      "acc_norm": 0.2695471669101253,
+      "acc_norm_stderr": 0.0038908241231695112
+    }
+  },
+  "versions": {
+    "mmlu_ru": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ru-llama-7B.json b/evals/mmlu/mmlu_ru-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cc8eed486b867ef15f762b1387fd29a6cf4416b
--- /dev/null
+++ b/evals/mmlu/mmlu_ru-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ru": {
+      "acc": 0.29445683093718766,
+      "acc_stderr": 0.0039966925205054795,
+      "acc_norm": 0.3016068270931037,
+      "acc_norm_stderr": 0.004024377402999243
+    }
+  },
+  "versions": {
+    "mmlu_ru": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sk-bloom-7b1.json b/evals/mmlu/mmlu_sk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..c5c41d03419b8a4038c58ab0e4166ce0e96c28d9
--- /dev/null
+++ b/evals/mmlu/mmlu_sk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sk": {
+      "acc": 0.24927269943347113,
+      "acc_stderr": 0.003785212350164864,
+      "acc_norm": 0.26672791303016385,
+      "acc_norm_stderr": 0.003869711564658995
+    }
+  },
+  "versions": {
+    "mmlu_sk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sk-llama-7B.json b/evals/mmlu/mmlu_sk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..309a344b59b192e0dbc8e50b499a16b67538c1ef
--- /dev/null
+++ b/evals/mmlu/mmlu_sk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sk": {
+      "acc": 0.28127392436074106,
+      "acc_stderr": 0.003934216199449274,
+      "acc_norm": 0.2944418925126321,
+      "acc_norm_stderr": 0.003988209639409228
+    }
+  },
+  "versions": {
+    "mmlu_sk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sr-bloom-7b1.json b/evals/mmlu/mmlu_sr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..88c6699b6f71aadafabd08193c19c50d25887e85
--- /dev/null
+++ b/evals/mmlu/mmlu_sr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sr": {
+      "acc": 0.25650952706293173,
+      "acc_stderr": 0.0038050782551146203,
+      "acc_norm": 0.27245122599256055,
+      "acc_norm_stderr": 0.003879266167871199
+    }
+  },
+  "versions": {
+    "mmlu_sr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sr-llama-7B.json b/evals/mmlu/mmlu_sr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..fbe389b6b884d3a9692413dc84031d5ea2363b31
--- /dev/null
+++ b/evals/mmlu/mmlu_sr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sr": {
+      "acc": 0.2902907462233356,
+      "acc_stderr": 0.003954858675409034,
+      "acc_norm": 0.2920367418203902,
+      "acc_norm_stderr": 0.003961851981605455
+    }
+  },
+  "versions": {
+    "mmlu_sr": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sv-bloom-7b1.json b/evals/mmlu/mmlu_sv-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..90ee3cd4e9733639263cdcf04b82e171f8485253
--- /dev/null
+++ b/evals/mmlu/mmlu_sv-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sv": {
+      "acc": 0.26122788446998335,
+      "acc_stderr": 0.003820033520031446,
+      "acc_norm": 0.27491305005292604,
+      "acc_norm_stderr": 0.0038823517609477554
+    }
+  },
+  "versions": {
+    "mmlu_sv": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_sv-llama-7B.json b/evals/mmlu/mmlu_sv-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..d962d7acbb38d8ae28b5d3c396c6389a2ae6bf49
--- /dev/null
+++ b/evals/mmlu/mmlu_sv-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_sv": {
+      "acc": 0.30024194767881446,
+      "acc_stderr": 0.003985765983480769,
+      "acc_norm": 0.29321034326326934,
+      "acc_norm_stderr": 0.003958556933478504
+    }
+  },
+  "versions": {
+    "mmlu_sv": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ta-bloom-7b1.json b/evals/mmlu/mmlu_ta-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..227c87597c1eb663c59c29f3eb1d52a08a3d189d
--- /dev/null
+++ b/evals/mmlu/mmlu_ta-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ta": {
+      "acc": 0.2531252694197776,
+      "acc_stderr": 0.00403738422854994,
+      "acc_norm": 0.2664884903871023,
+      "acc_norm_stderr": 0.004105359016847502
+    }
+  },
+  "versions": {
+    "mmlu_ta": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_ta-llama-7B.json b/evals/mmlu/mmlu_ta-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..c47ddc1d3941b02c8ef307b03e1af7c3f33d41f8
--- /dev/null
+++ b/evals/mmlu/mmlu_ta-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_ta": {
+      "acc": 0.24743512371756185,
+      "acc_stderr": 0.004006923901271705,
+      "acc_norm": 0.27752392447624796,
+      "acc_norm_stderr": 0.004157865121797154
+    }
+  },
+  "versions": {
+    "mmlu_ta": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_te-bloom-7b1.json b/evals/mmlu/mmlu_te-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..6dda2185b223b03895db5556e33db9db1733d107
--- /dev/null
+++ b/evals/mmlu/mmlu_te-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_te": {
+      "acc": 0.2502857142857143,
+      "acc_stderr": 0.004061713740284853,
+      "acc_norm": 0.2618901098901099,
+      "acc_norm_stderr": 0.00412252643604891
+    }
+  },
+  "versions": {
+    "mmlu_te": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_te-llama-7B.json b/evals/mmlu/mmlu_te-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..d495ac0b0d562ef0467a6d5a79b03bb80ccfc6a4
--- /dev/null
+++ b/evals/mmlu/mmlu_te-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_te": {
+      "acc": 0.24562637362637363,
+      "acc_stderr": 0.00403621353648515,
+      "acc_norm": 0.26874725274725275,
+      "acc_norm_stderr": 0.004156704581054155
+    }
+  },
+  "versions": {
+    "mmlu_te": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_uk-bloom-7b1.json b/evals/mmlu/mmlu_uk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..7ad6aa7c934875a8ffa40228178610c089842e74
--- /dev/null
+++ b/evals/mmlu/mmlu_uk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_uk": {
+      "acc": 0.24719188163296923,
+      "acc_stderr": 0.0037969053429642604,
+      "acc_norm": 0.2663258191959098,
+      "acc_norm_stderr": 0.003890709230487387
+    }
+  },
+  "versions": {
+    "mmlu_uk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_uk-llama-7B.json b/evals/mmlu/mmlu_uk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ac08620ea865817dc03d2021d1c2a89e95bd091
--- /dev/null
+++ b/evals/mmlu/mmlu_uk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_uk": {
+      "acc": 0.2894104888062592,
+      "acc_stderr": 0.003991508434906801,
+      "acc_norm": 0.2939809435277713,
+      "acc_norm_stderr": 0.004009944142684111
+    }
+  },
+  "versions": {
+    "mmlu_uk": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_vi-bloom-7b1.json b/evals/mmlu/mmlu_vi-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b29824403bc095477d8a6a0acdb87f1e76c4dfb
--- /dev/null
+++ b/evals/mmlu/mmlu_vi-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_vi": {
+      "acc": 0.26726381871076405,
+      "acc_stderr": 0.003872181345366132,
+      "acc_norm": 0.281427040269484,
+      "acc_norm_stderr": 0.003934867675165376
+    }
+  },
+  "versions": {
+    "mmlu_vi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_vi-llama-7B.json b/evals/mmlu/mmlu_vi-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..194b2dd47470bee66f0c97bb28f1a825707dccea
--- /dev/null
+++ b/evals/mmlu/mmlu_vi-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_vi": {
+      "acc": 0.26052671872607563,
+      "acc_stderr": 0.0038406007591986315,
+      "acc_norm": 0.28579084366865715,
+      "acc_norm_stderr": 0.003953198731610307
+    }
+  },
+  "versions": {
+    "mmlu_vi": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_zh-bloom-7b1.json b/evals/mmlu/mmlu_zh-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e98a766b006fc2ceed3e7d766f77be6fdaf5abe6
--- /dev/null
+++ b/evals/mmlu/mmlu_zh-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_zh": {
+      "acc": 0.27884542347132546,
+      "acc_stderr": 0.003908427008060506,
+      "acc_norm": 0.29137865552601594,
+      "acc_norm_stderr": 0.003960427300065885
+    }
+  },
+  "versions": {
+    "mmlu_zh": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/mmlu/mmlu_zh-llama-7B.json b/evals/mmlu/mmlu_zh-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..963997e00a6c8204be6df0d19adfe241fd53d094
--- /dev/null
+++ b/evals/mmlu/mmlu_zh-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "mmlu_zh": {
+      "acc": 0.2769464489175845,
+      "acc_stderr": 0.003900220811105949,
+      "acc_norm": 0.2883402962400304,
+      "acc_norm_stderr": 0.003948161607934338
+    }
+  },
+  "versions": {
+    "mmlu_zh": 0
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json
deleted file mode 100644
index 9370c174001acd0fca0cddf24e9076e303b9a18d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_bn_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_bn_mc": {
-      "mc1": 0.26548672566371684,
-      "mc1_stderr": 0.015711139487640472,
-      "mc2": 0.4852587344144857,
-      "mc2_stderr": 0.01612406516233488
-    }
-  },
-  "versions": {
-    "truthfulqa_bn_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json
deleted file mode 100644
index 16e9590be5e353f400674681f4f4e162bad08d5f..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_bn_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_bn_mc": {
-      "mc1": 0.27939317319848295,
-      "mc1_stderr": 0.015964066769100945,
-      "mc2": 0.513392699496713,
-      "mc2_stderr": 0.016700880970144227
-    }
-  },
-  "versions": {
-    "truthfulqa_bn_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json
deleted file mode 100644
index 11285119043f95ac0d376ad5c3e9afaeb0e2d7e9..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ca_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ca_mc": {
-      "mc1": 0.24261874197689345,
-      "mc1_stderr": 0.01536843525152329,
-      "mc2": 0.39989771937446994,
-      "mc2_stderr": 0.015246797370718152
-    }
-  },
-  "versions": {
-    "truthfulqa_ca_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json
deleted file mode 100644
index c983b9fd981831059a19411e2f854761bb466743..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_es_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_es_mc": {
-      "mc1": 0.2468354430379747,
-      "mc1_stderr": 0.01535006418032032,
-      "mc2": 0.40446379335454147,
-      "mc2_stderr": 0.01462209461275691
-    }
-  },
-  "versions": {
-    "truthfulqa_es_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json
deleted file mode 100644
index ded6c86f6861c4d0dc091db262fe1d2a25208804..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_es_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_es_mc": {
-      "mc1": 0.22658227848101264,
-      "mc1_stderr": 0.014903268563982738,
-      "mc2": 0.37120532090630015,
-      "mc2_stderr": 0.014441690126415349
-    }
-  },
-  "versions": {
-    "truthfulqa_es_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json
deleted file mode 100644
index 52f4939ac5fa964406f4eecce983e80178660657..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_eu_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_eu_mc": {
-      "mc1": 0.26214833759590794,
-      "mc1_stderr": 0.015737384911607682,
-      "mc2": 0.4464332201206485,
-      "mc2_stderr": 0.01621754992783137
-    }
-  },
-  "versions": {
-    "truthfulqa_eu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json
deleted file mode 100644
index 2591b2575e316599868892fc6541e53cca27f1eb..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_eu_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_eu_mc": {
-      "mc1": 0.22762148337595908,
-      "mc1_stderr": 0.01500362498587022,
-      "mc2": 0.4077400427662786,
-      "mc2_stderr": 0.01655029094183041
-    }
-  },
-  "versions": {
-    "truthfulqa_eu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json
deleted file mode 100644
index 74d3041ce242f33429dfa1dec98c70a446ad3459..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_fr_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_fr_mc": {
-      "mc1": 0.2598225602027883,
-      "mc1_stderr": 0.015622237721822354,
-      "mc2": 0.40857191925599595,
-      "mc2_stderr": 0.01474266494761903
-    }
-  },
-  "versions": {
-    "truthfulqa_fr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json
deleted file mode 100644
index 800ad2a78b80c2eb4974ba18bc90689969705247..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_fr_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_fr_mc": {
-      "mc1": 0.23827629911280102,
-      "mc1_stderr": 0.015176654543722067,
-      "mc2": 0.39924075017495203,
-      "mc2_stderr": 0.014258162205908845
-    }
-  },
-  "versions": {
-    "truthfulqa_fr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json
deleted file mode 100644
index 64f963ad419e8b93cc4134accc25685a3b6c7973..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_gu_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_gu_mc": {
-      "mc1": 0.2572944297082228,
-      "mc1_stderr": 0.015930376662111265,
-      "mc2": 0.4550226506739247,
-      "mc2_stderr": 0.016990336661822224
-    }
-  },
-  "versions": {
-    "truthfulqa_gu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json
deleted file mode 100644
index c069c02eb514218d456bb1424dd8cfe77f48a1ab..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_gu_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_gu_mc": {
-      "mc1": 0.2572944297082228,
-      "mc1_stderr": 0.015930376662111265,
-      "mc2": 0.42704504017782213,
-      "mc2_stderr": 0.017012444121235887
-    }
-  },
-  "versions": {
-    "truthfulqa_gu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json
deleted file mode 100644
index 8962a71a352d9b104821eb68a25a8785186a6f80..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hi_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hi_mc": {
-      "mc1": 0.26153846153846155,
-      "mc1_stderr": 0.0157457370262172,
-      "mc2": 0.4459427734456273,
-      "mc2_stderr": 0.015816895972907637
-    }
-  },
-  "versions": {
-    "truthfulqa_hi_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json
deleted file mode 100644
index 2f7c57699fb99f36e65e991419808a451e65b58d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hi_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hi_mc": {
-      "mc1": 0.28076923076923077,
-      "mc1_stderr": 0.016100529409585174,
-      "mc2": 0.47439648196687334,
-      "mc2_stderr": 0.016645149126511907
-    }
-  },
-  "versions": {
-    "truthfulqa_hi_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json
deleted file mode 100644
index 314546568b9f50af4248c3961474c8f4e4d3b021..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hr_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hr_mc": {
-      "mc1": 0.2805194805194805,
-      "mc1_stderr": 0.01620047927370478,
-      "mc2": 0.4799867976765054,
-      "mc2_stderr": 0.016630823388575047
-    }
-  },
-  "versions": {
-    "truthfulqa_hr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json
deleted file mode 100644
index a89b4ca336f2e469df36faf9e3b8bae78e238226..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hr_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hr_mc": {
-      "mc1": 0.24285714285714285,
-      "mc1_stderr": 0.015463264535393416,
-      "mc2": 0.4178069276061212,
-      "mc2_stderr": 0.015457117904740929
-    }
-  },
-  "versions": {
-    "truthfulqa_hr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json
deleted file mode 100644
index f0063c59598d9ace87e37889c678b775e7685f4e..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hu_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hu_mc": {
-      "mc1": 0.2664941785252264,
-      "mc1_stderr": 0.01591244793052595,
-      "mc2": 0.5012245769743321,
-      "mc2_stderr": 0.017012659134722635
-    }
-  },
-  "versions": {
-    "truthfulqa_hu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json
deleted file mode 100644
index 8186b5b669612791a673c1748562011f0fa91aec..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hu_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hu_mc": {
-      "mc1": 0.24579560155239327,
-      "mc1_stderr": 0.01549611867708382,
-      "mc2": 0.432092949382587,
-      "mc2_stderr": 0.015533288486024798
-    }
-  },
-  "versions": {
-    "truthfulqa_hu_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json
deleted file mode 100644
index ddde03654d791b6a3794476cfb89b83c5ef45e53..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hy_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hy_mc": {
-      "mc1": 0.2629032258064516,
-      "mc1_stderr": 0.017693546356249937,
-      "mc2": 0.4681902443615651,
-      "mc2_stderr": 0.019292338415181538
-    }
-  },
-  "versions": {
-    "truthfulqa_hy_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json
deleted file mode 100644
index f5ca203decb570b2e7314edadae5c00a4adfc62c..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_hy_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_hy_mc": {
-      "mc1": 0.2564516129032258,
-      "mc1_stderr": 0.017551409976203195,
-      "mc2": 0.46436602760838236,
-      "mc2_stderr": 0.018999233967880117
-    }
-  },
-  "versions": {
-    "truthfulqa_hy_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json
deleted file mode 100644
index e5c70280232e984fefca1f1a8cfe4a29409de1c8..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_id_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_id_mc": {
-      "mc1": 0.25288831835686776,
-      "mc1_stderr": 0.015583584105316878,
-      "mc2": 0.4035395580966099,
-      "mc2_stderr": 0.015018121460072335
-    }
-  },
-  "versions": {
-    "truthfulqa_id_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json
deleted file mode 100644
index 6dfd743ea67805acd941cfb18f2d6362f9880f82..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_id_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_id_mc": {
-      "mc1": 0.25673940949935814,
-      "mc1_stderr": 0.015661271683095182,
-      "mc2": 0.39766031480749814,
-      "mc2_stderr": 0.015508891980724996
-    }
-  },
-  "versions": {
-    "truthfulqa_id_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json
deleted file mode 100644
index e83a75ef58f484e4f28d9b48fd6106931bcd7a26..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_it_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_it_mc": {
-      "mc1": 0.2697201017811705,
-      "mc1_stderr": 0.015840413061442026,
-      "mc2": 0.4389841648203799,
-      "mc2_stderr": 0.015926853851979495
-    }
-  },
-  "versions": {
-    "truthfulqa_it_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json
deleted file mode 100644
index b9f0f156188649c4a7542e6d6f2ba9b37c457655..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_it_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_it_mc": {
-      "mc1": 0.24427480916030533,
-      "mc1_stderr": 0.015335094706043257,
-      "mc2": 0.39785622787135533,
-      "mc2_stderr": 0.014810294602470058
-    }
-  },
-  "versions": {
-    "truthfulqa_it_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json
deleted file mode 100644
index 4fe9dd96dcdc93bdacfb696ab94a78b3f7f7a246..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_kn_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_kn_mc": {
-      "mc1": 0.28792134831460675,
-      "mc1_stderr": 0.0169811116006733,
-      "mc2": 0.4971377207989088,
-      "mc2_stderr": 0.0171981853340177
-    }
-  },
-  "versions": {
-    "truthfulqa_kn_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json
deleted file mode 100644
index 993e1c25914137bc34c8316cf67a25cc17ab83c4..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_kn_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_kn_mc": {
-      "mc1": 0.27808988764044945,
-      "mc1_stderr": 0.01680348492221316,
-      "mc2": 0.46974001502290064,
-      "mc2_stderr": 0.017840960060966953
-    }
-  },
-  "versions": {
-    "truthfulqa_kn_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json
deleted file mode 100644
index 24914faf5345d35faa0a1b782d6c784b3edd07d6..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ml_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ml_mc": {
-      "mc1": 0.25831202046035806,
-      "mc1_stderr": 0.01566236755478916,
-      "mc2": 0.4909574719052267,
-      "mc2_stderr": 0.016823307128975565
-    }
-  },
-  "versions": {
-    "truthfulqa_ml_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json
deleted file mode 100644
index 0a3806514d04876fc28bdf3370af03eacd615826..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ml_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ml_mc": {
-      "mc1": 0.2749360613810742,
-      "mc1_stderr": 0.015976383961112832,
-      "mc2": 0.5095091855665959,
-      "mc2_stderr": 0.016954647599861927
-    }
-  },
-  "versions": {
-    "truthfulqa_ml_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json
deleted file mode 100644
index cf87faf0ec1093a1588fff90b3e24e1f69710c9d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_mr_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_mr_mc": {
-      "mc1": 0.2753807106598985,
-      "mc1_stderr": 0.015923346195889237,
-      "mc2": 0.47635177057868366,
-      "mc2_stderr": 0.016517346765693778
-    }
-  },
-  "versions": {
-    "truthfulqa_mr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json
deleted file mode 100644
index 0ddcccf963e412f25a53de4c98f439abf7a25388..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_mr_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_mr_mc": {
-      "mc1": 0.28553299492385786,
-      "mc1_stderr": 0.01610022231189975,
-      "mc2": 0.4895379243686521,
-      "mc2_stderr": 0.016741018968357894
-    }
-  },
-  "versions": {
-    "truthfulqa_mr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json
deleted file mode 100644
index 90e378f71e4638bf2da69d765a15b05858d6e2b9..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ne_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ne_mc": {
-      "mc1": 0.2880710659898477,
-      "mc1_stderr": 0.016142870973426694,
-      "mc2": 0.467435004054711,
-      "mc2_stderr": 0.016544742019032287
-    }
-  },
-  "versions": {
-    "truthfulqa_ne_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json
deleted file mode 100644
index 3ce8ddbd63d98848d347aa0302b9cbaccb48cbd3..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_nl_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_nl_mc": {
-      "mc1": 0.25477707006369427,
-      "mc1_stderr": 0.015561993973145626,
-      "mc2": 0.4267767591847509,
-      "mc2_stderr": 0.016186878668566853
-    }
-  },
-  "versions": {
-    "truthfulqa_nl_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json
deleted file mode 100644
index b684b021a3805f5bd343cabdea341eecc0435e00..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_pt_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_pt_mc": {
-      "mc1": 0.23857868020304568,
-      "mc1_stderr": 0.015192910034567015,
-      "mc2": 0.38894722340741383,
-      "mc2_stderr": 0.014531269277587647
-    }
-  },
-  "versions": {
-    "truthfulqa_pt_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json
deleted file mode 100644
index af2110ac326a3065b94fa267cded253cc069b3e0..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ro_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ro_mc": {
-      "mc1": 0.2608695652173913,
-      "mc1_stderr": 0.015712552179082358,
-      "mc2": 0.46132785760214634,
-      "mc2_stderr": 0.016284566824666485
-    }
-  },
-  "versions": {
-    "truthfulqa_ro_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json
deleted file mode 100644
index fe7ed655b7f61f10c4accbd13f5b9fc293536300..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ro_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ro_mc": {
-      "mc1": 0.22762148337595908,
-      "mc1_stderr": 0.015003624985870205,
-      "mc2": 0.37160168017693795,
-      "mc2_stderr": 0.015014785650167688
-    }
-  },
-  "versions": {
-    "truthfulqa_ro_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json
deleted file mode 100644
index d15e5341b01a6e2876ffb863286387d4dcc69456..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ru_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ru_mc": {
-      "mc1": 0.30632911392405066,
-      "mc1_stderr": 0.016410898874958186,
-      "mc2": 0.49751656068823824,
-      "mc2_stderr": 0.016150279946055047
-    }
-  },
-  "versions": {
-    "truthfulqa_ru_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json
deleted file mode 100644
index 2036782896e35aee07acce858c408720bcb3b9b9..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ru_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ru_mc": {
-      "mc1": 0.24556962025316456,
-      "mc1_stderr": 0.015323515145952671,
-      "mc2": 0.40851860840920967,
-      "mc2_stderr": 0.015225752517489843
-    }
-  },
-  "versions": {
-    "truthfulqa_ru_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json
deleted file mode 100644
index 13785fc105b2964d3bcf70bb68daf0ddc0ccdbfd..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_sk_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sk_mc": {
-      "mc1": 0.22692307692307692,
-      "mc1_stderr": 0.01500658794494848,
-      "mc2": 0.40846796746265707,
-      "mc2_stderr": 0.015828756550364212
-    }
-  },
-  "versions": {
-    "truthfulqa_sk_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json
deleted file mode 100644
index 3a70158ad0bf874c11233369e2b8b2fbd08bb508..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_sr_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sr_mc": {
-      "mc1": 0.2684478371501272,
-      "mc1_stderr": 0.015816769133859612,
-      "mc2": 0.42343608663478216,
-      "mc2_stderr": 0.015372831241353751
-    }
-  },
-  "versions": {
-    "truthfulqa_sr_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json
deleted file mode 100644
index 1665d4f2e88a870557fd94395d1d54f58919d85c..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_sv_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_sv_mc": {
-      "mc1": 0.2596899224806202,
-      "mc1_stderr": 0.015770469834891904,
-      "mc2": 0.40528913702963154,
-      "mc2_stderr": 0.015006798915735541
-    }
-  },
-  "versions": {
-    "truthfulqa_sv_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json
deleted file mode 100644
index 4d2164cc879ad161ad6563ad86aee4884b45ea32..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ta_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ta_mc": {
-      "mc1": 0.26015228426395937,
-      "mc1_stderr": 0.015638591095633272,
-      "mc2": 0.4828328722219756,
-      "mc2_stderr": 0.01641270817636116
-    }
-  },
-  "versions": {
-    "truthfulqa_ta_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json
deleted file mode 100644
index fee0b1146f0fd8e72ac72b5e05a85a9d0c18afcb..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_ta_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_ta_mc": {
-      "mc1": 0.27411167512690354,
-      "mc1_stderr": 0.015900519226497174,
-      "mc2": 0.5027478455482438,
-      "mc2_stderr": 0.016693455124890125
-    }
-  },
-  "versions": {
-    "truthfulqa_ta_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json
deleted file mode 100644
index cb186a4cf39dc7c369f4adcb4c21742a3bb8d875..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_te_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_te_mc": {
-      "mc1": 0.2646276595744681,
-      "mc1_stderr": 0.016097235388949582,
-      "mc2": 0.4761751419934964,
-      "mc2_stderr": 0.01699481972514669
-    }
-  },
-  "versions": {
-    "truthfulqa_te_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json
deleted file mode 100644
index 6a27e1784964b5486d2d2aeb7d5418ef3fbc892d..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_te_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_te_mc": {
-      "mc1": 0.2898936170212766,
-      "mc1_stderr": 0.016556215331027437,
-      "mc2": 0.4950446673992078,
-      "mc2_stderr": 0.017314129921675917
-    }
-  },
-  "versions": {
-    "truthfulqa_te_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json
deleted file mode 100644
index 2a55f54ab6ab50194bfa1058aacbecc18b36d6e7..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_uk_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_uk_mc": {
-      "mc1": 0.3082901554404145,
-      "mc1_stderr": 0.016630856554976103,
-      "mc2": 0.5156453949784039,
-      "mc2_stderr": 0.01673540498425732
-    }
-  },
-  "versions": {
-    "truthfulqa_uk_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json
deleted file mode 100644
index 87ffa1a02265b9ec13f193b53fba9b06f985e7a2..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_uk_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_uk_mc": {
-      "mc1": 0.23575129533678757,
-      "mc1_stderr": 0.015286822062573322,
-      "mc2": 0.41551850845167937,
-      "mc2_stderr": 0.01559551532730194
-    }
-  },
-  "versions": {
-    "truthfulqa_uk_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json
deleted file mode 100644
index 641e07a270f97ae74adc933fcaaf2f17f0cc2720..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_vi_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_vi_mc": {
-      "mc1": 0.2969543147208122,
-      "mc1_stderr": 0.01628730493420265,
-      "mc2": 0.44687544361363724,
-      "mc2_stderr": 0.015032707389451902
-    }
-  },
-  "versions": {
-    "truthfulqa_vi_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json
deleted file mode 100644
index 281dd4ecf9b86e311de0e817f9acf01943305b44..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_vi_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_vi_mc": {
-      "mc1": 0.2436548223350254,
-      "mc1_stderr": 0.015302421509379252,
-      "mc2": 0.42906776165158894,
-      "mc2_stderr": 0.016213220197264143
-    }
-  },
-  "versions": {
-    "truthfulqa_vi_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json
deleted file mode 100644
index ccc762b26a77cd8c55bbb320f1c81e3b51e30910..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_zh_mc_bloom-7b1.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_zh_mc": {
-      "mc1": 0.22727272727272727,
-      "mc1_stderr": 0.014900421035751319,
-      "mc2": 0.3872774224063368,
-      "mc2_stderr": 0.01489618179042084
-    }
-  },
-  "versions": {
-    "truthfulqa_zh_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json b/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json
deleted file mode 100644
index 5e49b170e61cb016ddf2105ccf2469c2fd884a24..0000000000000000000000000000000000000000
--- a/evals/truthfulqa-mc/truthfulqa_zh_mc_llama-7B.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "truthfulqa_zh_mc": {
-      "mc1": 0.26515151515151514,
-      "mc1_stderr": 0.015694869766795665,
-      "mc2": 0.43429601246293487,
-      "mc2_stderr": 0.015796890327346987
-    }
-  },
-  "versions": {
-    "truthfulqa_zh_mc": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..4ecb61811afa7d48353c2bef8d82befffceceb07
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ar-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ar": {
+      "mc1": 0.26002587322121606,
+      "mc1_stderr": 0.015787301353849415,
+      "mc2": 0.4256353881905651,
+      "mc2_stderr": 0.015737567507798107
+    }
+  },
+  "versions": {
+    "truthfulqa_ar": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_ml_challenge_gpt2.json b/evals/truthfulqa/truthfulqa_ar-gpt2.json
similarity index 53%
rename from evals/arc-challenge/arc_ml_challenge_gpt2.json
rename to evals/truthfulqa/truthfulqa_ar-gpt2.json
index 0c8fc7d983c690076289a5040bce6204cb0b9146..f83b2bef80b7c2c4a74c05764b7e0d0996d4b489 100644
--- a/evals/arc-challenge/arc_ml_challenge_gpt2.json
+++ b/evals/truthfulqa/truthfulqa_ar-gpt2.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_ml_challenge": {
-      "acc": 0.25,
-      "acc_stderr": 0.025210974204480537,
-      "acc_norm": 0.21283783783783783,
-      "acc_norm_stderr": 0.023831178311967415
+    "truthfulqa_ar": {
+      "mc1": 0.23932729624838292,
+      "mc1_stderr": 0.015356292760819215,
+      "mc2": 0.44027391572034885,
+      "mc2_stderr": 0.01696958534622728
     }
   },
   "versions": {
-    "arc_ml_challenge": 0
+    "truthfulqa_ar": 1
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=gpt2",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_ar-llama-7B.json b/evals/truthfulqa/truthfulqa_ar-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..8eaf03b60bf7c8428a848aa8ce0dceeb1b8649da
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ar-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ar": {
+      "mc1": 0.278137128072445,
+      "mc1_stderr": 0.016126799456170973,
+      "mc2": 0.4510826498021589,
+      "mc2_stderr": 0.01621099626555797
+    }
+  },
+  "versions": {
+    "truthfulqa_ar": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json b/evals/truthfulqa/truthfulqa_bn-bloom-7b1.json
similarity index 56%
rename from evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json
rename to evals/truthfulqa/truthfulqa_bn-bloom-7b1.json
index e55ee209ca0f7da10707018a73476230d0beb314..3f0f5acb8958dae16338d6f3538d1c45fd1d5be8 100644
--- a/evals/truthfulqa-mc/truthfulqa_da_mc_bloom-7b1.json
+++ b/evals/truthfulqa/truthfulqa_bn-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_da_mc": {
+    "truthfulqa_bn": {
       "mc1": 0.26248399487836105,
       "mc1_stderr": 0.015753963575796108,
-      "mc2": 0.4375025988127948,
-      "mc2_stderr": 0.01662443223981383
+      "mc2": 0.48383834952509674,
+      "mc2_stderr": 0.01620495508989729
     }
   },
   "versions": {
-    "truthfulqa_da_mc": 1
+    "truthfulqa_bn": 1
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_bn-llama-7B.json b/evals/truthfulqa/truthfulqa_bn-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..3c9c3b9489ea6ca298a17d5e7f442b2a42217543
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_bn-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_bn": {
+      "mc1": 0.2765685019206146,
+      "mc1_stderr": 0.016015952210618845,
+      "mc2": 0.5123820777474262,
+      "mc2_stderr": 0.01680032112327857
+    }
+  },
+  "versions": {
+    "truthfulqa_bn": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef3e258e39add637921d92a92ce41f916a905cce
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ca-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ca": {
+      "mc1": 0.24324324324324326,
+      "mc1_stderr": 0.015401665455019378,
+      "mc2": 0.4007618819736215,
+      "mc2_stderr": 0.015273518926419462
+    }
+  },
+  "versions": {
+    "truthfulqa_ca": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ca-llama-7B.json b/evals/truthfulqa/truthfulqa_ca-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..279d4a6dd8300c3fdf93c1251995060f831d8f3d
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ca-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ca": {
+      "mc1": 0.23423423423423423,
+      "mc1_stderr": 0.015203455154765249,
+      "mc2": 0.3889981216363435,
+      "mc2_stderr": 0.015057090749567676
+    }
+  },
+  "versions": {
+    "truthfulqa_ca": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_da-bloom-7b1.json b/evals/truthfulqa/truthfulqa_da-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..74bcde7ba97432b4b569a73b77198ee611a380d0
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_da-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_da": {
+      "mc1": 0.26248399487836105,
+      "mc1_stderr": 0.01575396357579612,
+      "mc2": 0.4375025988127945,
+      "mc2_stderr": 0.01662443223981383
+    }
+  },
+  "versions": {
+    "truthfulqa_da": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json b/evals/truthfulqa/truthfulqa_da-llama-7B.json
similarity index 70%
rename from evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json
rename to evals/truthfulqa/truthfulqa_da-llama-7B.json
index 1b7cb2557be3886ead061adba89f89d50eefb9dd..08c1d956bd1de9206944f2438d9f56022794d2d5 100644
--- a/evals/truthfulqa-mc/truthfulqa_da_mc_llama-7B.json
+++ b/evals/truthfulqa/truthfulqa_da-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_da_mc": {
+    "truthfulqa_da": {
       "mc1": 0.2573623559539053,
       "mc1_stderr": 0.01565358047400349,
-      "mc2": 0.4161317873775416,
-      "mc2_stderr": 0.015138516880476799
+      "mc2": 0.4161317873775415,
+      "mc2_stderr": 0.015138516880476807
     }
   },
   "versions": {
-    "truthfulqa_da_mc": 1
+    "truthfulqa_da": 1
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json b/evals/truthfulqa/truthfulqa_de-bloom-7b1.json
similarity index 56%
rename from evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json
rename to evals/truthfulqa/truthfulqa_de-bloom-7b1.json
index f9009861966dc1cff1e1868b91e2bb41bfccd0f4..068e8c49c1d499f40d02aeb1b4037569845e3f39 100644
--- a/evals/truthfulqa-mc/truthfulqa_de_mc_bloom-7b1.json
+++ b/evals/truthfulqa/truthfulqa_de-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_de_mc": {
+    "truthfulqa_de": {
       "mc1": 0.24746192893401014,
-      "mc1_stderr": 0.015382646812261827,
-      "mc2": 0.4351673407370902,
+      "mc1_stderr": 0.015382646812261825,
+      "mc2": 0.43516734073709074,
       "mc2_stderr": 0.015914493454090475
     }
   },
   "versions": {
-    "truthfulqa_de_mc": 1
+    "truthfulqa_de": 1
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json b/evals/truthfulqa/truthfulqa_de-llama-7B.json
similarity index 62%
rename from evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json
rename to evals/truthfulqa/truthfulqa_de-llama-7B.json
index 37147ee36d47e8dd84509b2c477c0c4563f0a7c9..870d9cc5a8bc73c2ca376de43d027b704b474970 100644
--- a/evals/truthfulqa-mc/truthfulqa_de_mc_llama-7B.json
+++ b/evals/truthfulqa/truthfulqa_de-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_de_mc": {
+    "truthfulqa_de": {
       "mc1": 0.233502538071066,
-      "mc1_stderr": 0.015080432502225447,
-      "mc2": 0.383224305558326,
-      "mc2_stderr": 0.014662714095686993
+      "mc1_stderr": 0.015080432502225448,
+      "mc2": 0.38322430555832593,
+      "mc2_stderr": 0.014662714095687
     }
   },
   "versions": {
-    "truthfulqa_de_mc": 1
+    "truthfulqa_de": 1
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_es-bloom-7b1.json b/evals/truthfulqa/truthfulqa_es-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..ff2caf3355fd7554ac124714fa094f7631c4b942
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_es-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_es": {
+      "mc1": 0.24714828897338403,
+      "mc1_stderr": 0.015366339219335662,
+      "mc2": 0.4037104105160595,
+      "mc2_stderr": 0.014621192787404666
+    }
+  },
+  "versions": {
+    "truthfulqa_es": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_es-llama-7B.json b/evals/truthfulqa/truthfulqa_es-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..57d59d5a6d7fcd5e98b4558ed333d506ab551069
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_es-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_es": {
+      "mc1": 0.22686945500633712,
+      "mc1_stderr": 0.014919398735157142,
+      "mc2": 0.3704736235055417,
+      "mc2_stderr": 0.014441434139778718
+    }
+  },
+  "versions": {
+    "truthfulqa_es": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..0af0c1ab614e35a49f6251d7b28e594279fd4640
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_eu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_eu": {
+      "mc1": 0.26098191214470284,
+      "mc1_stderr": 0.015795849655411115,
+      "mc2": 0.4458532690626118,
+      "mc2_stderr": 0.016282676760451684
+    }
+  },
+  "versions": {
+    "truthfulqa_eu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_eu-llama-7B.json b/evals/truthfulqa/truthfulqa_eu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..173bbf1cdee4e48adcce1026ba92eea153711152
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_eu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_eu": {
+      "mc1": 0.22739018087855298,
+      "mc1_stderr": 0.015075655972442521,
+      "mc2": 0.4067861653338961,
+      "mc2_stderr": 0.016617765169363637
+    }
+  },
+  "versions": {
+    "truthfulqa_eu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..59d411be1a435aa79d393d5234b98b20153fa489
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_fr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_fr": {
+      "mc1": 0.2604828462515883,
+      "mc1_stderr": 0.015654976408037494,
+      "mc2": 0.40875422704780084,
+      "mc2_stderr": 0.014771598297171899
+    }
+  },
+  "versions": {
+    "truthfulqa_fr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_fr-llama-7B.json b/evals/truthfulqa/truthfulqa_fr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2cf1301239dab8cdd09c7e41a803f442a37aaff
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_fr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_fr": {
+      "mc1": 0.2388818297331639,
+      "mc1_stderr": 0.015209198584184304,
+      "mc2": 0.3992160965584639,
+      "mc2_stderr": 0.014275541507345014
+    }
+  },
+  "versions": {
+    "truthfulqa_fr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..2e428d6ce6e3db9502a089fe9c54da6bd4d4e2fa
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_gu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_gu": {
+      "mc1": 0.2585499316005472,
+      "mc1_stderr": 0.016205100857272815,
+      "mc2": 0.4553767987804663,
+      "mc2_stderr": 0.01727282663518889
+    }
+  },
+  "versions": {
+    "truthfulqa_gu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_gu-llama-7B.json b/evals/truthfulqa/truthfulqa_gu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..a439f0578967f86f0d5cd4f63d5c8655fa596680
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_gu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_gu": {
+      "mc1": 0.2612859097127223,
+      "mc1_stderr": 0.016260532228493024,
+      "mc2": 0.42794967344995166,
+      "mc2_stderr": 0.017270715140237876
+    }
+  },
+  "versions": {
+    "truthfulqa_gu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..8576765f053944525c9eb8954a99cd9ce76a4d1c
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hi-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hi": {
+      "mc1": 0.2613195342820181,
+      "mc1_stderr": 0.01581268409688839,
+      "mc2": 0.44399239540333224,
+      "mc2_stderr": 0.015881067623592954
+    }
+  },
+  "versions": {
+    "truthfulqa_hi": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hi-llama-7B.json b/evals/truthfulqa/truthfulqa_hi-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..e21366d36ceaf8601da21d648ee943852d911560
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hi-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hi": {
+      "mc1": 0.2794307891332471,
+      "mc1_stderr": 0.016149769533382482,
+      "mc2": 0.47236250377441935,
+      "mc2_stderr": 0.016709755014514986
+    }
+  },
+  "versions": {
+    "truthfulqa_hi": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..672cbb9e39a1a7e019ee45709b90eec7588d5235
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hr": {
+      "mc1": 0.2808842652795839,
+      "mc1_stderr": 0.016217447153754203,
+      "mc2": 0.4793142433106635,
+      "mc2_stderr": 0.01663884163172186
+    }
+  },
+  "versions": {
+    "truthfulqa_hr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hr-llama-7B.json b/evals/truthfulqa/truthfulqa_hr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..3d1d11b77357870c8e0a53dcbafb4e8980c01f9f
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hr": {
+      "mc1": 0.24187256176853056,
+      "mc1_stderr": 0.015451967985505181,
+      "mc2": 0.41709863857620866,
+      "mc2_stderr": 0.01546097371205123
+    }
+  },
+  "versions": {
+    "truthfulqa_hr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..54432301293d130afd643eb21b0db15d9f209b67
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hu-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hu": {
+      "mc1": 0.26718547341115434,
+      "mc1_stderr": 0.015946232556288537,
+      "mc2": 0.49994152241197887,
+      "mc2_stderr": 0.01703257765685213
+    }
+  },
+  "versions": {
+    "truthfulqa_hu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hu-llama-7B.json b/evals/truthfulqa/truthfulqa_hu-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..ccaefb69215b32c9208f055af2f3a1cf9c8760bc
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hu-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hu": {
+      "mc1": 0.24643320363164722,
+      "mc1_stderr": 0.015529773657188122,
+      "mc2": 0.4311628343540659,
+      "mc2_stderr": 0.01555491548978951
+    }
+  },
+  "versions": {
+    "truthfulqa_hu": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json b/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..debcc1a8876d402702e3c9c496eb89bc3ad0f709
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hy-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hy": {
+      "mc1": 0.2585895117540687,
+      "mc1_stderr": 0.018636539619637415,
+      "mc2": 0.44943643103428205,
+      "mc2_stderr": 0.02033094239607556
+    }
+  },
+  "versions": {
+    "truthfulqa_hy": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_hy-llama-7B.json b/evals/truthfulqa/truthfulqa_hy-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..433e953ddf49c551d21da840cc57c95f665a192a
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_hy-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_hy": {
+      "mc1": 0.2585895117540687,
+      "mc1_stderr": 0.018636539619637415,
+      "mc2": 0.4550713950263578,
+      "mc2_stderr": 0.020036965332656535
+    }
+  },
+  "versions": {
+    "truthfulqa_hy": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_id-bloom-7b1.json b/evals/truthfulqa/truthfulqa_id-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d6ab9911631d5cf4f7387d705739f249f1da7de2
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_id-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_id": {
+      "mc1": 0.2532133676092545,
+      "mc1_stderr": 0.01560023256901984,
+      "mc2": 0.4031249320049949,
+      "mc2_stderr": 0.015031705347347539
+    }
+  },
+  "versions": {
+    "truthfulqa_id": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_id-llama-7B.json b/evals/truthfulqa/truthfulqa_id-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..0967fc5439ed4e2c5217256c546b2f76aa443e6b
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_id-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_id": {
+      "mc1": 0.2570694087403599,
+      "mc1_stderr": 0.015677933234808462,
+      "mc2": 0.3981714076698207,
+      "mc2_stderr": 0.015520404506158571
+    }
+  },
+  "versions": {
+    "truthfulqa_id": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_it-bloom-7b1.json b/evals/truthfulqa/truthfulqa_it-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..9599a6d59070c187811a37aa2dcaec596f4e300c
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_it-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_it": {
+      "mc1": 0.2707535121328225,
+      "mc1_stderr": 0.015889888362560486,
+      "mc2": 0.4374801864181257,
+      "mc2_stderr": 0.015955762711633903
+    }
+  },
+  "versions": {
+    "truthfulqa_it": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_it-llama-7B.json b/evals/truthfulqa/truthfulqa_it-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..221af91b2b82bf70d904265c27c0279db93872af
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_it-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_it": {
+      "mc1": 0.24521072796934865,
+      "mc1_stderr": 0.015384352284543929,
+      "mc2": 0.39642666716879443,
+      "mc2_stderr": 0.01483705265700183
+    }
+  },
+  "versions": {
+    "truthfulqa_it": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json b/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..b116af421e76c9c9f0d685f0a1156de33d48fa41
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_kn-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_kn": {
+      "mc1": 0.28466076696165193,
+      "mc1_stderr": 0.017343050775840425,
+      "mc2": 0.49109028617714945,
+      "mc2_stderr": 0.017608862092749467
+    }
+  },
+  "versions": {
+    "truthfulqa_kn": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_kn-llama-7B.json b/evals/truthfulqa/truthfulqa_kn-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..f05f0339406ac5574d7a1dc62bddacb292f097eb
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_kn-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_kn": {
+      "mc1": 0.275811209439528,
+      "mc1_stderr": 0.017176612615872052,
+      "mc2": 0.4635130117214921,
+      "mc2_stderr": 0.01825683954680752
+    }
+  },
+  "versions": {
+    "truthfulqa_kn": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d2ada8ce66115bbf7e7e2ac501b996bc7b9ab3a1
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ml-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ml": {
+      "mc1": 0.260806916426513,
+      "mc1_stderr": 0.01667907195342198,
+      "mc2": 0.47996911862138697,
+      "mc2_stderr": 0.017778690252427683
+    }
+  },
+  "versions": {
+    "truthfulqa_ml": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ml-llama-7B.json b/evals/truthfulqa/truthfulqa_ml-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..4dd3caeb8a76c583e812d275589a2c18156d6935
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ml-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ml": {
+      "mc1": 0.2824207492795389,
+      "mc1_stderr": 0.01710080754090615,
+      "mc2": 0.5024391989231584,
+      "mc2_stderr": 0.017936047828800445
+    }
+  },
+  "versions": {
+    "truthfulqa_ml": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..181033bdf126dc47bfc09557ea24531f4fead727
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_mr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_mr": {
+      "mc1": 0.2761780104712042,
+      "mc1_stderr": 0.016186321628712155,
+      "mc2": 0.4765064151203332,
+      "mc2_stderr": 0.016772466571288412
+    }
+  },
+  "versions": {
+    "truthfulqa_mr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_mr-llama-7B.json b/evals/truthfulqa/truthfulqa_mr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..a1fcd59738ae0b14a296aba32a13e2bda55370e3
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_mr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_mr": {
+      "mc1": 0.2905759162303665,
+      "mc1_stderr": 0.016436922328865435,
+      "mc2": 0.49306373435254724,
+      "mc2_stderr": 0.016980148211258952
+    }
+  },
+  "versions": {
+    "truthfulqa_mr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..89defff7cdf83326b83aee4c35f6b7ab666393c0
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ne-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ne": {
+      "mc1": 0.28811369509043927,
+      "mc1_stderr": 0.0162891162717815,
+      "mc2": 0.46164155205805624,
+      "mc2_stderr": 0.016689007834004295
+    }
+  },
+  "versions": {
+    "truthfulqa_ne": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ne-llama-7B.json b/evals/truthfulqa/truthfulqa_ne-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..b18b50165478e2f5e3938b2978e51ae65ffb09b0
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ne-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ne": {
+      "mc1": 0.29198966408268734,
+      "mc1_stderr": 0.016353615824015625,
+      "mc2": 0.4636310825029969,
+      "mc2_stderr": 0.016928691048242774
+    }
+  },
+  "versions": {
+    "truthfulqa_ne": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_nl-bloom-7b1.json b/evals/truthfulqa/truthfulqa_nl-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d1cfa8f1fcea4cc13a3119d8a4cf2b83a9a5a879
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_nl-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_nl": {
+      "mc1": 0.25477707006369427,
+      "mc1_stderr": 0.01556199397314563,
+      "mc2": 0.42677675918475044,
+      "mc2_stderr": 0.016186878668566846
+    }
+  },
+  "versions": {
+    "truthfulqa_nl": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json b/evals/truthfulqa/truthfulqa_nl-llama-7B.json
similarity index 62%
rename from evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json
rename to evals/truthfulqa/truthfulqa_nl-llama-7B.json
index 22e9d2c488076c5884e9224d8636a092fac4fe96..9646b968c2b96cfe4136c6e86627780bff5218ce 100644
--- a/evals/truthfulqa-mc/truthfulqa_nl_mc_llama-7B.json
+++ b/evals/truthfulqa/truthfulqa_nl-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_nl_mc": {
+    "truthfulqa_nl": {
       "mc1": 0.24331210191082803,
-      "mc1_stderr": 0.015324355488601159,
-      "mc2": 0.40023342153314706,
-      "mc2_stderr": 0.014679036703865578
+      "mc1_stderr": 0.015324355488601135,
+      "mc2": 0.40023342153314656,
+      "mc2_stderr": 0.014679036703865582
     }
   },
   "versions": {
-    "truthfulqa_nl_mc": 1
+    "truthfulqa_nl": 1
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json b/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9c6cefe30e562acfb981870f9e593f27f720a3d
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_pt-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_pt": {
+      "mc1": 0.23857868020304568,
+      "mc1_stderr": 0.015192910034567013,
+      "mc2": 0.38894722340741417,
+      "mc2_stderr": 0.014531269277587645
+    }
+  },
+  "versions": {
+    "truthfulqa_pt": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json b/evals/truthfulqa/truthfulqa_pt-llama-7B.json
similarity index 69%
rename from evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json
rename to evals/truthfulqa/truthfulqa_pt-llama-7B.json
index 7084df35e971145794041e3080344faabab95729..1ae678becb49d878dc30174f2c390f2c1b5a1f49 100644
--- a/evals/truthfulqa-mc/truthfulqa_pt_mc_llama-7B.json
+++ b/evals/truthfulqa/truthfulqa_pt-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_pt_mc": {
+    "truthfulqa_pt": {
       "mc1": 0.22842639593908629,
-      "mc1_stderr": 0.014964922033138024,
+      "mc1_stderr": 0.014964922033138022,
       "mc2": 0.3823261607330551,
-      "mc2_stderr": 0.01463319398314419
+      "mc2_stderr": 0.014633193983144183
     }
   },
   "versions": {
-    "truthfulqa_pt_mc": 1
+    "truthfulqa_pt": 1
   },
   "config": {
     "model": "hf-auto",
     "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
-    "batch_size": "1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..e9d6490be6beab45fd85e68d9df1e301bf2dff28
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ro-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ro": {
+      "mc1": 0.26187419768934533,
+      "mc1_stderr": 0.015762378425124946,
+      "mc2": 0.4605371384706094,
+      "mc2_stderr": 0.016307442681458683
+    }
+  },
+  "versions": {
+    "truthfulqa_ro": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ro-llama-7B.json b/evals/truthfulqa/truthfulqa_ro-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..26abd62509f8f15981ca8051421f879ea16ddc2f
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ro-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ro": {
+      "mc1": 0.22849807445442877,
+      "mc1_stderr": 0.015052893222788351,
+      "mc2": 0.37047262828252514,
+      "mc2_stderr": 0.015022205435273333
+    }
+  },
+  "versions": {
+    "truthfulqa_ro": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..3347a51ef0c14c0658f692111f9112b52f876a5c
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ru-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ru": {
+      "mc1": 0.30710659898477155,
+      "mc1_stderr": 0.016443354533552747,
+      "mc2": 0.49874761323987404,
+      "mc2_stderr": 0.016167778359600482
+    }
+  },
+  "versions": {
+    "truthfulqa_ru": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ru-llama-7B.json b/evals/truthfulqa/truthfulqa_ru-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..54b06b11d61f59c9f47d987a96a9290c09921a27
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ru-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ru": {
+      "mc1": 0.24619289340101522,
+      "mc1_stderr": 0.015356084872692898,
+      "mc2": 0.40938277991151933,
+      "mc2_stderr": 0.015252017769860154
+    }
+  },
+  "versions": {
+    "truthfulqa_ru": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json b/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1132cb125d8848afa4abc9ecef17405375f5ccc0
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_sk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_sk": {
+      "mc1": 0.2390745501285347,
+      "mc1_stderr": 0.015301260856408254,
+      "mc2": 0.43782616190313467,
+      "mc2_stderr": 0.01657761354751216
+    }
+  },
+  "versions": {
+    "truthfulqa_sk": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sk-llama-7B.json b/evals/truthfulqa/truthfulqa_sk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..71e866145020816a8524a8bd50cddf94af5042ea
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_sk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_sk": {
+      "mc1": 0.2275064267352185,
+      "mc1_stderr": 0.015039512631474048,
+      "mc2": 0.40729144857566124,
+      "mc2_stderr": 0.015845697731465
+    }
+  },
+  "versions": {
+    "truthfulqa_sk": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json b/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..75efa51eca0c0d99414987b87632f9c19f581a21
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_sr-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_sr": {
+      "mc1": 0.2878980891719745,
+      "mc1_stderr": 0.016170834614246097,
+      "mc2": 0.4604993074094113,
+      "mc2_stderr": 0.01649631560714403
+    }
+  },
+  "versions": {
+    "truthfulqa_sr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_sr-llama-7B.json b/evals/truthfulqa/truthfulqa_sr-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..a65b681172e15a187d19448a27058ec125e2b1f1
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_sr-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_sr": {
+      "mc1": 0.26878980891719745,
+      "mc1_stderr": 0.01583322873155152,
+      "mc2": 0.422701657829082,
+      "mc2_stderr": 0.015374851085961157
+    }
+  },
+  "versions": {
+    "truthfulqa_sr": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json b/evals/truthfulqa/truthfulqa_sv-bloom-7b1.json
similarity index 64%
rename from evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json
rename to evals/truthfulqa/truthfulqa_sv-bloom-7b1.json
index 9885cf6375b817aa059b00ca8a5df86a2f6bbce4..85698716bf120fd641d6dfcb551bdb145d17bc87 100644
--- a/evals/truthfulqa-mc/truthfulqa_sv_mc_bloom-7b1.json
+++ b/evals/truthfulqa/truthfulqa_sv-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_sv_mc": {
+    "truthfulqa_sv": {
       "mc1": 0.2622739018087855,
       "mc1_stderr": 0.015821052272364522,
-      "mc2": 0.4457248931967088,
+      "mc2": 0.44572489319670916,
       "mc2_stderr": 0.016517364176123605
     }
   },
   "versions": {
-    "truthfulqa_sv_mc": 1
+    "truthfulqa_sv": 1
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json b/evals/truthfulqa/truthfulqa_sv-llama-7B.json
similarity index 68%
rename from evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json
rename to evals/truthfulqa/truthfulqa_sv-llama-7B.json
index d572bf57654e75d51e028e16a79aa73942dadca1..f2f88649e17469e2a7fdc44f296619fe407feac6 100644
--- a/evals/truthfulqa-mc/truthfulqa_ar_mc_bloom-7b1.json
+++ b/evals/truthfulqa/truthfulqa_sv-llama-7B.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "truthfulqa_ar_mc": {
+    "truthfulqa_sv": {
       "mc1": 0.2596899224806202,
       "mc1_stderr": 0.01577046983489191,
-      "mc2": 0.4250856388236661,
-      "mc2_stderr": 0.01572683307613003
+      "mc2": 0.4052891370296314,
+      "mc2_stderr": 0.01500679891573553
     }
   },
   "versions": {
-    "truthfulqa_ar_mc": 1
+    "truthfulqa_sv": 1
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/bloom-7b1",
-    "batch_size": "1",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json b/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..956d773e26ebf10fc23669bb18d5b9df924be462
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ta-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ta": {
+      "mc1": 0.2651413189771198,
+      "mc1_stderr": 0.016204613164182584,
+      "mc2": 0.48348066773619114,
+      "mc2_stderr": 0.016887213348384833
+    }
+  },
+  "versions": {
+    "truthfulqa_ta": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_ta-llama-7B.json b/evals/truthfulqa/truthfulqa_ta-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..3edaa546d22cbb705a02af8433a7b3ecb4f29213
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_ta-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_ta": {
+      "mc1": 0.28263795423956933,
+      "mc1_stderr": 0.016530366611189357,
+      "mc2": 0.5032626048969708,
+      "mc2_stderr": 0.01719880976895468
+    }
+  },
+  "versions": {
+    "truthfulqa_ta": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_te-bloom-7b1.json b/evals/truthfulqa/truthfulqa_te-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..d139c759617d41dd724dc54443b08c7eba5c2a83
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_te-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_te": {
+      "mc1": 0.2652482269503546,
+      "mc1_stderr": 0.016638349265004355,
+      "mc2": 0.4612285746093752,
+      "mc2_stderr": 0.017504699336599025
+    }
+  },
+  "versions": {
+    "truthfulqa_te": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_te-llama-7B.json b/evals/truthfulqa/truthfulqa_te-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7371487cfa5f3b205258d0c63aa1d722e304a75
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_te-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_te": {
+      "mc1": 0.2851063829787234,
+      "mc1_stderr": 0.01701523103469595,
+      "mc2": 0.4821795923320059,
+      "mc2_stderr": 0.01784811574301116
+    }
+  },
+  "versions": {
+    "truthfulqa_te": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json b/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..da866d1706ae757888bf53041acc427c30e98a06
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_uk-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_uk": {
+      "mc1": 0.3090909090909091,
+      "mc1_stderr": 0.01666442755255745,
+      "mc2": 0.5143873310692731,
+      "mc2_stderr": 0.016755211041268873
+    }
+  },
+  "versions": {
+    "truthfulqa_uk": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_uk-llama-7B.json b/evals/truthfulqa/truthfulqa_uk-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a420b35b0478fcc320798e8287f213c061df4fe
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_uk-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_uk": {
+      "mc1": 0.23636363636363636,
+      "mc1_stderr": 0.015320412612327241,
+      "mc2": 0.4141829984231552,
+      "mc2_stderr": 0.01560702677887637
+    }
+  },
+  "versions": {
+    "truthfulqa_uk": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/arc-challenge/arc_hy_challenge_bloom-560.json b/evals/truthfulqa/truthfulqa_vi-bloom-7b1.json
similarity index 50%
rename from evals/arc-challenge/arc_hy_challenge_bloom-560.json
rename to evals/truthfulqa/truthfulqa_vi-bloom-7b1.json
index 38b99f7004830ebf484274ad893c53cff9de33a4..f21113c3d005bd269763438b047147bb50ac5125 100644
--- a/evals/arc-challenge/arc_hy_challenge_bloom-560.json
+++ b/evals/truthfulqa/truthfulqa_vi-bloom-7b1.json
@@ -1,19 +1,19 @@
 {
   "results": {
-    "arc_hy_challenge": {
-      "acc": 0.19655172413793104,
-      "acc_stderr": 0.023375906908472157,
-      "acc_norm": 0.2482758620689655,
-      "acc_norm_stderr": 0.02541251077219611
+    "truthfulqa_vi": {
+      "mc1": 0.2968152866242038,
+      "mc1_stderr": 0.016316229722585934,
+      "mc2": 0.44721474578334436,
+      "mc2_stderr": 0.015073430494043749
     }
   },
   "versions": {
-    "arc_hy_challenge": 0
+    "truthfulqa_vi": 1
   },
   "config": {
     "model": "hf-auto",
-    "model_args": "pretrained=bigscience/bloom-560m",
-    "batch_size": "1",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
     "device": "cuda",
     "no_cache": false,
     "limit": null,
diff --git a/evals/truthfulqa/truthfulqa_vi-llama-7B.json b/evals/truthfulqa/truthfulqa_vi-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc5992da0821ee82c8ce26e99fb73e6e2f872651
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_vi-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_vi": {
+      "mc1": 0.2445859872611465,
+      "mc1_stderr": 0.015351480770855935,
+      "mc2": 0.42975481561967727,
+      "mc2_stderr": 0.01625176801732652
+    }
+  },
+  "versions": {
+    "truthfulqa_vi": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json b/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json
new file mode 100644
index 0000000000000000000000000000000000000000..7496dee8d8893c925eac3f5a5de1723f69d1ad77
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_zh-bloom-7b1.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_zh": {
+      "mc1": 0.22842639593908629,
+      "mc1_stderr": 0.014964922033138017,
+      "mc2": 0.38822244050439564,
+      "mc2_stderr": 0.014953544130092178
+    }
+  },
+  "versions": {
+    "truthfulqa_zh": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=bigscience/bloom-7b1",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file
diff --git a/evals/truthfulqa/truthfulqa_zh-llama-7B.json b/evals/truthfulqa/truthfulqa_zh-llama-7B.json
new file mode 100644
index 0000000000000000000000000000000000000000..eeab4eff270462460733b050ac068062679cc507
--- /dev/null
+++ b/evals/truthfulqa/truthfulqa_zh-llama-7B.json
@@ -0,0 +1,23 @@
+{
+  "results": {
+    "truthfulqa_zh": {
+      "mc1": 0.26649746192893403,
+      "mc1_stderr": 0.015760136800242356,
+      "mc2": 0.43598966702035913,
+      "mc2_stderr": 0.015850355717645676
+    }
+  },
+  "versions": {
+    "truthfulqa_zh": 1
+  },
+  "config": {
+    "model": "hf-auto",
+    "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
+    "batch_size": 1,
+    "device": "cuda",
+    "no_cache": false,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
\ No newline at end of file