diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
deleted file mode 100644
index ee687fff025db235d1a0c5e3e7e4d5ea37b70fa5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.6896095301125083
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
deleted file mode 100644
index 09913ae524e676cae982ea0bf63fafafde804167..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.8378557246856386
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
deleted file mode 100644
index 357752fcef715dd0e9bb3e53f19541a6e4677762..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.5956320317670417
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
deleted file mode 100644
index 8efe6a29767e7402a35636542d83d0ca0dcc99aa..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.8213103904698875
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
deleted file mode 100644
index ab27c96b2527d678ce13c15cfdbde2a31adebef5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8219722038385175
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
deleted file mode 100644
index e752bca41c96f3e30dfe70484266525ef56f78d6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.7683653209794837
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
deleted file mode 100644
index fca6014811f0daaa82752ddaaa993561ce108c44..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.886168100595632
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
deleted file mode 100644
index f3777d75e7650424308f106fd6d3a5df7f787d90..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.6724023825281271
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
deleted file mode 100644
index 53b09d0e87a347427a568a9a833c4fd900355f1b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.8676373262739907
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
deleted file mode 100644
index f9518581ecda9f5cf8b5b17defb0740e70dd9640..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8769027134348114
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
deleted file mode 100644
index a6bf025a2d3ea4baeb360ae84d4b8b69e15a024c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.6082064857710126
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
deleted file mode 100644
index 0632573e4226ca0469eb8baa954d9a633b95c867..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.7266710787557908
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
deleted file mode 100644
index 2c7e2be9cbc4fc0796d38b369755eb1270b944d8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.5552614162806089
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
deleted file mode 100644
index 386f9bca23da50f8812d5a65c99a454e3bfd3a84..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.700198544010589
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
deleted file mode 100644
index a706ebc93e3866716af502ede850131876542e30..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.7107875579086698
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
deleted file mode 100644
index 36c5bb8484b6d505c4afb870897acae7619e6958..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.6366644606221046
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
deleted file mode 100644
index 5ed2515ae17faae85ed0bc6a96c0d985ed4bddb5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.7882197220383852
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
deleted file mode 100644
index d807e21b29de04446f4cfe611ab4a9f1c1aac6f3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.5982792852415619
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
deleted file mode 100644
index b43a6d1cd13493fd76b1f429cf6b268d8fb3eae9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.7485109199205824
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 5d3b8d42e9a6c3e33a1933b8157ddcb0e1e3d5b3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.7683653209794837
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
deleted file mode 100644
index 42973d245d09708e0cb10381312f81c1a009fd37..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.7385837193911317
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
deleted file mode 100644
index 2083ffe7418714a31fc494282fb2a5d5d913dd16..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.8332230311052283
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
deleted file mode 100644
index 0553cf8d1be7424ae76ed745bacef1655f0af56c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.6293845135671741
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
deleted file mode 100644
index 6884f7ddd6987951ac927728b6032991cbdd40d1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.7816015883520847
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 623362eb25af762a34a6adced1ec42cb97ec4b41..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8226340172071476
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
deleted file mode 100644
index 9197b8cb23453249367d3b46eff4c0c6ec7f5291..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.7498345466578424
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
deleted file mode 100644
index 672e6984b4f37403144ee2bfabd0f09d2c190db3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.8583719391131701
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
deleted file mode 100644
index 21438162a277b6d07ae6068a23aca83bc6388c96..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.6227663798808736
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
deleted file mode 100644
index f7d3e0fa656be6fcab419b168f75281f7934d963..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.8405029781601588
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 8e57f15ca55b9773a5f03692f5e8a03d17676d04..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8385175380542687
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
deleted file mode 100644
index 7cca016bfa305e04743256d35dde6e1dda3b1b4f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.6576344086021505
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
deleted file mode 100644
index c438a5bc83423e5519bd90ba7ecff0dc88d0cc5e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5187096774193548
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
deleted file mode 100644
index 0406ac2b9a66283e0e273eb8136dfd7014e870df..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5931182795698925
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
deleted file mode 100644
index 6d55f5f2091fe97a7f784f08252944c43ec04ec5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5070967741935484
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
deleted file mode 100644
index cbbd7c31c32708d98dbbeb064a418844ed5a841f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/en/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "en",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.6210752688172043
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
deleted file mode 100644
index 8a5b58afaeaf436677903e5cebd1e0d95398c2e1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.5180722891566265
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
deleted file mode 100644
index 5fcca5729178fa86a2db2771c52b208d5300ce29..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5301204819277109
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
deleted file mode 100644
index e633ec7e691facaa5d58953c2c1dda2ff8d7244d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5542168674698795
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
deleted file mode 100644
index 84c78ab68f45d54a9bc57fa5bd2db69a8a3eee1c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5180722891566265
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
deleted file mode 100644
index 1c8671c7e91fcfa90f5c712c4cd45bd21bce75b6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/fr/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5421686746987951
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
deleted file mode 100644
index 40eb613c0db3e719a7c14f420c93feaca8db71b9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.5741444866920152
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
deleted file mode 100644
index 56359460ad660a81be9fb14f1b24a16bc423c553..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.4790874524714829
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
deleted file mode 100644
index 72c0b538422b56ee2f446ae55334d7f16aa1d5a3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.55893536121673
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
deleted file mode 100644
index 3480298da420f74bcc47be59736b849b7b1d16fd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5209125475285171
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
deleted file mode 100644
index e80e9293322a6d429f21f5f14f68c9303eafe211..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/pt/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5437262357414449
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
deleted file mode 100644
index 3ca8099b5daa44ef38338971f64730faf00e56bf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.626984126984127
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
deleted file mode 100644
index 4d7ecc65170e29e452aed31e5668815572903efd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.503968253968254
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
deleted file mode 100644
index fbdd04009638a4026303cb221180e18c7179dc8b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5436507936507936
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
deleted file mode 100644
index 6a6da375fe704c9bb9eeac0c8304879e1fc26865..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.49007936507936506
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
deleted file mode 100644
index 3e946b644795bdac06366e70df599e5c70ef095d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/Muennighoff_xwinograd/zh/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5535714285714286
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
deleted file mode 100644
index 7d797684c48813fb93ee7fcee75012e9220216bd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.426
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
deleted file mode 100644
index 20374d1d4070ba04732b4c4875762233e3b51afa..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.402
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/can_we_infer/results.json
deleted file mode 100644
index 6a28787134fdaee4350e53df323e24eb4336c101..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.401
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
deleted file mode 100644
index b12be3ab4bc1fe7034c070a3cd0a856abcdaff89..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.314
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
deleted file mode 100644
index 6893460fc25c185b0f863c0771d50a331a924574..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r1/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r1",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.387
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
deleted file mode 100644
index 222b602340107db392e64549acd17451eef3b73a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.383
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
deleted file mode 100644
index 5c7dd6e91ff9d27c32c5412ac78786cf7d2d7c02..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.374
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/can_we_infer/results.json
deleted file mode 100644
index af61a800ad5c00e1df7087f25d7189bcd54ed72e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.394
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 05a872ed6fea16bda1aa0c8cdb6ab02e80e036ca..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.302
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
deleted file mode 100644
index 2672da6c326b4a26b10b029420fc95e9947f5608..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r2/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r2",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.376
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
deleted file mode 100644
index f151cc8d2e07b9e0b3ef1edf849dd69a8c748873..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.42
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
deleted file mode 100644
index 82148467711e65c52e039e9346eaecb5d7f3ee84..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.4116666666666667
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/can_we_infer/results.json
deleted file mode 100644
index 21fdd815d985648d60398c318c503b30dc209554..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.38916666666666666
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 8c95797516b28c82b8d3598a8a5dda398e84e5f0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.2966666666666667
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
deleted file mode 100644
index ec7cea823be814a3a1c396d7e2e6121f6e6cb1ee..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/anli/dev_r3/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "anli",
-  "dataset_config_name": "dev_r3",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.35833333333333334
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/merged.csv b/evaluation_bloommz-7b1/evaluation_l1/merged.csv
deleted file mode 100644
index 7786678ad3e7adbcb02bec034f8fa4362a1bc340..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/merged.csv
+++ /dev/null
@@ -1,194 +0,0 @@
-dataset,prompt,metric,value
-anli_dev_r1,GPT-3 style,accuracy,0.426
-anli_dev_r1,MNLI crowdsource,accuracy,0.402
-anli_dev_r1,can we infer,accuracy,0.401
-anli_dev_r1,guaranteed/possible/impossible,accuracy,0.314
-anli_dev_r1,justified in saying,accuracy,0.387
-anli_dev_r1,median,accuracy,0.401
-anli_dev_r2,GPT-3 style,accuracy,0.383
-anli_dev_r2,MNLI crowdsource,accuracy,0.374
-anli_dev_r2,can we infer,accuracy,0.394
-anli_dev_r2,guaranteed/possible/impossible,accuracy,0.302
-anli_dev_r2,justified in saying,accuracy,0.376
-anli_dev_r2,median,accuracy,0.376
-anli_dev_r3,GPT-3 style,accuracy,0.42
-anli_dev_r3,MNLI crowdsource,accuracy,0.4116666666666667
-anli_dev_r3,can we infer,accuracy,0.38916666666666666
-anli_dev_r3,guaranteed/possible/impossible,accuracy,0.2966666666666667
-anli_dev_r3,justified in saying,accuracy,0.35833333333333334
-anli_dev_r3,median,accuracy,0.38916666666666666
-story_cloze_2016,Answer Given options,accuracy,0.8524853019775521
-story_cloze_2016,Choose Story Ending,accuracy,0.8957776590058792
-story_cloze_2016,Generate Ending,accuracy,0.709246392303581
-story_cloze_2016,Novel Correct Ending,accuracy,0.8888295029396045
-story_cloze_2016,Story Continuation and Options,accuracy,0.8850881881346874
-story_cloze_2016,median,accuracy,0.8850881881346874
-super_glue_cb,GPT-3 style,accuracy,0.8392857142857143
-super_glue_cb,MNLI crowdsource,accuracy,0.35714285714285715
-super_glue_cb,can we infer,accuracy,0.7857142857142857
-super_glue_cb,guaranteed/possible/impossible,accuracy,0.5535714285714286
-super_glue_cb,justified in saying,accuracy,0.7142857142857143
-super_glue_cb,median,accuracy,0.7142857142857143
-super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.66
-super_glue_copa,best_option,accuracy,0.77
-super_glue_copa,cause_effect,accuracy,0.8
-super_glue_copa,i_am_hesitating,accuracy,0.81
-super_glue_copa,plausible_alternatives,accuracy,0.84
-super_glue_copa,median,accuracy,0.8
-super_glue_rte,GPT-3 style,accuracy,0.7906137184115524
-super_glue_rte,MNLI crowdsource,accuracy,0.8267148014440433
-super_glue_rte,does it follow that,accuracy,0.7942238267148014
-super_glue_rte,guaranteed true,accuracy,0.776173285198556
-super_glue_rte,should assume,accuracy,0.7617328519855595
-super_glue_rte,median,accuracy,0.7906137184115524
-winogrande_winogrande_xl,Replace,accuracy,0.5588003157063931
-winogrande_winogrande_xl,True or False,accuracy,0.5280189423835833
-winogrande_winogrande_xl,does underscore refer to,accuracy,0.5651144435674822
-winogrande_winogrande_xl,stand for,accuracy,0.5082872928176796
-winogrande_winogrande_xl,underscore refer to,accuracy,0.5651144435674822
-winogrande_winogrande_xl,median,accuracy,0.5588003157063931
-xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.46
-xcopa_id,best_option,accuracy,0.7
-xcopa_id,cause_effect,accuracy,0.73
-xcopa_id,i_am_hesitating,accuracy,0.72
-xcopa_id,plausible_alternatives,accuracy,0.67
-xcopa_id,median,accuracy,0.7
-xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.6
-xcopa_sw,best_option,accuracy,0.55
-xcopa_sw,cause_effect,accuracy,0.54
-xcopa_sw,i_am_hesitating,accuracy,0.51
-xcopa_sw,plausible_alternatives,accuracy,0.52
-xcopa_sw,median,accuracy,0.54
-xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.59
-xcopa_ta,best_option,accuracy,0.56
-xcopa_ta,cause_effect,accuracy,0.6
-xcopa_ta,i_am_hesitating,accuracy,0.57
-xcopa_ta,plausible_alternatives,accuracy,0.62
-xcopa_ta,median,accuracy,0.59
-xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.53
-xcopa_vi,best_option,accuracy,0.72
-xcopa_vi,cause_effect,accuracy,0.72
-xcopa_vi,i_am_hesitating,accuracy,0.7
-xcopa_vi,plausible_alternatives,accuracy,0.71
-xcopa_vi,median,accuracy,0.71
-xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.67
-xcopa_zh,best_option,accuracy,0.7
-xcopa_zh,cause_effect,accuracy,0.8
-xcopa_zh,i_am_hesitating,accuracy,0.77
-xcopa_zh,plausible_alternatives,accuracy,0.79
-xcopa_zh,median,accuracy,0.77
-xnli_ar,GPT-3 style,accuracy,0.5558232931726907
-xnli_ar,MNLI crowdsource,accuracy,0.42128514056224897
-xnli_ar,can we infer,accuracy,0.5148594377510041
-xnli_ar,guaranteed/possible/impossible,accuracy,0.40562248995983935
-xnli_ar,justified in saying,accuracy,0.4927710843373494
-xnli_ar,median,accuracy,0.4927710843373494
-xnli_en,GPT-3 style,accuracy,0.5891566265060241
-xnli_en,MNLI crowdsource,accuracy,0.42610441767068274
-xnli_en,can we infer,accuracy,0.5662650602409639
-xnli_en,guaranteed/possible/impossible,accuracy,0.4614457831325301
-xnli_en,justified in saying,accuracy,0.5437751004016064
-xnli_en,median,accuracy,0.5437751004016064
-xnli_es,GPT-3 style,accuracy,0.5734939759036145
-xnli_es,MNLI crowdsource,accuracy,0.40923694779116465
-xnli_es,can we infer,accuracy,0.5148594377510041
-xnli_es,guaranteed/possible/impossible,accuracy,0.43132530120481927
-xnli_es,justified in saying,accuracy,0.4610441767068273
-xnli_es,median,accuracy,0.4610441767068273
-xnli_fr,GPT-3 style,accuracy,0.5666666666666667
-xnli_fr,MNLI crowdsource,accuracy,0.42208835341365464
-xnli_fr,can we infer,accuracy,0.5385542168674698
-xnli_fr,guaranteed/possible/impossible,accuracy,0.39076305220883534
-xnli_fr,justified in saying,accuracy,0.5100401606425703
-xnli_fr,median,accuracy,0.5100401606425703
-xnli_hi,GPT-3 style,accuracy,0.5345381526104418
-xnli_hi,MNLI crowdsource,accuracy,0.41124497991967873
-xnli_hi,can we infer,accuracy,0.4751004016064257
-xnli_hi,guaranteed/possible/impossible,accuracy,0.40923694779116465
-xnli_hi,justified in saying,accuracy,0.4469879518072289
-xnli_hi,median,accuracy,0.4469879518072289
-xnli_sw,GPT-3 style,accuracy,0.4827309236947791
-xnli_sw,MNLI crowdsource,accuracy,0.40562248995983935
-xnli_sw,can we infer,accuracy,0.44497991967871486
-xnli_sw,guaranteed/possible/impossible,accuracy,0.42289156626506025
-xnli_sw,justified in saying,accuracy,0.41124497991967873
-xnli_sw,median,accuracy,0.42289156626506025
-xnli_ur,GPT-3 style,accuracy,0.4947791164658635
-xnli_ur,MNLI crowdsource,accuracy,0.39759036144578314
-xnli_ur,can we infer,accuracy,0.4502008032128514
-xnli_ur,guaranteed/possible/impossible,accuracy,0.39036144578313253
-xnli_ur,justified in saying,accuracy,0.40843373493975904
-xnli_ur,median,accuracy,0.40843373493975904
-xnli_vi,GPT-3 style,accuracy,0.5449799196787148
-xnli_vi,MNLI crowdsource,accuracy,0.40401606425702813
-xnli_vi,can we infer,accuracy,0.5
-xnli_vi,guaranteed/possible/impossible,accuracy,0.44779116465863456
-xnli_vi,justified in saying,accuracy,0.4650602409638554
-xnli_vi,median,accuracy,0.4650602409638554
-xnli_zh,GPT-3 style,accuracy,0.5429718875502008
-xnli_zh,MNLI crowdsource,accuracy,0.3891566265060241
-xnli_zh,can we infer,accuracy,0.5032128514056224
-xnli_zh,guaranteed/possible/impossible,accuracy,0.38072289156626504
-xnli_zh,justified in saying,accuracy,0.4706827309236948
-xnli_zh,median,accuracy,0.4706827309236948
-xstory_cloze_ar,Answer Given options,accuracy,0.6896095301125083
-xstory_cloze_ar,Choose Story Ending,accuracy,0.8378557246856386
-xstory_cloze_ar,Generate Ending,accuracy,0.5956320317670417
-xstory_cloze_ar,Novel Correct Ending,accuracy,0.8213103904698875
-xstory_cloze_ar,Story Continuation and Options,accuracy,0.8219722038385175
-xstory_cloze_ar,median,accuracy,0.8213103904698875
-xstory_cloze_es,Answer Given options,accuracy,0.7683653209794837
-xstory_cloze_es,Choose Story Ending,accuracy,0.886168100595632
-xstory_cloze_es,Generate Ending,accuracy,0.6724023825281271
-xstory_cloze_es,Novel Correct Ending,accuracy,0.8676373262739907
-xstory_cloze_es,Story Continuation and Options,accuracy,0.8769027134348114
-xstory_cloze_es,median,accuracy,0.8676373262739907
-xstory_cloze_eu,Answer Given options,accuracy,0.6082064857710126
-xstory_cloze_eu,Choose Story Ending,accuracy,0.7266710787557908
-xstory_cloze_eu,Generate Ending,accuracy,0.5552614162806089
-xstory_cloze_eu,Novel Correct Ending,accuracy,0.700198544010589
-xstory_cloze_eu,Story Continuation and Options,accuracy,0.7107875579086698
-xstory_cloze_eu,median,accuracy,0.700198544010589
-xstory_cloze_hi,Answer Given options,accuracy,0.6366644606221046
-xstory_cloze_hi,Choose Story Ending,accuracy,0.7882197220383852
-xstory_cloze_hi,Generate Ending,accuracy,0.5982792852415619
-xstory_cloze_hi,Novel Correct Ending,accuracy,0.7485109199205824
-xstory_cloze_hi,Story Continuation and Options,accuracy,0.7683653209794837
-xstory_cloze_hi,median,accuracy,0.7485109199205824
-xstory_cloze_id,Answer Given options,accuracy,0.7385837193911317
-xstory_cloze_id,Choose Story Ending,accuracy,0.8332230311052283
-xstory_cloze_id,Generate Ending,accuracy,0.6293845135671741
-xstory_cloze_id,Novel Correct Ending,accuracy,0.7816015883520847
-xstory_cloze_id,Story Continuation and Options,accuracy,0.8226340172071476
-xstory_cloze_id,median,accuracy,0.7816015883520847
-xstory_cloze_zh,Answer Given options,accuracy,0.7498345466578424
-xstory_cloze_zh,Choose Story Ending,accuracy,0.8583719391131701
-xstory_cloze_zh,Generate Ending,accuracy,0.6227663798808736
-xstory_cloze_zh,Novel Correct Ending,accuracy,0.8405029781601588
-xstory_cloze_zh,Story Continuation and Options,accuracy,0.8385175380542687
-xstory_cloze_zh,median,accuracy,0.8385175380542687
-xwinograd_en,Replace,accuracy,0.6576344086021505
-xwinograd_en,True or False,accuracy,0.5187096774193548
-xwinograd_en,does underscore refer to,accuracy,0.5931182795698925
-xwinograd_en,stand for,accuracy,0.5070967741935484
-xwinograd_en,underscore refer to,accuracy,0.6210752688172043
-xwinograd_en,median,accuracy,0.5931182795698925
-xwinograd_fr,Replace,accuracy,0.5180722891566265
-xwinograd_fr,True or False,accuracy,0.5301204819277109
-xwinograd_fr,does underscore refer to,accuracy,0.5542168674698795
-xwinograd_fr,stand for,accuracy,0.5180722891566265
-xwinograd_fr,underscore refer to,accuracy,0.5421686746987951
-xwinograd_fr,median,accuracy,0.5301204819277109
-xwinograd_pt,Replace,accuracy,0.5741444866920152
-xwinograd_pt,True or False,accuracy,0.4790874524714829
-xwinograd_pt,does underscore refer to,accuracy,0.55893536121673
-xwinograd_pt,stand for,accuracy,0.5209125475285171
-xwinograd_pt,underscore refer to,accuracy,0.5437262357414449
-xwinograd_pt,median,accuracy,0.5437262357414449
-xwinograd_zh,Replace,accuracy,0.626984126984127
-xwinograd_zh,True or False,accuracy,0.503968253968254
-xwinograd_zh,does underscore refer to,accuracy,0.5436507936507936
-xwinograd_zh,stand for,accuracy,0.49007936507936506
-xwinograd_zh,underscore refer to,accuracy,0.5535714285714286
-xwinograd_zh,median,accuracy,0.5436507936507936
-multiple,average,multiple,0.6067197952551315
diff --git a/evaluation_bloommz-7b1/evaluation_l1/merged.json b/evaluation_bloommz-7b1/evaluation_l1/merged.json
deleted file mode 100644
index c58141c131fb8e3ab2c2f7a6c9c1f0a021862123..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Muennighoff/xstory_cloze_ar": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6896095301125083}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8378557246856386}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5956320317670417}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8213103904698875}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8219722038385175}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7683653209794837}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.886168100595632}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6724023825281271}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8676373262739907}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8769027134348114}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6082064857710126}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7266710787557908}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5552614162806089}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.700198544010589}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7107875579086698}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6366644606221046}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7882197220383852}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5982792852415619}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7485109199205824}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7683653209794837}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7385837193911317}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8332230311052283}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6293845135671741}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7816015883520847}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8226340172071476}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7498345466578424}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8583719391131701}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6227663798808736}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8405029781601588}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8385175380542687}, "template_name": "Story Continuation and Options"}}, "Muennighoff/xwinograd_en": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6576344086021505}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5187096774193548}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5931182795698925}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5070967741935484}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6210752688172043}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_fr": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5180722891566265}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5301204819277109}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5542168674698795}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5180722891566265}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5421686746987951}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_pt": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5741444866920152}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4790874524714829}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.55893536121673}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5209125475285171}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5437262357414449}, "template_name": "underscore refer to"}}, "Muennighoff/xwinograd_zh": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.626984126984127}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.503968253968254}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5436507936507936}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.49007936507936506}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='en', template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5535714285714286}, "template_name": "underscore refer to"}}, "anli_dev_r1": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.426}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.402}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.401}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.314}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r1', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r1', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r1", "dataset_name": "anli", "evaluation": {"accuracy": 0.387}, "template_name": "justified in saying"}}, "anli_dev_r2": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.383}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.374}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.394}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.302}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r2', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r2', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r2", "dataset_name": "anli", "evaluation": {"accuracy": 0.376}, "template_name": "justified in saying"}}, "anli_dev_r3": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.42}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.4116666666666667}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.38916666666666666}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.2966666666666667}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='dev_r3', dataset_name='anli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='dev_r3', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "dev_r3", "dataset_name": "anli", "evaluation": {"accuracy": 0.35833333333333334}, "template_name": "justified in saying"}}, "story_cloze_2016": {"Answer Given options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8524853019775521}, "template_name": "Answer Given options"}, "Choose Story Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8957776590058792}, "template_name": "Choose Story Ending"}, "Generate Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.709246392303581}, "template_name": "Generate Ending"}, "Novel Correct Ending": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8888295029396045}, "template_name": "Novel Correct Ending"}, "Story Continuation and Options": {"arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "2016", "dataset_name": "story_cloze", "evaluation": {"accuracy": 0.8850881881346874}, "template_name": "Story Continuation and Options"}}, "super_glue_cb": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8392857142857143}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.35714285714285715}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7857142857142857}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.5535714285714286}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "cb", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7142857142857143}, "template_name": "justified in saying"}}, "super_glue_copa": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.66}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.77}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.81}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "copa", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.84}, "template_name": "plausible_alternatives"}}, "super_glue_rte": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7906137184115524}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.8267148014440433}, "template_name": "MNLI crowdsource"}, "does it follow that": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7942238267148014}, "template_name": "does it follow that"}, "guaranteed true": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.776173285198556}, "template_name": "guaranteed true"}, "should assume": {"arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "rte", "dataset_name": "super_glue", "evaluation": {"accuracy": 0.7617328519855595}, "template_name": "should assume"}}, "winogrande_winogrande_xl": {"Replace": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5588003157063931}, "template_name": "Replace"}, "True or False": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5280189423835833}, "template_name": "True or False"}, "does underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5651144435674822}, "template_name": "does underscore refer to"}, "stand for": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5082872928176796}, "template_name": "stand for"}, "underscore refer to": {"arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "winogrande_xl", "dataset_name": "winogrande", "evaluation": {"accuracy": 0.5651144435674822}, "template_name": "underscore refer to"}}, "xcopa_id": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.46}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.73}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "plausible_alternatives"}}, "xcopa_sw": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.55}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.54}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.51}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "plausible_alternatives"}}, "xcopa_ta": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.62}, "template_name": "plausible_alternatives"}}, "xcopa_vi": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.53}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.71}, "template_name": "plausible_alternatives"}}, "xcopa_zh": {"C1 or C2? premise, so/because\u2026": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.67}, "template_name": "C1 or C2? premise, so/because\u2026"}, "best_option": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.7}, "template_name": "best_option"}, "cause_effect": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.8}, "template_name": "cause_effect"}, "i_am_hesitating": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "i_am_hesitating"}, "plausible_alternatives": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.79}, "template_name": "plausible_alternatives"}}, "xnli_ar": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5558232931726907}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42128514056224897}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5148594377510041}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40562248995983935}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4927710843373494}, "template_name": "justified in saying"}}, "xnli_en": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5891566265060241}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42610441767068274}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5662650602409639}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4614457831325301}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "en", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5437751004016064}, "template_name": "justified in saying"}}, "xnli_es": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5734939759036145}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40923694779116465}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5148594377510041}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.43132530120481927}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4610441767068273}, "template_name": "justified in saying"}}, "xnli_fr": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5666666666666667}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42208835341365464}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5385542168674698}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39076305220883534}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5100401606425703}, "template_name": "justified in saying"}}, "xnli_hi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5345381526104418}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41124497991967873}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4751004016064257}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40923694779116465}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4469879518072289}, "template_name": "justified in saying"}}, "xnli_sw": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4827309236947791}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40562248995983935}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44497991967871486}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42289156626506025}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.41124497991967873}, "template_name": "justified in saying"}}, "xnli_ur": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4947791164658635}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39759036144578314}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4502008032128514}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39036144578313253}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40843373493975904}, "template_name": "justified in saying"}}, "xnli_vi": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5449799196787148}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40401606425702813}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.44779116465863456}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4650602409638554}, "template_name": "justified in saying"}}, "xnli_zh": {"GPT-3 style": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5429718875502008}, "template_name": "GPT-3 style"}, "MNLI crowdsource": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3891566265060241}, "template_name": "MNLI crowdsource"}, "can we infer": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5032128514056224}, "template_name": "can we infer"}, "guaranteed/possible/impossible": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38072289156626504}, "template_name": "guaranteed/possible/impossible"}, "justified in saying": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4706827309236948}, "template_name": "justified in saying"}}}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json b/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
deleted file mode 100644
index bc12fbe77a80a053c5eed4af6d4f94b14a58fac8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Answer_Given_options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Answer Given options",
-  "evaluation": {
-    "accuracy": 0.8524853019775521
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Answer Given options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
deleted file mode 100644
index 2a0035cde5af1d05aa6218652e04dd7af635558e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Choose_Story_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Choose Story Ending",
-  "evaluation": {
-    "accuracy": 0.8957776590058792
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Choose Story Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Generate_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
deleted file mode 100644
index e806bc7ee1a263558fc203b0ac5385ec77a68abe..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Generate_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Generate Ending",
-  "evaluation": {
-    "accuracy": 0.709246392303581
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Generate Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json b/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
deleted file mode 100644
index ad80f193b34315890ca3d493f219209e25720bf0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Novel_Correct_Ending/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Novel Correct Ending",
-  "evaluation": {
-    "accuracy": 0.8888295029396045
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Novel Correct Ending', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json b/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
deleted file mode 100644
index 6b10c247f5761114c564f5bf9b8ca6151706c5d7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/story_cloze/2016/Story_Continuation_and_Options/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "story_cloze",
-  "dataset_config_name": "2016",
-  "template_name": "Story Continuation and Options",
-  "evaluation": {
-    "accuracy": 0.8850881881346874
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='2016', dataset_name='story_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Story Continuation and Options', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/GPT-3_style/results.json
deleted file mode 100644
index e88caf3c7b226a207a9912791ce8586649b6a2e8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.8392857142857143
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
deleted file mode 100644
index 019653782e1cd06049e5b706b25af7d186617d43..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.35714285714285715
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/can_we_infer/results.json
deleted file mode 100644
index 5b4601cfe5ea3cb7ab2c7fb549deb030b057a8ea..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.7857142857142857
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 3c083f8ad9fc8581c84080c566637c1ef6a574b3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.5535714285714286
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/justified_in_saying/results.json
deleted file mode 100644
index 7260dd8971cd4a67d651fa81a27919f3f1e5fe07..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/cb/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "cb",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.7142857142857143
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='cb', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
deleted file mode 100644
index 4c3b5850c841659be2f1079fa7b5c632e04c5eaa..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.66
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name=None, template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/best_option/results.json
deleted file mode 100644
index cbba802897e2718d94eaca1ecd34843a2ddcd1d5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.77
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/cause_effect/results.json
deleted file mode 100644
index 9430d172ee9dcd88e334286ec0fc4121ec0114d9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.8
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
deleted file mode 100644
index 6f89a66124cbcc5a49ef97ffe0e73dcc262f2b2e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.81
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
deleted file mode 100644
index a5aad2abd3c3fe329d7167231d1f0d7a00bab759..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/copa/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "copa",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.84
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='copa', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/GPT-3_style/results.json
deleted file mode 100644
index e6f0f21d99a6515d49923aa44a3555649a5a1c70..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.7906137184115524
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
deleted file mode 100644
index 19415d9be79a38f6043de7d04791de5cc4bd6b1b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.8267148014440433
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/does_it_follow_that/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
deleted file mode 100644
index d372e9fb43eba25aac1c7aebf4a8174f2ec79495..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/does_it_follow_that/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "does it follow that",
-  "evaluation": {
-    "accuracy": 0.7942238267148014
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does it follow that', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/guaranteed_true/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/guaranteed_true/results.json
deleted file mode 100644
index 644170197819bd48259a1cf22f12340c27cb66e4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/guaranteed_true/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "guaranteed true",
-  "evaluation": {
-    "accuracy": 0.776173285198556
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='guaranteed true', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/should_assume/results.json b/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/should_assume/results.json
deleted file mode 100644
index af31cd812073aff28baff4f147a77e4f11c96ae6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/super_glue/rte/should_assume/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "super_glue",
-  "dataset_config_name": "rte",
-  "template_name": "should assume",
-  "evaluation": {
-    "accuracy": 0.7617328519855595
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='rte', dataset_name='super_glue', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='should assume', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/Replace/results.json b/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
deleted file mode 100644
index d49b83bf5e085e24d6dc105eaba9f0f021af4049..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/Replace/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "Replace",
-  "evaluation": {
-    "accuracy": 0.5588003157063931
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='Replace', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json b/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
deleted file mode 100644
index 64986025f0e3d5fee2f07385485137e563059134..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/True_or_False/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "True or False",
-  "evaluation": {
-    "accuracy": 0.5280189423835833
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='True or False', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
deleted file mode 100644
index df5c11afcdc5dba943fc3d5ac0d5a0eb0da5442b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/does_underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "does underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5651144435674822
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='does underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json b/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
deleted file mode 100644
index 647e2857c0218f770c23fff801cace541e8fdf83..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/stand_for/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "stand for",
-  "evaluation": {
-    "accuracy": 0.5082872928176796
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='stand for', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json b/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
deleted file mode 100644
index 826e581f8690184e6cc5502e619730ee5927a9f5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/winogrande/winogrande_xl/underscore_refer_to/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "winogrande",
-  "dataset_config_name": "winogrande_xl",
-  "template_name": "underscore refer to",
-  "evaluation": {
-    "accuracy": 0.5651144435674822
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='winogrande_xl', dataset_name='winogrande', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name=None, template_name='underscore refer to', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
deleted file mode 100644
index 293fdb26f92984e0268401b1f1a14b028be2162f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.46
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/best_option/results.json
deleted file mode 100644
index 376f92d4d78eeb841b35b92e71a0a764063c2184..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.7
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/cause_effect/results.json
deleted file mode 100644
index eb6e7d90f444d9b8c2b91f303b89e1fcd30e9022..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.73
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/i_am_hesitating/results.json
deleted file mode 100644
index 3d8d8315697a9d8b396bdcf751d5a1125676ff60..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.72
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/plausible_alternatives/results.json
deleted file mode 100644
index 32f9c69ce80a887a576c10e9ff705584cbc7c06b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/id/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.67
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
deleted file mode 100644
index 96f7c8b279f70afb01baa8eef9e50ca71882b6cc..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/best_option/results.json
deleted file mode 100644
index 5111b0ad2d757cbdce85a1168f2fbe34d3c5e38d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.55
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/cause_effect/results.json
deleted file mode 100644
index 0ad7e70a308353cf9d37764620d23de296526cc2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.54
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
deleted file mode 100644
index 3d49f6c7b070a1968668977bf88c48e43f1323f7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.51
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
deleted file mode 100644
index b536b7058230d3d08e87e8f6b91aeb84a91dbbf3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/sw/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.52
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
deleted file mode 100644
index 37b0615e662498d1b2d51ccf76f87ed84baf4d4d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.59
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/best_option/results.json
deleted file mode 100644
index ad0d3fb1e2334d8f5a58ca45fdbc52234a155c5a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.56
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/cause_effect/results.json
deleted file mode 100644
index d0767df2f080cde3ed0b36bb73af7a735d30e1e1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
deleted file mode 100644
index 11ad208825e71b0e4b8c55821fb1c06bb0d2b8f8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.57
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
deleted file mode 100644
index 650cd2908644df0288ff02176a27ba6ccdd8430d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/ta/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.62
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
deleted file mode 100644
index 4822d0163e81a6188e592916c789a7edf82fc895..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.53
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/best_option/results.json
deleted file mode 100644
index 205c5e683fbb3fc4634f9fbfd28097c65d60620b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.72
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/cause_effect/results.json
deleted file mode 100644
index b8675fb5e6b88fd2daee764a82897ff98fc80d33..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.72
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
deleted file mode 100644
index 7a1e8a30a979e9b007efbf5ea69c4308305c6979..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.7
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
deleted file mode 100644
index a754e5ef467a38023545605ed176de98e200240c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/vi/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.71
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
deleted file mode 100644
index 8785356a62493902dc90f963cf0c4e512544d145..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/C1_or_C2?_premise/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "C1 or C2? premise, so/because\u2026",
-  "evaluation": {
-    "accuracy": 0.67
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='so/because\u2026,validation', target_max_length=256, template_config_name='en', template_name='C1 or C2? premise', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/best_option/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/best_option/results.json
deleted file mode 100644
index 66cc8b6be554597540a596a1e28b9e7b0322a4f6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/best_option/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "best_option",
-  "evaluation": {
-    "accuracy": 0.7
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='best_option', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/cause_effect/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/cause_effect/results.json
deleted file mode 100644
index 6d6d9852b695c1d3c8725ca07fe44727f9072ce6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/cause_effect/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "cause_effect",
-  "evaluation": {
-    "accuracy": 0.8
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='cause_effect', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/i_am_hesitating/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
deleted file mode 100644
index fdd91c8fe51bf2cd88cecc831c3e10fb23956bb9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/i_am_hesitating/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "i_am_hesitating",
-  "evaluation": {
-    "accuracy": 0.77
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='i_am_hesitating', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/plausible_alternatives/results.json b/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
deleted file mode 100644
index 2c703eb186efceb2023af78cdd7bb346f3c129c0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xcopa/zh/plausible_alternatives/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "plausible_alternatives",
-  "evaluation": {
-    "accuracy": 0.79
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='plausible_alternatives', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/GPT-3_style/results.json
deleted file mode 100644
index 546639690a91af2f7657d5e3d11086607d70d92e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5558232931726907
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
deleted file mode 100644
index c5ca5a5de45223fdf04ffb89b42bd5210a6b05f4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.42128514056224897
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/can_we_infer/results.json
deleted file mode 100644
index b4e668e7b7ac4839f822b30191a815ac8c574928..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5148594377510041
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
deleted file mode 100644
index e912b2ece40d36999fd229f24fc8ef3b329cc1a6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.40562248995983935
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/justified_in_saying/results.json
deleted file mode 100644
index fefe3744fd1e94bc92a8643385c9ca4150c0ddf6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ar/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4927710843373494
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/en/GPT-3_style/results.json
deleted file mode 100644
index ce728dd89ed583d89ff66c427bc1e26e4c94b7c2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5891566265060241
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
deleted file mode 100644
index b501c0c19d450e9ee0010114f311cb4265b37242..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.42610441767068274
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/en/can_we_infer/results.json
deleted file mode 100644
index 606887a2517b3583f762bbe473a0b0edebabbb4a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5662650602409639
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 8bb4647db40a2b53df96ab55502c1b458a16ba63..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.4614457831325301
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/en/justified_in_saying/results.json
deleted file mode 100644
index fdf68b1b3c2db3ef8c9dd6d81fcb17a89e736e5a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/en/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "en",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.5437751004016064
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='en', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/es/GPT-3_style/results.json
deleted file mode 100644
index fbb0fbe72ce9a2366a5fbdfd9da6eb0f9d9fd040..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5734939759036145
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
deleted file mode 100644
index 9ad337da8b03428488d1855d4bb5512047f8b8b2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.40923694779116465
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/es/can_we_infer/results.json
deleted file mode 100644
index 7c5b9dc83a39b27ee74ebc89a068c2f2257f6bb7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5148594377510041
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 6ad61caa2724825a2fe37d749ad9a5a0049b367c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.43132530120481927
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/es/justified_in_saying/results.json
deleted file mode 100644
index 88a666927b00d9871cca5682f0ed58f889592173..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/es/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4610441767068273
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/GPT-3_style/results.json
deleted file mode 100644
index b7406b5bc3ed6a427267f31abae653fa9308c394..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5666666666666667
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
deleted file mode 100644
index d4d16731dcbfbb4413466d1be513e13815368d38..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.42208835341365464
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/can_we_infer/results.json
deleted file mode 100644
index 42b889b3a9d7148ac2c31a64cdc3eff07565c38a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5385542168674698
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 1fb16a2540011994a7c5dec4cf07dad0c3ff536b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.39076305220883534
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/justified_in_saying/results.json
deleted file mode 100644
index 4f42e8570f20fcf1c468a2695865355610b7e971..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/fr/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.5100401606425703
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/GPT-3_style/results.json
deleted file mode 100644
index 3ff8a878683ceacf02f36c301dd43ce7bee51134..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5345381526104418
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
deleted file mode 100644
index 46fd2c4c63efd9ffb52ca3cfcd385a92f730b432..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.41124497991967873
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/can_we_infer/results.json
deleted file mode 100644
index ef0c2d34b4062ad7db36cc80415d5e6b588e4577..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.4751004016064257
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 1637ac7ae1350312fe7e9291d2416cd3e50a9c79..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.40923694779116465
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/justified_in_saying/results.json
deleted file mode 100644
index 951e5f90e621992983980077a06098e556e294af..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/hi/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4469879518072289
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/GPT-3_style/results.json
deleted file mode 100644
index c819b498e7f52c050ecde967d689ef49c5ead831..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.4827309236947791
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
deleted file mode 100644
index 65398c49b780d2e20b7069e487311b6b981d4623..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.40562248995983935
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/can_we_infer/results.json
deleted file mode 100644
index 39921e787dd0094bbc08607bf47d2937abbfd42a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.44497991967871486
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
deleted file mode 100644
index 318bbebe5b8253865063239ce33e84307900a27c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.42289156626506025
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/justified_in_saying/results.json
deleted file mode 100644
index cfff418d842a01be4a3654a50c02f6289a1fc703..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/sw/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.41124497991967873
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/GPT-3_style/results.json
deleted file mode 100644
index 24b3b4d9b3e429b610fb0a95c7722f51f1d0550a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.4947791164658635
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
deleted file mode 100644
index 67148069721a4c14b3c11b50b2aa8a4c09fe355b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.39759036144578314
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/can_we_infer/results.json
deleted file mode 100644
index 22e46c6c09a04bf78053dd339ab28489f7704108..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.4502008032128514
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
deleted file mode 100644
index b810a40e3e235790207376b32af2a299d4f6fe24..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.39036144578313253
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/justified_in_saying/results.json
deleted file mode 100644
index 20245923546de8f7e85e8dd7e2bc1db0847212c3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/ur/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.40843373493975904
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/GPT-3_style/results.json
deleted file mode 100644
index a13ae51ee9460f9666c435cf5fa50778f44d72bb..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5449799196787148
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
deleted file mode 100644
index ccbec33b9e9b2ed4137fee347c4b91c0d2da5e34..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.40401606425702813
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/can_we_infer/results.json
deleted file mode 100644
index 92a1a38226419e2f25f841111418028bc08cc0cf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
deleted file mode 100644
index afde16840d9ce35ba02f3c2fcd7460a447b59b6a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.44779116465863456
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/justified_in_saying/results.json
deleted file mode 100644
index 1afbfdbf7c938a67dd496eb79cde783df539b46a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/vi/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4650602409638554
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/GPT-3_style/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/GPT-3_style/results.json
deleted file mode 100644
index 2bb9b56add737900ef37773f0080e24295befdd1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/GPT-3_style/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style",
-  "evaluation": {
-    "accuracy": 0.5429718875502008
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='GPT-3 style', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
deleted file mode 100644
index 7dcfbc8f34fbd0614d389653502b05cc134b2cd7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/MNLI_crowdsource/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource",
-  "evaluation": {
-    "accuracy": 0.3891566265060241
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='MNLI crowdsource', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/can_we_infer/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/can_we_infer/results.json
deleted file mode 100644
index ed0c5d7b69442e3a1dc012e67cc6dc92f926cb83..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/can_we_infer/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer",
-  "evaluation": {
-    "accuracy": 0.5032128514056224
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='can we infer', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
deleted file mode 100644
index ab2e4c3507538fdc53545cdaa4b371ce69faba4f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/guaranteed_possible_impossible/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible",
-  "evaluation": {
-    "accuracy": 0.38072289156626504
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='guaranteed/possible/impossible', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/justified_in_saying/results.json b/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/justified_in_saying/results.json
deleted file mode 100644
index 3e5f64a5ee3df4e2035281194d7f24398a2cb0fe..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_l1/xnli/zh/justified_in_saying/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying",
-  "evaluation": {
-    "accuracy": 0.4706827309236948
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='en', template_name='justified in saying', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json
deleted file mode 100644
index 89e43bafb8376a00e9550fe6e6650f8fa643b203..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/GPT-3_style_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style_arht",
-  "evaluation": {
-    "accuracy": 0.4610441767068273
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json
deleted file mode 100644
index 9f61804d9d11f714cbd249fc74308c8fbc0699c3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/MNLI_crowdsource_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource_arht",
-  "evaluation": {
-    "accuracy": 0.3899598393574297
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json
deleted file mode 100644
index 390cd9f97ace2cb2f965cf64641250954c6e21fb..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/can_we_infer_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer_arht",
-  "evaluation": {
-    "accuracy": 0.3550200803212851
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json
deleted file mode 100644
index b6e2d83aadea0aca86a997c8bf99a61c2ab7063d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/guaranteed_possible_impossible_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible_arht",
-  "evaluation": {
-    "accuracy": 0.45461847389558235
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json
deleted file mode 100644
index 82e207df2d935a573bad6b7501d6aaf42dcc1cb3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ar/justified_in_saying_arht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying_arht",
-  "evaluation": {
-    "accuracy": 0.3538152610441767
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json
deleted file mode 100644
index cffee59d56f1ebbb02db7d7721ae08ba1a4cddc8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/GPT-3_style_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style_esht",
-  "evaluation": {
-    "accuracy": 0.5313253012048192
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json
deleted file mode 100644
index 49638017ffc7ea741445cda68c9f802078b835d9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/MNLI_crowdsource_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource_esht",
-  "evaluation": {
-    "accuracy": 0.334136546184739
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json
deleted file mode 100644
index de94c4c56f39e025ea75d755f6e27e9137a4bee7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/can_we_infer_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer_esht",
-  "evaluation": {
-    "accuracy": 0.36987951807228914
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json
deleted file mode 100644
index 4ab106672cdff63d80d3a3f17628c4df8299825c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/guaranteed_possible_impossible_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible_esht",
-  "evaluation": {
-    "accuracy": 0.4686746987951807
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json
deleted file mode 100644
index 0e1a78271a48bb847416de3e695e9ce8b9b0313e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/es/justified_in_saying_esht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying_esht",
-  "evaluation": {
-    "accuracy": 0.37630522088353413
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json
deleted file mode 100644
index ee71295e8d994debecfdb9ef2012121716cfbcae..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/GPT-3_style_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style_frht",
-  "evaluation": {
-    "accuracy": 0.5345381526104418
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json
deleted file mode 100644
index 7a42eef9058f181bac68d4804d7eb2e9f70fe4d6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/MNLI_crowdsource_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource_frht",
-  "evaluation": {
-    "accuracy": 0.3357429718875502
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json
deleted file mode 100644
index 90a19abe8e6077dacc39c17c9bc153c75706f052..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/can_we_infer_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer_frht",
-  "evaluation": {
-    "accuracy": 0.5224899598393574
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json
deleted file mode 100644
index 1c3df1fa5dac930fd4024d0cf9f26589a2ec3188..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/guaranteed_possible_impossible_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible_frht",
-  "evaluation": {
-    "accuracy": 0.46586345381526106
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json
deleted file mode 100644
index 2bfb1f45fb8d49e5c2b459fdd5d3d8faefd65832..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/fr/justified_in_saying_frht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying_frht",
-  "evaluation": {
-    "accuracy": 0.4891566265060241
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json
deleted file mode 100644
index 81126b8423ccfb2a338316e6d739e62f5b2ad908..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/GPT-3_style_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style_hiht",
-  "evaluation": {
-    "accuracy": 0.3325301204819277
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json
deleted file mode 100644
index 72cc6afcf061bd27160824802dc13eea9fc91e3e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/MNLI_crowdsource_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource_hiht",
-  "evaluation": {
-    "accuracy": 0.470281124497992
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json
deleted file mode 100644
index d7bac61e418793f962192a821514feaa7e84b689..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/can_we_infer_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer_hiht",
-  "evaluation": {
-    "accuracy": 0.37309236947791163
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json
deleted file mode 100644
index edc1bf25e960593d049d046709a7306a683453d1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/guaranteed_possible_impossible_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible_hiht",
-  "evaluation": {
-    "accuracy": 0.3514056224899598
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json
deleted file mode 100644
index 8c7b4bf0fd416f731a886fbef444cfd580229a52..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/hi/justified_in_saying_hiht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying_hiht",
-  "evaluation": {
-    "accuracy": 0.3746987951807229
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.csv b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.csv
deleted file mode 100644
index 63d9419c23b657c9221fbcd5ed16fbbc8b1ca096..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.csv
+++ /dev/null
@@ -1,50 +0,0 @@
-dataset,prompt,metric,value
-xnli_ar,GPT-3 style_arht,accuracy,0.4610441767068273
-xnli_ar,MNLI crowdsource_arht,accuracy,0.3899598393574297
-xnli_ar,can we infer_arht,accuracy,0.3550200803212851
-xnli_ar,guaranteed/possible/impossible_arht,accuracy,0.45461847389558235
-xnli_ar,justified in saying_arht,accuracy,0.3538152610441767
-xnli_ar,median,accuracy,0.3899598393574297
-xnli_es,GPT-3 style_esht,accuracy,0.5313253012048192
-xnli_es,MNLI crowdsource_esht,accuracy,0.334136546184739
-xnli_es,can we infer_esht,accuracy,0.36987951807228914
-xnli_es,guaranteed/possible/impossible_esht,accuracy,0.4686746987951807
-xnli_es,justified in saying_esht,accuracy,0.37630522088353413
-xnli_es,median,accuracy,0.37630522088353413
-xnli_fr,GPT-3 style_frht,accuracy,0.5345381526104418
-xnli_fr,MNLI crowdsource_frht,accuracy,0.3357429718875502
-xnli_fr,can we infer_frht,accuracy,0.5224899598393574
-xnli_fr,guaranteed/possible/impossible_frht,accuracy,0.46586345381526106
-xnli_fr,justified in saying_frht,accuracy,0.4891566265060241
-xnli_fr,median,accuracy,0.4891566265060241
-xnli_hi,GPT-3 style_hiht,accuracy,0.3325301204819277
-xnli_hi,MNLI crowdsource_hiht,accuracy,0.470281124497992
-xnli_hi,can we infer_hiht,accuracy,0.37309236947791163
-xnli_hi,guaranteed/possible/impossible_hiht,accuracy,0.3514056224899598
-xnli_hi,justified in saying_hiht,accuracy,0.3746987951807229
-xnli_hi,median,accuracy,0.37309236947791163
-xnli_sw,GPT-3 style_swht,accuracy,0.336144578313253
-xnli_sw,MNLI crowdsource_swht,accuracy,0.3333333333333333
-xnli_sw,can we infer_swht,accuracy,0.3453815261044177
-xnli_sw,guaranteed/possible/impossible_swht,accuracy,0.35582329317269074
-xnli_sw,justified in saying_swht,accuracy,0.3269076305220884
-xnli_sw,median,accuracy,0.336144578313253
-xnli_ur,GPT-3 style_urht,accuracy,0.4
-xnli_ur,MNLI crowdsource_urht,accuracy,0.3562248995983936
-xnli_ur,can we infer_urht,accuracy,0.3349397590361446
-xnli_ur,guaranteed/possible/impossible_urht,accuracy,0.37630522088353413
-xnli_ur,justified in saying_urht,accuracy,0.3405622489959839
-xnli_ur,median,accuracy,0.3562248995983936
-xnli_vi,GPT-3 style_viht,accuracy,0.5265060240963856
-xnli_vi,MNLI crowdsource_viht,accuracy,0.37710843373493974
-xnli_vi,can we infer_viht,accuracy,0.5116465863453815
-xnli_vi,guaranteed/possible/impossible_viht,accuracy,0.3578313253012048
-xnli_vi,justified in saying_viht,accuracy,0.5028112449799197
-xnli_vi,median,accuracy,0.5028112449799197
-xnli_zh,GPT-3 style_zhht,accuracy,0.3196787148594378
-xnli_zh,MNLI crowdsource_zhht,accuracy,0.38112449799196785
-xnli_zh,can we infer_zhht,accuracy,0.40642570281124496
-xnli_zh,guaranteed/possible/impossible_zhht,accuracy,0.344578313253012
-xnli_zh,justified in saying_zhht,accuracy,0.3369477911646586
-xnli_zh,median,accuracy,0.344578313253012
-multiple,average,multiple,0.39603413654618475
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.json
deleted file mode 100644
index efc985e13b0d05e58fa83cc086ba7da085ef4275..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"xnli_ar": {"GPT-3 style_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4610441767068273}, "template_name": "GPT-3 style_arht"}, "MNLI crowdsource_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3899598393574297}, "template_name": "MNLI crowdsource_arht"}, "can we infer_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3550200803212851}, "template_name": "can we infer_arht"}, "guaranteed/possible/impossible_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.45461847389558235}, "template_name": "guaranteed/possible/impossible_arht"}, "justified in saying_arht": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_arht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3538152610441767}, "template_name": "justified in saying_arht"}}, "xnli_es": {"GPT-3 style_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5313253012048192}, "template_name": "GPT-3 style_esht"}, "MNLI crowdsource_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, "template_name": "MNLI crowdsource_esht"}, "can we infer_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.36987951807228914}, "template_name": "can we infer_esht"}, "guaranteed/possible/impossible_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4686746987951807}, "template_name": "guaranteed/possible/impossible_esht"}, "justified in saying_esht": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37630522088353413}, "template_name": "justified in saying_esht"}}, "xnli_fr": {"GPT-3 style_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5345381526104418}, "template_name": "GPT-3 style_frht"}, "MNLI crowdsource_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3357429718875502}, "template_name": "MNLI crowdsource_frht"}, "can we infer_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5224899598393574}, "template_name": "can we infer_frht"}, "guaranteed/possible/impossible_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.46586345381526106}, "template_name": "guaranteed/possible/impossible_frht"}, "justified in saying_frht": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4891566265060241}, "template_name": "justified in saying_frht"}}, "xnli_hi": {"GPT-3 style_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3325301204819277}, "template_name": "GPT-3 style_hiht"}, "MNLI crowdsource_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.470281124497992}, "template_name": "MNLI crowdsource_hiht"}, "can we infer_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37309236947791163}, "template_name": "can we infer_hiht"}, "guaranteed/possible/impossible_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3514056224899598}, "template_name": "guaranteed/possible/impossible_hiht"}, "justified in saying_hiht": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_hiht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3746987951807229}, "template_name": "justified in saying_hiht"}}, "xnli_sw": {"GPT-3 style_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.336144578313253}, "template_name": "GPT-3 style_swht"}, "MNLI crowdsource_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_swht"}, "can we infer_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3453815261044177}, "template_name": "can we infer_swht"}, "guaranteed/possible/impossible_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.35582329317269074}, "template_name": "guaranteed/possible/impossible_swht"}, "justified in saying_swht": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3269076305220884}, "template_name": "justified in saying_swht"}}, "xnli_ur": {"GPT-3 style_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4}, "template_name": "GPT-3 style_urht"}, "MNLI crowdsource_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3562248995983936}, "template_name": "MNLI crowdsource_urht"}, "can we infer_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3349397590361446}, "template_name": "can we infer_urht"}, "guaranteed/possible/impossible_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37630522088353413}, "template_name": "guaranteed/possible/impossible_urht"}, "justified in saying_urht": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3405622489959839}, "template_name": "justified in saying_urht"}}, "xnli_vi": {"GPT-3 style_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5265060240963856}, "template_name": "GPT-3 style_viht"}, "MNLI crowdsource_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.37710843373493974}, "template_name": "MNLI crowdsource_viht"}, "can we infer_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5116465863453815}, "template_name": "can we infer_viht"}, "guaranteed/possible/impossible_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3578313253012048}, "template_name": "guaranteed/possible/impossible_viht"}, "justified in saying_viht": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5028112449799197}, "template_name": "justified in saying_viht"}}, "xnli_zh": {"GPT-3 style_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3196787148594378}, "template_name": "GPT-3 style_zhht"}, "MNLI crowdsource_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.38112449799196785}, "template_name": "MNLI crowdsource_zhht"}, "can we infer_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.40642570281124496}, "template_name": "can we infer_zhht"}, "guaranteed/possible/impossible_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.344578313253012}, "template_name": "guaranteed/possible/impossible_zhht"}, "justified in saying_zhht": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3369477911646586}, "template_name": "justified in saying_zhht"}}}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json
deleted file mode 100644
index de666e20d45de89d7d2e0b87d2df2e5e636806a2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/GPT-3_style_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style_swht",
-  "evaluation": {
-    "accuracy": 0.336144578313253
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json
deleted file mode 100644
index 0b299c5ccdc242196add1443520436e5e01b82db..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/MNLI_crowdsource_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource_swht",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json
deleted file mode 100644
index b02921e07618dddbe9b0c9baa1bcf83488ffd879..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/can_we_infer_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer_swht",
-  "evaluation": {
-    "accuracy": 0.3453815261044177
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json
deleted file mode 100644
index 0d0c7de87ad44d55d779ab418e11dc04eff4163a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/guaranteed_possible_impossible_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible_swht",
-  "evaluation": {
-    "accuracy": 0.35582329317269074
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json
deleted file mode 100644
index 09f98868f8a3f64df83efbfe2d61dc4507e3622e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/sw/justified_in_saying_swht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying_swht",
-  "evaluation": {
-    "accuracy": 0.3269076305220884
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json
deleted file mode 100644
index cde9a7dc932d7fd11b347b05243ec4274928b418..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/GPT-3_style_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style_urht",
-  "evaluation": {
-    "accuracy": 0.4
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json
deleted file mode 100644
index a9c37fed7c6eaa7f42cae33a5ba7e0d82355619e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/MNLI_crowdsource_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource_urht",
-  "evaluation": {
-    "accuracy": 0.3562248995983936
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json
deleted file mode 100644
index 0275da97b071d9797bb41c979be0490b20c1a378..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/can_we_infer_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer_urht",
-  "evaluation": {
-    "accuracy": 0.3349397590361446
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json
deleted file mode 100644
index 854f22010ac359d6fb9fd0586a760bdcfd66de48..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/guaranteed_possible_impossible_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible_urht",
-  "evaluation": {
-    "accuracy": 0.37630522088353413
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json
deleted file mode 100644
index 1b761bd5ca49c65720cd1d210425f00c303cb2ad..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/ur/justified_in_saying_urht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying_urht",
-  "evaluation": {
-    "accuracy": 0.3405622489959839
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json
deleted file mode 100644
index 911fb94874ba7bd2ea8692a7c8021ac66b5153c2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/GPT-3_style_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style_viht",
-  "evaluation": {
-    "accuracy": 0.5265060240963856
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json
deleted file mode 100644
index 2a896f3f9b9c16c00619928134371c5d8a06c8cf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/MNLI_crowdsource_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource_viht",
-  "evaluation": {
-    "accuracy": 0.37710843373493974
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json
deleted file mode 100644
index 5592cf57ddff8a5624c29225b518325819692f3a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/can_we_infer_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer_viht",
-  "evaluation": {
-    "accuracy": 0.5116465863453815
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json
deleted file mode 100644
index 243d01383b0e661431abaf57e9da32a578e9728b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/guaranteed_possible_impossible_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible_viht",
-  "evaluation": {
-    "accuracy": 0.3578313253012048
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json
deleted file mode 100644
index d4eadded27b8fe4d524b10ca3bb45f007dfeca90..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/vi/justified_in_saying_viht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying_viht",
-  "evaluation": {
-    "accuracy": 0.5028112449799197
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_viht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json
deleted file mode 100644
index d4254e656fdcba212a82643bc0e1902dc64567ff..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/GPT-3_style_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style_zhht",
-  "evaluation": {
-    "accuracy": 0.3196787148594378
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json
deleted file mode 100644
index 10dbab5a4fe31be7999b7a95f3eee8e15829d6fb..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/MNLI_crowdsource_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource_zhht",
-  "evaluation": {
-    "accuracy": 0.38112449799196785
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json
deleted file mode 100644
index 620f357300bfe4c9817e823269ac9f4e1f4f44a5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/can_we_infer_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer_zhht",
-  "evaluation": {
-    "accuracy": 0.40642570281124496
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json
deleted file mode 100644
index 65f9e6a084a0d11b193fa456270487f87f7d9b80..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/guaranteed_possible_impossible_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible_zhht",
-  "evaluation": {
-    "accuracy": 0.344578313253012
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json b/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json
deleted file mode 100644
index 1e999809f3f18ddd914a16705b305ea1a99d9038..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnliht/xnli/zh/justified_in_saying_zhht/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying_zhht",
-  "evaluation": {
-    "accuracy": 0.3369477911646586
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhht', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json
deleted file mode 100644
index 77cfdf21c92824d790481216193934744f3571dd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/GPT-3_style_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "GPT-3 style_armt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json
deleted file mode 100644
index 50233f2f7b48a56410b663a1e53062b48cb34970..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/MNLI_crowdsource_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "MNLI crowdsource_armt",
-  "evaluation": {
-    "accuracy": 0.4855421686746988
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json
deleted file mode 100644
index 3c38307364767eeff17f0b7017a55ec6d59caccf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/can_we_infer_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "can we infer_armt",
-  "evaluation": {
-    "accuracy": 0.3413654618473896
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json
deleted file mode 100644
index 62e5b9267ca8b55ec7427d8d404b4dc69e57c9ff..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/guaranteed_possible_impossible_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "guaranteed/possible/impossible_armt",
-  "evaluation": {
-    "accuracy": 0.35542168674698793
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json
deleted file mode 100644
index 1f1f936f73b6370418470a32df93bc6ecc389d39..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ar/justified_in_saying_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ar",
-  "template_name": "justified in saying_armt",
-  "evaluation": {
-    "accuracy": 0.3465863453815261
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json
deleted file mode 100644
index e1647e62c5110a1aa782d3f6bd3584976d02e177..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/GPT-3_style_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "GPT-3 style_esmt",
-  "evaluation": {
-    "accuracy": 0.5385542168674698
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json
deleted file mode 100644
index 90b3f6eff9418f33f8581c2425f1323cbac2198a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/MNLI_crowdsource_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "MNLI crowdsource_esmt",
-  "evaluation": {
-    "accuracy": 0.42690763052208835
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json
deleted file mode 100644
index 0e88590482c46b2a0f9c50c71bbba04503a2c8c8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/can_we_infer_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "can we infer_esmt",
-  "evaluation": {
-    "accuracy": 0.3895582329317269
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json
deleted file mode 100644
index 7c45c42fc502a194e4734d1f527f10eff0e86e04..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/guaranteed_possible_impossible_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "guaranteed/possible/impossible_esmt",
-  "evaluation": {
-    "accuracy": 0.3477911646586345
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json
deleted file mode 100644
index 6ab59224bbf868b322f353466f443457107cc276..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/es/justified_in_saying_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "es",
-  "template_name": "justified in saying_esmt",
-  "evaluation": {
-    "accuracy": 0.39799196787148594
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json
deleted file mode 100644
index d588894667d032fd6e29b6ba7e9bcd4024fcd59c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/GPT-3_style_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "GPT-3 style_frmt",
-  "evaluation": {
-    "accuracy": 0.5220883534136547
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json
deleted file mode 100644
index dd229693dc3267fffb9ffef13310ce4b41f16259..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/MNLI_crowdsource_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "MNLI crowdsource_frmt",
-  "evaluation": {
-    "accuracy": 0.3192771084337349
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json
deleted file mode 100644
index a40ee36a56db7f5039b164cc5f93fc9bf0d4c10e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/can_we_infer_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "can we infer_frmt",
-  "evaluation": {
-    "accuracy": 0.5240963855421686
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json
deleted file mode 100644
index d3c99fff4f20ce371d4abf4534e0a3de1fbfd1f0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/guaranteed_possible_impossible_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "guaranteed/possible/impossible_frmt",
-  "evaluation": {
-    "accuracy": 0.3819277108433735
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json
deleted file mode 100644
index 3c556932c2368ab5dad3ae7ea52e812891e569b1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/fr/justified_in_saying_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "fr",
-  "template_name": "justified in saying_frmt",
-  "evaluation": {
-    "accuracy": 0.472289156626506
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json
deleted file mode 100644
index 64d2695e1222b8e2ef70a6be0f451c25b35dc558..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/GPT-3_style_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "GPT-3 style_himt",
-  "evaluation": {
-    "accuracy": 0.3317269076305221
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json
deleted file mode 100644
index 73ae164a98d33cdf0f9767e574f0b858a4f5c78b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/MNLI_crowdsource_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "MNLI crowdsource_himt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json
deleted file mode 100644
index d20367db7addbcc1a429ed512438bca992387d0d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/can_we_infer_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "can we infer_himt",
-  "evaluation": {
-    "accuracy": 0.35943775100401604
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json
deleted file mode 100644
index 34b08178a14f9f6812d8c27edcbeba951547d81e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/guaranteed_possible_impossible_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "guaranteed/possible/impossible_himt",
-  "evaluation": {
-    "accuracy": 0.3449799196787149
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json
deleted file mode 100644
index 1aaa5f37e224219682c46b7e450a46500addd5f4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/hi/justified_in_saying_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "hi",
-  "template_name": "justified in saying_himt",
-  "evaluation": {
-    "accuracy": 0.3654618473895582
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.csv b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.csv
deleted file mode 100644
index f0ea0a8121ebbea29225f2ac4c3308b146e7b28c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.csv
+++ /dev/null
@@ -1,50 +0,0 @@
-dataset,prompt,metric,value
-xnli_ar,GPT-3 style_armt,accuracy,0.3333333333333333
-xnli_ar,MNLI crowdsource_armt,accuracy,0.4855421686746988
-xnli_ar,can we infer_armt,accuracy,0.3413654618473896
-xnli_ar,guaranteed/possible/impossible_armt,accuracy,0.35542168674698793
-xnli_ar,justified in saying_armt,accuracy,0.3465863453815261
-xnli_ar,median,accuracy,0.3465863453815261
-xnli_es,GPT-3 style_esmt,accuracy,0.5385542168674698
-xnli_es,MNLI crowdsource_esmt,accuracy,0.42690763052208835
-xnli_es,can we infer_esmt,accuracy,0.3895582329317269
-xnli_es,guaranteed/possible/impossible_esmt,accuracy,0.3477911646586345
-xnli_es,justified in saying_esmt,accuracy,0.39799196787148594
-xnli_es,median,accuracy,0.39799196787148594
-xnli_fr,GPT-3 style_frmt,accuracy,0.5220883534136547
-xnli_fr,MNLI crowdsource_frmt,accuracy,0.3192771084337349
-xnli_fr,can we infer_frmt,accuracy,0.5240963855421686
-xnli_fr,guaranteed/possible/impossible_frmt,accuracy,0.3819277108433735
-xnli_fr,justified in saying_frmt,accuracy,0.472289156626506
-xnli_fr,median,accuracy,0.472289156626506
-xnli_hi,GPT-3 style_himt,accuracy,0.3317269076305221
-xnli_hi,MNLI crowdsource_himt,accuracy,0.3333333333333333
-xnli_hi,can we infer_himt,accuracy,0.35943775100401604
-xnli_hi,guaranteed/possible/impossible_himt,accuracy,0.3449799196787149
-xnli_hi,justified in saying_himt,accuracy,0.3654618473895582
-xnli_hi,median,accuracy,0.3449799196787149
-xnli_sw,GPT-3 style_swmt,accuracy,0.334136546184739
-xnli_sw,MNLI crowdsource_swmt,accuracy,0.3333333333333333
-xnli_sw,can we infer_swmt,accuracy,0.3337349397590361
-xnli_sw,guaranteed/possible/impossible_swmt,accuracy,0.3261044176706827
-xnli_sw,justified in saying_swmt,accuracy,0.334136546184739
-xnli_sw,median,accuracy,0.3337349397590361
-xnli_ur,GPT-3 style_urmt,accuracy,0.3377510040160643
-xnli_ur,MNLI crowdsource_urmt,accuracy,0.3337349397590361
-xnli_ur,can we infer_urmt,accuracy,0.3333333333333333
-xnli_ur,guaranteed/possible/impossible_urmt,accuracy,0.3333333333333333
-xnli_ur,justified in saying_urmt,accuracy,0.3337349397590361
-xnli_ur,median,accuracy,0.3337349397590361
-xnli_vi,GPT-3 style_vimt,accuracy,0.3333333333333333
-xnli_vi,MNLI crowdsource_vimt,accuracy,0.3887550200803213
-xnli_vi,can we infer_vimt,accuracy,0.3333333333333333
-xnli_vi,guaranteed/possible/impossible_vimt,accuracy,0.3321285140562249
-xnli_vi,justified in saying_vimt,accuracy,0.3333333333333333
-xnli_vi,median,accuracy,0.3333333333333333
-xnli_zh,GPT-3 style_zhmt,accuracy,0.4634538152610442
-xnli_zh,MNLI crowdsource_zhmt,accuracy,0.3345381526104418
-xnli_zh,can we infer_zhmt,accuracy,0.4891566265060241
-xnli_zh,guaranteed/possible/impossible_zhmt,accuracy,0.3393574297188755
-xnli_zh,justified in saying_zhmt,accuracy,0.48032128514056227
-xnli_zh,median,accuracy,0.4634538152610442
-multiple,average,multiple,0.3782630522088353
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.json
deleted file mode 100644
index 12399b33f08c836ec1b9d142e3e493fe304cdf20..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"xnli_ar": {"GPT-3 style_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='GPT-3 style_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_armt"}, "MNLI crowdsource_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='MNLI crowdsource_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4855421686746988}, "template_name": "MNLI crowdsource_armt"}, "can we infer_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='can we infer_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3413654618473896}, "template_name": "can we infer_armt"}, "guaranteed/possible/impossible_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='guaranteed/possible/impossible_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.35542168674698793}, "template_name": "guaranteed/possible/impossible_armt"}, "justified in saying_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='justified in saying_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3465863453815261}, "template_name": "justified in saying_armt"}}, "xnli_es": {"GPT-3 style_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='GPT-3 style_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5385542168674698}, "template_name": "GPT-3 style_esmt"}, "MNLI crowdsource_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='MNLI crowdsource_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.42690763052208835}, "template_name": "MNLI crowdsource_esmt"}, "can we infer_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='can we infer_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3895582329317269}, "template_name": "can we infer_esmt"}, "guaranteed/possible/impossible_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='guaranteed/possible/impossible_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3477911646586345}, "template_name": "guaranteed/possible/impossible_esmt"}, "justified in saying_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='justified in saying_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "xnli", "evaluation": {"accuracy": 0.39799196787148594}, "template_name": "justified in saying_esmt"}}, "xnli_fr": {"GPT-3 style_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='GPT-3 style_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5220883534136547}, "template_name": "GPT-3 style_frmt"}, "MNLI crowdsource_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='MNLI crowdsource_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3192771084337349}, "template_name": "MNLI crowdsource_frmt"}, "can we infer_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='can we infer_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.5240963855421686}, "template_name": "can we infer_frmt"}, "guaranteed/possible/impossible_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='guaranteed/possible/impossible_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3819277108433735}, "template_name": "guaranteed/possible/impossible_frmt"}, "justified in saying_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='fr', template_name='justified in saying_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "xnli", "evaluation": {"accuracy": 0.472289156626506}, "template_name": "justified in saying_frmt"}}, "xnli_hi": {"GPT-3 style_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='GPT-3 style_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3317269076305221}, "template_name": "GPT-3 style_himt"}, "MNLI crowdsource_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='MNLI crowdsource_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_himt"}, "can we infer_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='can we infer_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.35943775100401604}, "template_name": "can we infer_himt"}, "guaranteed/possible/impossible_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='guaranteed/possible/impossible_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3449799196787149}, "template_name": "guaranteed/possible/impossible_himt"}, "justified in saying_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='justified in saying_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3654618473895582}, "template_name": "justified in saying_himt"}}, "xnli_sw": {"GPT-3 style_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, "template_name": "GPT-3 style_swmt"}, "MNLI crowdsource_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "MNLI crowdsource_swmt"}, "can we infer_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "can we infer_swmt"}, "guaranteed/possible/impossible_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3261044176706827}, "template_name": "guaranteed/possible/impossible_swmt"}, "justified in saying_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xnli", "evaluation": {"accuracy": 0.334136546184739}, "template_name": "justified in saying_swmt"}}, "xnli_ur": {"GPT-3 style_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3377510040160643}, "template_name": "GPT-3 style_urmt"}, "MNLI crowdsource_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "MNLI crowdsource_urmt"}, "can we infer_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_urmt"}, "guaranteed/possible/impossible_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "guaranteed/possible/impossible_urmt"}, "justified in saying_urmt": {"arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ur", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3337349397590361}, "template_name": "justified in saying_urmt"}}, "xnli_vi": {"GPT-3 style_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "GPT-3 style_vimt"}, "MNLI crowdsource_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3887550200803213}, "template_name": "MNLI crowdsource_vimt"}, "can we infer_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "can we infer_vimt"}, "guaranteed/possible/impossible_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3321285140562249}, "template_name": "guaranteed/possible/impossible_vimt"}, "justified in saying_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3333333333333333}, "template_name": "justified in saying_vimt"}}, "xnli_zh": {"GPT-3 style_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4634538152610442}, "template_name": "GPT-3 style_zhmt"}, "MNLI crowdsource_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3345381526104418}, "template_name": "MNLI crowdsource_zhmt"}, "can we infer_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.4891566265060241}, "template_name": "can we infer_zhmt"}, "guaranteed/possible/impossible_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.3393574297188755}, "template_name": "guaranteed/possible/impossible_zhmt"}, "justified in saying_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xnli", "evaluation": {"accuracy": 0.48032128514056227}, "template_name": "justified in saying_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json
deleted file mode 100644
index 5242f09c1270f3f83829ac98159f36869eb2490d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/GPT-3_style_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "GPT-3 style_swmt",
-  "evaluation": {
-    "accuracy": 0.334136546184739
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='GPT-3 style_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json
deleted file mode 100644
index 86775e878263814a2b8403e981105a19842c035d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/MNLI_crowdsource_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "MNLI crowdsource_swmt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='MNLI crowdsource_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json
deleted file mode 100644
index 0b2fb84df09912986eea008200711b1d278d5042..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/can_we_infer_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "can we infer_swmt",
-  "evaluation": {
-    "accuracy": 0.3337349397590361
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='can we infer_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json
deleted file mode 100644
index 219047a34626d95888a5b5f9099345086a00fa27..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/guaranteed_possible_impossible_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "guaranteed/possible/impossible_swmt",
-  "evaluation": {
-    "accuracy": 0.3261044176706827
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='guaranteed/possible/impossible_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json
deleted file mode 100644
index d506bf07cfb0240b57bcac472cae7b35158da95b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/sw/justified_in_saying_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "sw",
-  "template_name": "justified in saying_swmt",
-  "evaluation": {
-    "accuracy": 0.334136546184739
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='justified in saying_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json
deleted file mode 100644
index 28cce4c610b89834c6313c33d9972021040e95a4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/GPT-3_style_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "GPT-3 style_urmt",
-  "evaluation": {
-    "accuracy": 0.3377510040160643
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='GPT-3 style_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json
deleted file mode 100644
index 8f961654aeec2e7a890b280917a887603c772bc9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/MNLI_crowdsource_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "MNLI crowdsource_urmt",
-  "evaluation": {
-    "accuracy": 0.3337349397590361
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='MNLI crowdsource_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json
deleted file mode 100644
index 99675a52d98c51024e91224b14cf355c1cae1161..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/can_we_infer_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "can we infer_urmt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='can we infer_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json
deleted file mode 100644
index 0e5a632d847e31bf188b7a6924e8385f1af5dbd0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/guaranteed_possible_impossible_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "guaranteed/possible/impossible_urmt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='guaranteed/possible/impossible_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json
deleted file mode 100644
index 12d7fd8973ceeba90f0b3973829fd6297a4bf5e4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/ur/justified_in_saying_urmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "ur",
-  "template_name": "justified in saying_urmt",
-  "evaluation": {
-    "accuracy": 0.3337349397590361
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ur', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ur', template_name='justified in saying_urmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json
deleted file mode 100644
index dfce0056ada3bdb2448acc5af08e6d58192bd49a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/GPT-3_style_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "GPT-3 style_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='GPT-3 style_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json
deleted file mode 100644
index b68db45df1b97750e4ca83fbcebe1c2ba51204b0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/MNLI_crowdsource_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "MNLI crowdsource_vimt",
-  "evaluation": {
-    "accuracy": 0.3887550200803213
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='MNLI crowdsource_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json
deleted file mode 100644
index 399f1e12e2e5e1a960f554afebda1db41b5e54cf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/can_we_infer_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "can we infer_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='can we infer_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json
deleted file mode 100644
index 8555f1f55886cabfadd5a156a2e887a5606cca06..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/guaranteed_possible_impossible_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "guaranteed/possible/impossible_vimt",
-  "evaluation": {
-    "accuracy": 0.3321285140562249
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='guaranteed/possible/impossible_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json
deleted file mode 100644
index e1d3100447b06c579e62865f80bac3260b900fb0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/vi/justified_in_saying_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "vi",
-  "template_name": "justified in saying_vimt",
-  "evaluation": {
-    "accuracy": 0.3333333333333333
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='justified in saying_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json
deleted file mode 100644
index a9e431c729c8176051311252cb6df006feb918b8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/GPT-3_style_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "GPT-3 style_zhmt",
-  "evaluation": {
-    "accuracy": 0.4634538152610442
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='GPT-3 style_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json
deleted file mode 100644
index c8548b52fac45ccbd571c2f3c5ddf1de6d342e04..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/MNLI_crowdsource_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "MNLI crowdsource_zhmt",
-  "evaluation": {
-    "accuracy": 0.3345381526104418
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='MNLI crowdsource_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json
deleted file mode 100644
index 469b0a1ae48467123059fdd73ca7a3aa38150589..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/can_we_infer_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "can we infer_zhmt",
-  "evaluation": {
-    "accuracy": 0.4891566265060241
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='can we infer_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json
deleted file mode 100644
index 85c0552653669d8b485159da28454b6b60e4cb91..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/guaranteed_possible_impossible_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "guaranteed/possible/impossible_zhmt",
-  "evaluation": {
-    "accuracy": 0.3393574297188755
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='guaranteed/possible/impossible_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json
deleted file mode 100644
index f90bef80a421b6c9c19603659c8c32d751877973..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xnlimt/xnli/zh/justified_in_saying_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xnli",
-  "dataset_config_name": "zh",
-  "template_name": "justified in saying_zhmt",
-  "evaluation": {
-    "accuracy": 0.48032128514056227
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xnli', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='justified in saying_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
deleted file mode 100644
index 00b0919de5c779c5aded5f7884023a3e2b61f7ad..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Answer_Given_options_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Answer Given options_armt",
-  "evaluation": {
-    "accuracy": 0.6664460622104567
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
deleted file mode 100644
index a18875179d57b9179bf7e77c0044a588a4da04a6..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Choose_Story_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Choose Story Ending_armt",
-  "evaluation": {
-    "accuracy": 0.8385175380542687
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
deleted file mode 100644
index 417a9cacc699edaa2406c8aed141e11bf6f02342..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Generate_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Generate Ending_armt",
-  "evaluation": {
-    "accuracy": 0.5843812045003309
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
deleted file mode 100644
index 3dddd8a167406b0f6a4f128561262f51a6da0a0d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Novel_Correct_Ending_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Novel Correct Ending_armt",
-  "evaluation": {
-    "accuracy": 0.827928524156188
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
deleted file mode 100644
index 3dc021bd784256e5b7ad89410662d4bbe23814e3..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/ar/Story_Continuation_and_Options_armt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "ar",
-  "template_name": "Story Continuation and Options_armt",
-  "evaluation": {
-    "accuracy": 0.8246194573130378
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
deleted file mode 100644
index 2f80fdef48d02a647d1a396f1a1184b9de98f54d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Answer_Given_options_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Answer Given options_esmt",
-  "evaluation": {
-    "accuracy": 0.8325612177365983
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
deleted file mode 100644
index ac33228a2c44794760c0541a9deca5bca51ddc3f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Choose_Story_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Choose Story Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.8881535407015222
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
deleted file mode 100644
index ef96b59e5aaf8917c3133bbbfdcda66635648262..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Generate_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Generate Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.6776968894771674
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
deleted file mode 100644
index 592e9750b5c5381a60ad2a9b0e637323d4c28a2d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Novel_Correct_Ending_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Novel Correct Ending_esmt",
-  "evaluation": {
-    "accuracy": 0.8656518861681006
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
deleted file mode 100644
index 472c5c2061df541500d7e2e9e6eef586123bcea7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/es/Story_Continuation_and_Options_esmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "es",
-  "template_name": "Story Continuation and Options_esmt",
-  "evaluation": {
-    "accuracy": 0.886168100595632
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
deleted file mode 100644
index c5f4b5d3ddd16a50e5746e8a401ebfef6af241af..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Answer_Given_options_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Answer Given options_eumt",
-  "evaluation": {
-    "accuracy": 0.5678358702845797
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
deleted file mode 100644
index 8da88a477ea78793b373952fb1be96f481e158fe..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Choose_Story_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Choose Story Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.7326273990734613
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
deleted file mode 100644
index be084bbe7b32af5b5443df6df39bfdb1d06222b9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Generate_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Generate Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.5095962938451357
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
deleted file mode 100644
index ffd920c5fe19f4120e2f7b8cb9b03e75b4922a30..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Novel_Correct_Ending_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Novel Correct Ending_eumt",
-  "evaluation": {
-    "accuracy": 0.6558570483123759
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
deleted file mode 100644
index 28733e61a32f89b95d611c458ff107eeaf7fa921..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/eu/Story_Continuation_and_Options_eumt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "eu",
-  "template_name": "Story Continuation and Options_eumt",
-  "evaluation": {
-    "accuracy": 0.7193911317008603
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
deleted file mode 100644
index 7c0aab623963b0e207dfa345e07433e8f4b2a462..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Answer_Given_options_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Answer Given options_himt",
-  "evaluation": {
-    "accuracy": 0.7054930509596293
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
deleted file mode 100644
index 9a75b90868fbec40451f3664a316ab82d5ba43e7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Choose_Story_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Choose Story Ending_himt",
-  "evaluation": {
-    "accuracy": 0.8041032428855063
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
deleted file mode 100644
index 911d012a6b28cd79a12173b18896de370e018fd8..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Generate_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Generate Ending_himt",
-  "evaluation": {
-    "accuracy": 0.614824619457313
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
deleted file mode 100644
index bdfdab2b619c12c446422d1160f1eaa296b71b61..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Novel_Correct_Ending_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Novel Correct Ending_himt",
-  "evaluation": {
-    "accuracy": 0.7584381204500331
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
deleted file mode 100644
index 1f4e6dd2de684bd0e4bcb456a7484fa808f7339c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/hi/Story_Continuation_and_Options_himt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "hi",
-  "template_name": "Story Continuation and Options_himt",
-  "evaluation": {
-    "accuracy": 0.7981469225678358
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
deleted file mode 100644
index 6618121e00b79e2467033caee48503ab2b22c2cf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Answer_Given_options_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Answer Given options_idmt",
-  "evaluation": {
-    "accuracy": 0.7326273990734613
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
deleted file mode 100644
index a3aa3e18e51c6b62cbb7cc2a97700041334c7e25..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Choose_Story_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Choose Story Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.8457974851091992
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
deleted file mode 100644
index c7b80d028191f714bf9413a37c734491b302092d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Generate_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Generate Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.5678358702845797
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
deleted file mode 100644
index 3ea18beca2d084a68d50e0be3f03c41be93f2262..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Novel_Correct_Ending_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Novel Correct Ending_idmt",
-  "evaluation": {
-    "accuracy": 0.8226340172071476
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
deleted file mode 100644
index e0984dea347a6d4a481eb42b9dc0c80f26693c89..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/id/Story_Continuation_and_Options_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "id",
-  "template_name": "Story Continuation and Options_idmt",
-  "evaluation": {
-    "accuracy": 0.8246194573130378
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
deleted file mode 100644
index e6ecf52678671b31c2c202352c3f9ee45d2103ff..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Answer_Given_options_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Answer Given options_zhmt",
-  "evaluation": {
-    "accuracy": 0.7935142289874255
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
deleted file mode 100644
index 8254a6ef94f7708021f814ae35ccb144ccf766b2..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Choose_Story_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Choose Story Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.8590337524818001
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
deleted file mode 100644
index 86be5b81897365d846fdbc1c0212f1ca30b86bff..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Generate_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Generate Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.6307081403044341
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
deleted file mode 100644
index a022479727e3f95ddd43e7725b85deff75157f3a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Novel_Correct_Ending_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Novel Correct Ending_zhmt",
-  "evaluation": {
-    "accuracy": 0.8590337524818001
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
deleted file mode 100644
index 8a07d07dfc971f75f05a047a59fa74a581aae831..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xstory_cloze/zh/Story_Continuation_and_Options_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xstory_cloze",
-  "dataset_config_name": "zh",
-  "template_name": "Story Continuation and Options_zhmt",
-  "evaluation": {
-    "accuracy": 0.8464592984778293
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
deleted file mode 100644
index 7c1c9dbf4c2515bb812f95d4e251685bc13e5b98..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/Replace_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "Replace_frmt",
-  "evaluation": {
-    "accuracy": 0.5542168674698795
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
deleted file mode 100644
index 67bf5b6066b7e1974c71a7431bc80f60f6a36a99..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/True_or_False_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "True or False_frmt",
-  "evaluation": {
-    "accuracy": 0.46987951807228917
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
deleted file mode 100644
index 5be30d949e856fd08b7d57ec7e75260ecce3b1d4..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/does_underscore_refer_to_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "does underscore refer to_frmt",
-  "evaluation": {
-    "accuracy": 0.5301204819277109
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
deleted file mode 100644
index aec05834b002a9c8c05edfa69aa43246be7035fd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/stand_for_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "stand for_frmt",
-  "evaluation": {
-    "accuracy": 0.5662650602409639
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
deleted file mode 100644
index 3be3c52fa7e754148451902755693f0495f4e710..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/fr/underscore_refer_to_frmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "fr",
-  "template_name": "underscore refer to_frmt",
-  "evaluation": {
-    "accuracy": 0.5783132530120482
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
deleted file mode 100644
index 949c31b60675cdcef7fd55b20c28040f16d3882b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/Replace_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "Replace_ptmt",
-  "evaluation": {
-    "accuracy": 0.5551330798479087
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
deleted file mode 100644
index ca312b24a35e6eee10689ce9b36562e3525a6146..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/True_or_False_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "True or False_ptmt",
-  "evaluation": {
-    "accuracy": 0.4600760456273764
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
deleted file mode 100644
index 6bc1ed74c203359004d055de6ccfb7029966c693..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/does_underscore_refer_to_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "does underscore refer to_ptmt",
-  "evaluation": {
-    "accuracy": 0.5513307984790875
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
deleted file mode 100644
index 7aaa161d306ac5cf9ac2dbc18dfa356c209a237f..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/stand_for_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "stand for_ptmt",
-  "evaluation": {
-    "accuracy": 0.532319391634981
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
deleted file mode 100644
index 96182d824f2bb313ce572d25990c7d647e6d7177..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/pt/underscore_refer_to_ptmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "pt",
-  "template_name": "underscore refer to_ptmt",
-  "evaluation": {
-    "accuracy": 0.5361216730038023
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
deleted file mode 100644
index 3f844872631efc4a4bdae4466528d694216e5834..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/Replace_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "Replace_zhmt",
-  "evaluation": {
-    "accuracy": 0.6130952380952381
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
deleted file mode 100644
index 3936a48ccce8997a4f8b4667cc4ba8bfbc3c0b49..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/True_or_False_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "True or False_zhmt",
-  "evaluation": {
-    "accuracy": 0.5416666666666666
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
deleted file mode 100644
index 3aa2985b0967db2c47e54ddaa98b4f757e85a5b5..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/does_underscore_refer_to_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "does underscore refer to_zhmt",
-  "evaluation": {
-    "accuracy": 0.5793650793650794
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
deleted file mode 100644
index 0037df9b42fa6ab40eb5106ae130594838d95548..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/stand_for_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "stand for_zhmt",
-  "evaluation": {
-    "accuracy": 0.5158730158730159
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
deleted file mode 100644
index 703bbb0e5938c8a1219a5452fb1692557ed43b4d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/Muennighoff_xwinograd/zh/underscore_refer_to_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "Muennighoff/xwinograd",
-  "dataset_config_name": "zh",
-  "template_name": "underscore refer to_zhmt",
-  "evaluation": {
-    "accuracy": 0.625
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.csv b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.csv
deleted file mode 100644
index 4a9402f0137311e94081fe18b349de0b71ea3071..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.csv
+++ /dev/null
@@ -1,86 +0,0 @@
-dataset,prompt,metric,value
-xcopa_id,C1 or C2? premise_idmt,accuracy,0.52
-xcopa_id,best_option_idmt,accuracy,0.63
-xcopa_id,cause_effect_idmt,accuracy,0.64
-xcopa_id,i_am_hesitating_idmt,accuracy,0.66
-xcopa_id,plausible_alternatives_idmt,accuracy,0.71
-xcopa_id,median,accuracy,0.64
-xcopa_sw,C1 or C2? premise_swmt,accuracy,0.6
-xcopa_sw,best_option_swmt,accuracy,0.59
-xcopa_sw,cause_effect_swmt,accuracy,0.57
-xcopa_sw,i_am_hesitating_swmt,accuracy,0.61
-xcopa_sw,plausible_alternatives_swmt,accuracy,0.59
-xcopa_sw,median,accuracy,0.59
-xcopa_ta,C1 or C2? premise_tamt,accuracy,0.6
-xcopa_ta,best_option_tamt,accuracy,0.56
-xcopa_ta,cause_effect_tamt,accuracy,0.58
-xcopa_ta,i_am_hesitating_tamt,accuracy,0.54
-xcopa_ta,plausible_alternatives_tamt,accuracy,0.54
-xcopa_ta,median,accuracy,0.56
-xcopa_vi,C1 or C2? premise_vimt,accuracy,0.63
-xcopa_vi,best_option_vimt,accuracy,0.73
-xcopa_vi,cause_effect_vimt,accuracy,0.72
-xcopa_vi,i_am_hesitating_vimt,accuracy,0.71
-xcopa_vi,plausible_alternatives_vimt,accuracy,0.77
-xcopa_vi,median,accuracy,0.72
-xcopa_zh,C1 or C2? premise_zhmt,accuracy,0.61
-xcopa_zh,best_option_zhmt,accuracy,0.69
-xcopa_zh,cause_effect_zhmt,accuracy,0.8
-xcopa_zh,i_am_hesitating_zhmt,accuracy,0.74
-xcopa_zh,plausible_alternatives_zhmt,accuracy,0.76
-xcopa_zh,median,accuracy,0.74
-xstory_cloze_ar,Answer Given options_armt,accuracy,0.6664460622104567
-xstory_cloze_ar,Choose Story Ending_armt,accuracy,0.8385175380542687
-xstory_cloze_ar,Generate Ending_armt,accuracy,0.5843812045003309
-xstory_cloze_ar,Novel Correct Ending_armt,accuracy,0.827928524156188
-xstory_cloze_ar,Story Continuation and Options_armt,accuracy,0.8246194573130378
-xstory_cloze_ar,median,accuracy,0.8246194573130378
-xstory_cloze_es,Answer Given options_esmt,accuracy,0.8325612177365983
-xstory_cloze_es,Choose Story Ending_esmt,accuracy,0.8881535407015222
-xstory_cloze_es,Generate Ending_esmt,accuracy,0.6776968894771674
-xstory_cloze_es,Novel Correct Ending_esmt,accuracy,0.8656518861681006
-xstory_cloze_es,Story Continuation and Options_esmt,accuracy,0.886168100595632
-xstory_cloze_es,median,accuracy,0.8656518861681006
-xstory_cloze_eu,Answer Given options_eumt,accuracy,0.5678358702845797
-xstory_cloze_eu,Choose Story Ending_eumt,accuracy,0.7326273990734613
-xstory_cloze_eu,Generate Ending_eumt,accuracy,0.5095962938451357
-xstory_cloze_eu,Novel Correct Ending_eumt,accuracy,0.6558570483123759
-xstory_cloze_eu,Story Continuation and Options_eumt,accuracy,0.7193911317008603
-xstory_cloze_eu,median,accuracy,0.6558570483123759
-xstory_cloze_hi,Answer Given options_himt,accuracy,0.7054930509596293
-xstory_cloze_hi,Choose Story Ending_himt,accuracy,0.8041032428855063
-xstory_cloze_hi,Generate Ending_himt,accuracy,0.614824619457313
-xstory_cloze_hi,Novel Correct Ending_himt,accuracy,0.7584381204500331
-xstory_cloze_hi,Story Continuation and Options_himt,accuracy,0.7981469225678358
-xstory_cloze_hi,median,accuracy,0.7584381204500331
-xstory_cloze_id,Answer Given options_idmt,accuracy,0.7326273990734613
-xstory_cloze_id,Choose Story Ending_idmt,accuracy,0.8457974851091992
-xstory_cloze_id,Generate Ending_idmt,accuracy,0.5678358702845797
-xstory_cloze_id,Novel Correct Ending_idmt,accuracy,0.8226340172071476
-xstory_cloze_id,Story Continuation and Options_idmt,accuracy,0.8246194573130378
-xstory_cloze_id,median,accuracy,0.8226340172071476
-xstory_cloze_zh,Answer Given options_zhmt,accuracy,0.7935142289874255
-xstory_cloze_zh,Choose Story Ending_zhmt,accuracy,0.8590337524818001
-xstory_cloze_zh,Generate Ending_zhmt,accuracy,0.6307081403044341
-xstory_cloze_zh,Novel Correct Ending_zhmt,accuracy,0.8590337524818001
-xstory_cloze_zh,Story Continuation and Options_zhmt,accuracy,0.8464592984778293
-xstory_cloze_zh,median,accuracy,0.8464592984778293
-xwinograd_fr,Replace_frmt,accuracy,0.5542168674698795
-xwinograd_fr,True or False_frmt,accuracy,0.46987951807228917
-xwinograd_fr,does underscore refer to_frmt,accuracy,0.5301204819277109
-xwinograd_fr,stand for_frmt,accuracy,0.5662650602409639
-xwinograd_fr,underscore refer to_frmt,accuracy,0.5783132530120482
-xwinograd_fr,median,accuracy,0.5542168674698795
-xwinograd_pt,Replace_ptmt,accuracy,0.5551330798479087
-xwinograd_pt,True or False_ptmt,accuracy,0.4600760456273764
-xwinograd_pt,does underscore refer to_ptmt,accuracy,0.5513307984790875
-xwinograd_pt,stand for_ptmt,accuracy,0.532319391634981
-xwinograd_pt,underscore refer to_ptmt,accuracy,0.5361216730038023
-xwinograd_pt,median,accuracy,0.5361216730038023
-xwinograd_zh,Replace_zhmt,accuracy,0.6130952380952381
-xwinograd_zh,True or False_zhmt,accuracy,0.5416666666666666
-xwinograd_zh,does underscore refer to_zhmt,accuracy,0.5793650793650794
-xwinograd_zh,stand for_zhmt,accuracy,0.5158730158730159
-xwinograd_zh,underscore refer to_zhmt,accuracy,0.625
-xwinograd_zh,median,accuracy,0.5793650793650794
-multiple,average,multiple,0.692383103411949
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.json
deleted file mode 100644
index 623985f31ca9ea9cd805f20932eab97e9204faa9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/merged.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Muennighoff/xstory_cloze_ar": {"Answer Given options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Answer Given options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6664460622104567}, "template_name": "Answer Given options_armt"}, "Choose Story Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Choose Story Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8385175380542687}, "template_name": "Choose Story Ending_armt"}, "Generate Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Generate Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5843812045003309}, "template_name": "Generate Ending_armt"}, "Novel Correct Ending_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Novel Correct Ending_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.827928524156188}, "template_name": "Novel Correct Ending_armt"}, "Story Continuation and Options_armt": {"arguments": "Namespace(config_name=None, dataset_config_name='ar', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ar', template_name='Story Continuation and Options_armt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ar", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8246194573130378}, "template_name": "Story Continuation and Options_armt"}}, "Muennighoff/xstory_cloze_es": {"Answer Given options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Answer Given options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8325612177365983}, "template_name": "Answer Given options_esmt"}, "Choose Story Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Choose Story Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8881535407015222}, "template_name": "Choose Story Ending_esmt"}, "Generate Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Generate Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6776968894771674}, "template_name": "Generate Ending_esmt"}, "Novel Correct Ending_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Novel Correct Ending_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8656518861681006}, "template_name": "Novel Correct Ending_esmt"}, "Story Continuation and Options_esmt": {"arguments": "Namespace(config_name=None, dataset_config_name='es', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='es', template_name='Story Continuation and Options_esmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "es", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.886168100595632}, "template_name": "Story Continuation and Options_esmt"}}, "Muennighoff/xstory_cloze_eu": {"Answer Given options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Answer Given options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5678358702845797}, "template_name": "Answer Given options_eumt"}, "Choose Story Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Choose Story Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7326273990734613}, "template_name": "Choose Story Ending_eumt"}, "Generate Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Generate Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5095962938451357}, "template_name": "Generate Ending_eumt"}, "Novel Correct Ending_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Novel Correct Ending_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6558570483123759}, "template_name": "Novel Correct Ending_eumt"}, "Story Continuation and Options_eumt": {"arguments": "Namespace(config_name=None, dataset_config_name='eu', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='eu', template_name='Story Continuation and Options_eumt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "eu", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7193911317008603}, "template_name": "Story Continuation and Options_eumt"}}, "Muennighoff/xstory_cloze_hi": {"Answer Given options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Answer Given options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7054930509596293}, "template_name": "Answer Given options_himt"}, "Choose Story Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Choose Story Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8041032428855063}, "template_name": "Choose Story Ending_himt"}, "Generate Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Generate Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.614824619457313}, "template_name": "Generate Ending_himt"}, "Novel Correct Ending_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Novel Correct Ending_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7584381204500331}, "template_name": "Novel Correct Ending_himt"}, "Story Continuation and Options_himt": {"arguments": "Namespace(config_name=None, dataset_config_name='hi', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='hi', template_name='Story Continuation and Options_himt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "hi", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7981469225678358}, "template_name": "Story Continuation and Options_himt"}}, "Muennighoff/xstory_cloze_id": {"Answer Given options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Answer Given options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7326273990734613}, "template_name": "Answer Given options_idmt"}, "Choose Story Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Choose Story Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8457974851091992}, "template_name": "Choose Story Ending_idmt"}, "Generate Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Generate Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.5678358702845797}, "template_name": "Generate Ending_idmt"}, "Novel Correct Ending_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Novel Correct Ending_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8226340172071476}, "template_name": "Novel Correct Ending_idmt"}, "Story Continuation and Options_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='Story Continuation and Options_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8246194573130378}, "template_name": "Story Continuation and Options_idmt"}}, "Muennighoff/xstory_cloze_zh": {"Answer Given options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Answer Given options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.7935142289874255}, "template_name": "Answer Given options_zhmt"}, "Choose Story Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Choose Story Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8590337524818001}, "template_name": "Choose Story Ending_zhmt"}, "Generate Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Generate Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.6307081403044341}, "template_name": "Generate Ending_zhmt"}, "Novel Correct Ending_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Novel Correct Ending_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8590337524818001}, "template_name": "Novel Correct Ending_zhmt"}, "Story Continuation and Options_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xstory_cloze', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='Story Continuation and Options_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xstory_cloze", "evaluation": {"accuracy": 0.8464592984778293}, "template_name": "Story Continuation and Options_zhmt"}}, "Muennighoff/xwinograd_fr": {"Replace_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='Replace_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5542168674698795}, "template_name": "Replace_frmt"}, "True or False_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='True or False_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.46987951807228917}, "template_name": "True or False_frmt"}, "does underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='does underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5301204819277109}, "template_name": "does underscore refer to_frmt"}, "stand for_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='stand for_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5662650602409639}, "template_name": "stand for_frmt"}, "underscore refer to_frmt": {"arguments": "Namespace(config_name=None, dataset_config_name='fr', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='fr', template_name='underscore refer to_frmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "fr", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5783132530120482}, "template_name": "underscore refer to_frmt"}}, "Muennighoff/xwinograd_pt": {"Replace_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='Replace_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5551330798479087}, "template_name": "Replace_ptmt"}, "True or False_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='True or False_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.4600760456273764}, "template_name": "True or False_ptmt"}, "does underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='does underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5513307984790875}, "template_name": "does underscore refer to_ptmt"}, "stand for_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='stand for_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.532319391634981}, "template_name": "stand for_ptmt"}, "underscore refer to_ptmt": {"arguments": "Namespace(config_name=None, dataset_config_name='pt', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='pt', template_name='underscore refer to_ptmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "pt", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5361216730038023}, "template_name": "underscore refer to_ptmt"}}, "Muennighoff/xwinograd_zh": {"Replace_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='Replace_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.6130952380952381}, "template_name": "Replace_zhmt"}, "True or False_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='True or False_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5416666666666666}, "template_name": "True or False_zhmt"}, "does underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='does underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5793650793650794}, "template_name": "does underscore refer to_zhmt"}, "stand for_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='stand for_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.5158730158730159}, "template_name": "stand for_zhmt"}, "underscore refer to_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='Muennighoff/xwinograd', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='test', target_max_length=256, template_config_name='zh', template_name='underscore refer to_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "Muennighoff/xwinograd", "evaluation": {"accuracy": 0.625}, "template_name": "underscore refer to_zhmt"}}, "xcopa_id": {"C1 or C2? premise_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.52}, "template_name": "C1 or C2? premise_idmt"}, "best_option_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "best_option_idmt"}, "cause_effect_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.64}, "template_name": "cause_effect_idmt"}, "i_am_hesitating_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.66}, "template_name": "i_am_hesitating_idmt"}, "plausible_alternatives_idmt": {"arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "id", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.71}, "template_name": "plausible_alternatives_idmt"}}, "xcopa_sw": {"C1 or C2? premise_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_swmt"}, "best_option_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "best_option_swmt"}, "cause_effect_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.57}, "template_name": "cause_effect_swmt"}, "i_am_hesitating_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "i_am_hesitating_swmt"}, "plausible_alternatives_swmt": {"arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "sw", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.59}, "template_name": "plausible_alternatives_swmt"}}, "xcopa_ta": {"C1 or C2? premise_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.6}, "template_name": "C1 or C2? premise_tamt"}, "best_option_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.56}, "template_name": "best_option_tamt"}, "cause_effect_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.58}, "template_name": "cause_effect_tamt"}, "i_am_hesitating_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.54}, "template_name": "i_am_hesitating_tamt"}, "plausible_alternatives_tamt": {"arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "ta", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.54}, "template_name": "plausible_alternatives_tamt"}}, "xcopa_vi": {"C1 or C2? premise_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.63}, "template_name": "C1 or C2? premise_vimt"}, "best_option_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.73}, "template_name": "best_option_vimt"}, "cause_effect_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.72}, "template_name": "cause_effect_vimt"}, "i_am_hesitating_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.71}, "template_name": "i_am_hesitating_vimt"}, "plausible_alternatives_vimt": {"arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "vi", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.77}, "template_name": "plausible_alternatives_vimt"}}, "xcopa_zh": {"C1 or C2? premise_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.61}, "template_name": "C1 or C2? premise_zhmt"}, "best_option_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.69}, "template_name": "best_option_zhmt"}, "cause_effect_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.8}, "template_name": "cause_effect_zhmt"}, "i_am_hesitating_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.74}, "template_name": "i_am_hesitating_zhmt"}, "plausible_alternatives_zhmt": {"arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)", "dataset_config_name": "zh", "dataset_name": "xcopa", "evaluation": {"accuracy": 0.76}, "template_name": "plausible_alternatives_zhmt"}}}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json
deleted file mode 100644
index 770aecf4ada66a83385145616e7007789a7cb8d9..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/C1_or_C2?_premise_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "C1 or C2? premise_idmt",
-  "evaluation": {
-    "accuracy": 0.52
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='C1 or C2? premise_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/best_option_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/best_option_idmt/results.json
deleted file mode 100644
index 423b21a96fd8c6769ab5ebcaac6e25e336722eaf..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/best_option_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "best_option_idmt",
-  "evaluation": {
-    "accuracy": 0.63
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='best_option_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/cause_effect_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/cause_effect_idmt/results.json
deleted file mode 100644
index b4e8f8d7a3504693c80597ba0e247f27744daf0c..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/cause_effect_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "cause_effect_idmt",
-  "evaluation": {
-    "accuracy": 0.64
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='cause_effect_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/i_am_hesitating_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/i_am_hesitating_idmt/results.json
deleted file mode 100644
index aff6b8404ecd49a65dcf440e4a77ed71facf41cd..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/i_am_hesitating_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "i_am_hesitating_idmt",
-  "evaluation": {
-    "accuracy": 0.66
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='i_am_hesitating_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/plausible_alternatives_idmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/plausible_alternatives_idmt/results.json
deleted file mode 100644
index 093ee47bb082a31a917c4f4e1bdffbfcff027208..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/id/plausible_alternatives_idmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "id",
-  "template_name": "plausible_alternatives_idmt",
-  "evaluation": {
-    "accuracy": 0.71
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='id', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='id', template_name='plausible_alternatives_idmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
deleted file mode 100644
index 0d0b7e0c2986140a20ac0007be6a9a7d9c7f3e06..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/C1_or_C2?_premise_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "C1 or C2? premise_swmt",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='C1 or C2? premise_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/best_option_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/best_option_swmt/results.json
deleted file mode 100644
index 30ec8563658088b618c110d1449b85a22476eb41..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/best_option_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "best_option_swmt",
-  "evaluation": {
-    "accuracy": 0.59
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='best_option_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/cause_effect_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/cause_effect_swmt/results.json
deleted file mode 100644
index 8695c2a6f0d6784b1b6df9924a65ba837756f004..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/cause_effect_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "cause_effect_swmt",
-  "evaluation": {
-    "accuracy": 0.57
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='cause_effect_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json
deleted file mode 100644
index 1c30790a87601adf49bb4b88c549419044c85c1a..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/i_am_hesitating_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "i_am_hesitating_swmt",
-  "evaluation": {
-    "accuracy": 0.61
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='i_am_hesitating_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json
deleted file mode 100644
index cbb5a24fee689670d04bddd69bcae9af62589b0d..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/sw/plausible_alternatives_swmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "sw",
-  "template_name": "plausible_alternatives_swmt",
-  "evaluation": {
-    "accuracy": 0.59
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='sw', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='sw', template_name='plausible_alternatives_swmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
deleted file mode 100644
index 28dc514e513e8365cf469b4c6bcb29b7a578e100..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/C1_or_C2?_premise_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "C1 or C2? premise_tamt",
-  "evaluation": {
-    "accuracy": 0.6
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='C1 or C2? premise_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/best_option_tamt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/best_option_tamt/results.json
deleted file mode 100644
index 78542a8227f63dcf1c8e7a433b7bb8027d262573..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/best_option_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "best_option_tamt",
-  "evaluation": {
-    "accuracy": 0.56
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='best_option_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/cause_effect_tamt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/cause_effect_tamt/results.json
deleted file mode 100644
index 0b63cfc7d5d6f767e20bed692b47acbf6644acda..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/cause_effect_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "cause_effect_tamt",
-  "evaluation": {
-    "accuracy": 0.58
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='cause_effect_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json
deleted file mode 100644
index addeff34489644602e10a13a32136a853d11a027..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/i_am_hesitating_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "i_am_hesitating_tamt",
-  "evaluation": {
-    "accuracy": 0.54
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='i_am_hesitating_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json
deleted file mode 100644
index 956fbb245f1ce4cb95cabc8bb699a728716e7a49..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/ta/plausible_alternatives_tamt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "ta",
-  "template_name": "plausible_alternatives_tamt",
-  "evaluation": {
-    "accuracy": 0.54
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='ta', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='ta', template_name='plausible_alternatives_tamt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
deleted file mode 100644
index 2028137a9042824411e47cae1ea23f520716e0c0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/C1_or_C2?_premise_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "C1 or C2? premise_vimt",
-  "evaluation": {
-    "accuracy": 0.63
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='C1 or C2? premise_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/best_option_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/best_option_vimt/results.json
deleted file mode 100644
index 67fb890f45a0d2e5054c512361c5454687ba31e1..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/best_option_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "best_option_vimt",
-  "evaluation": {
-    "accuracy": 0.73
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='best_option_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/cause_effect_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/cause_effect_vimt/results.json
deleted file mode 100644
index 33fcb6b0693c93cd8908bca9e278290442c11cf0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/cause_effect_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "cause_effect_vimt",
-  "evaluation": {
-    "accuracy": 0.72
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='cause_effect_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json
deleted file mode 100644
index 93cc0bc854520f7c93ebeb07b7d3d73ecc93cf19..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/i_am_hesitating_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "i_am_hesitating_vimt",
-  "evaluation": {
-    "accuracy": 0.71
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='i_am_hesitating_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json
deleted file mode 100644
index bdd4d1a4aa56e09ec45ee9b8f071edae2dd7450b..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/vi/plausible_alternatives_vimt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "vi",
-  "template_name": "plausible_alternatives_vimt",
-  "evaluation": {
-    "accuracy": 0.77
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='vi', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='vi', template_name='plausible_alternatives_vimt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
deleted file mode 100644
index 2970afcaeb8d36e3a35214b3c4c778ac18752cf0..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/C1_or_C2?_premise_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "C1 or C2? premise_zhmt",
-  "evaluation": {
-    "accuracy": 0.61
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='C1 or C2? premise_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/best_option_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/best_option_zhmt/results.json
deleted file mode 100644
index 4cd6431f41ee122238b455e34f0bf4c95e383c5e..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/best_option_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "best_option_zhmt",
-  "evaluation": {
-    "accuracy": 0.69
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='best_option_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/cause_effect_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/cause_effect_zhmt/results.json
deleted file mode 100644
index aa0aa530ab5f84188622f842b58fd663198c7bb7..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/cause_effect_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "cause_effect_zhmt",
-  "evaluation": {
-    "accuracy": 0.8
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='cause_effect_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json
deleted file mode 100644
index b61d725b211203c13659dc312de7ca007207fa17..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/i_am_hesitating_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "i_am_hesitating_zhmt",
-  "evaluation": {
-    "accuracy": 0.74
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='i_am_hesitating_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file
diff --git a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json b/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json
deleted file mode 100644
index 049f76fe863b55ba2f0881cb3b146bba011f82cc..0000000000000000000000000000000000000000
--- a/evaluation_bloommz-7b1/evaluation_xwinstorycopamt/xcopa/zh/plausible_alternatives_zhmt/results.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-  "dataset_name": "xcopa",
-  "dataset_config_name": "zh",
-  "template_name": "plausible_alternatives_zhmt",
-  "evaluation": {
-    "accuracy": 0.76
-  },
-  "arguments": "Namespace(config_name=None, dataset_config_name='zh', dataset_name='xcopa', debug=False, dtype='float16', max_length=2048, model_name_or_path='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt', output_dir='/gpfsscratch/rech/six/commun/experiments/muennighoff/bloomckpt/6b3t0/tr13f-6b3-ml-t0-lmtoks341b-t0toks4b-xp3mt/evaluation', pad_to_max_length=False, per_device_eval_batch_size=8, prefixlm=False, split='validation', target_max_length=256, template_config_name='zh', template_name='plausible_alternatives_zhmt', tokenizer_name=None, use_slow_tokenizer=False)"
-}
\ No newline at end of file