{ "config_general": { "lighteval_sha": "?", "num_fewshot_seeds": 1, "override_batch_size": 1, "max_samples": null, "job_id": "", "start_time": 1048188.318193615, "end_time": 1048420.852422956, "total_evaluation_time_secondes": "232.53422934107948", "model_name": "teknium/OpenHermes-2.5-Mistral-7B", "model_sha": "24c0bea14d53e6f67f1fbe2eca5bfe7cae389b33", "model_dtype": "torch.bfloat16", "model_size": "13.99 GB", "config": null }, "results": { "harness|bbh:causal_judgment|3": { "em": 0.5935828877005348, "em_stderr": 0.036013904358574426, "qem": 0.5935828877005348, "qem_stderr": 0.036013904358574426, "pem": 0.5935828877005348, "pem_stderr": 0.036013904358574426, "pqem": 0.5935828877005348, "pqem_stderr": 0.036013904358574426, "perfect_em": 0.5935828877005348, "perfect_em_stderr": 0.036013904358574426 }, "harness|bbh:date_understanding|3": { "em": 0.428, "em_stderr": 0.031355968923772626, "qem": 0.428, "qem_stderr": 0.031355968923772626, "pem": 0.428, "pem_stderr": 0.031355968923772626, "pqem": 0.528, "pqem_stderr": 0.03163648953154439, "perfect_em": 0.428, "perfect_em_stderr": 0.031355968923772626 }, "harness|bbh:disambiguation_qa|3": { "em": 0.664, "em_stderr": 0.029933259094191516, "qem": 0.664, "qem_stderr": 0.029933259094191516, "pem": 0.664, "pem_stderr": 0.029933259094191516, "pqem": 0.736, "pqem_stderr": 0.027934518957690908, "perfect_em": 0.664, "perfect_em_stderr": 0.029933259094191516 }, "harness|bbh:geometric_shapes|3": { "em": 0.252, "em_stderr": 0.027513851933031363, "qem": 0.252, "qem_stderr": 0.027513851933031363, "pem": 0.252, "pem_stderr": 0.027513851933031363, "pqem": 0.252, "pqem_stderr": 0.027513851933031363, "perfect_em": 0.252, "perfect_em_stderr": 0.027513851933031363 }, "harness|bbh:logical_deduction_five_objects|3": { "em": 0.48, "em_stderr": 0.03166085340849519, "qem": 0.48, "qem_stderr": 0.03166085340849519, "pem": 0.48, "pem_stderr": 0.03166085340849519, "pqem": 0.58, "pqem_stderr": 0.03127799950463661, "perfect_em": 0.48, "perfect_em_stderr": 0.03166085340849519 }, "harness|bbh:logical_deduction_seven_objects|3": { "em": 0.408, "em_stderr": 0.031145209846548488, "qem": 0.408, "qem_stderr": 0.031145209846548488, "pem": 0.408, "pem_stderr": 0.031145209846548488, "pqem": 0.516, "pqem_stderr": 0.03166998503010741, "perfect_em": 0.408, "perfect_em_stderr": 0.031145209846548488 }, "harness|bbh:logical_deduction_three_objects|3": { "em": 0.648, "em_stderr": 0.030266288057359942, "qem": 0.648, "qem_stderr": 0.030266288057359942, "pem": 0.648, "pem_stderr": 0.030266288057359942, "pqem": 0.84, "pqem_stderr": 0.02323271478206066, "perfect_em": 0.648, "perfect_em_stderr": 0.030266288057359942 }, "harness|bbh:movie_recommendation|3": { "em": 0.6867469879518072, "em_stderr": 0.02945236466291991, "qem": 0.6867469879518072, "qem_stderr": 0.02945236466291991, "pem": 0.6867469879518072, "pem_stderr": 0.02945236466291991, "pqem": 0.7349397590361446, "pqem_stderr": 0.028026723251674716, "perfect_em": 0.6867469879518072, "perfect_em_stderr": 0.02945236466291991 }, "harness|bbh:navigate|3": { "em": 0.572, "em_stderr": 0.031355968923772626, "qem": 0.572, "qem_stderr": 0.031355968923772626, "pem": 0.572, "pem_stderr": 0.031355968923772626, "pqem": 0.572, "pqem_stderr": 0.031355968923772626, "perfect_em": 0.572, "perfect_em_stderr": 0.031355968923772626 }, "harness|bbh:reasoning_about_colored_objects|3": { "em": 0.332, "em_stderr": 0.02984403904746591, "qem": 0.332, "qem_stderr": 0.02984403904746591, "pem": 0.428, "pem_stderr": 0.03135596892377261, "pqem": 0.528, "pqem_stderr": 0.031636489531544396, "perfect_em": 0.332, "perfect_em_stderr": 0.02984403904746591 }, "harness|bbh:ruin_names|3": { "em": 0.5403225806451613, "em_stderr": 0.031710615183950554, "qem": 0.5403225806451613, "qem_stderr": 0.031710615183950554, "pem": 0.5403225806451613, "pem_stderr": 0.031710615183950554, "pqem": 0.6330645161290323, "pqem_stderr": 0.030666934450850083, "perfect_em": 0.5403225806451613, "perfect_em_stderr": 0.031710615183950554 }, "harness|bbh:salient_translation_error_detection|3": { "em": 0.344, "em_stderr": 0.03010450339231639, "qem": 0.344, "qem_stderr": 0.03010450339231639, "pem": 0.344, "pem_stderr": 0.03010450339231639, "pqem": 0.484, "pqem_stderr": 0.031669985030107414, "perfect_em": 0.344, "perfect_em_stderr": 0.03010450339231639 }, "harness|bbh:snarks|3": { "em": 0.7696629213483146, "em_stderr": 0.03164794946543343, "qem": 0.7696629213483146, "qem_stderr": 0.03164794946543343, "pem": 0.7696629213483146, "pem_stderr": 0.03164794946543343, "pqem": 0.8426966292134831, "pqem_stderr": 0.027366421373452483, "perfect_em": 0.7696629213483146, "perfect_em_stderr": 0.03164794946543343 }, "harness|bbh:sports_understanding|3": { "em": 0.824, "em_stderr": 0.024133497525457112, "qem": 0.824, "qem_stderr": 0.024133497525457112, "pem": 0.824, "pem_stderr": 0.024133497525457112, "pqem": 0.824, "pqem_stderr": 0.024133497525457112, "perfect_em": 0.824, "perfect_em_stderr": 0.024133497525457112 }, "harness|bbh:temporal_sequences|3": { "em": 0.296, "em_stderr": 0.028928939388379635, "qem": 0.296, "qem_stderr": 0.028928939388379635, "pem": 0.296, "pem_stderr": 0.028928939388379635, "pqem": 0.472, "pqem_stderr": 0.0316364895315444, "perfect_em": 0.296, "perfect_em_stderr": 0.028928939388379635 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "em": 0.2, "em_stderr": 0.02534897002097908, "qem": 0.2, "qem_stderr": 0.02534897002097908, "pem": 0.2, "pem_stderr": 0.02534897002097908, "pqem": 0.388, "pqem_stderr": 0.030881038748993908, "perfect_em": 0.2, "perfect_em_stderr": 0.02534897002097908 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "em": 0.12, "em_stderr": 0.02059360059683994, "qem": 0.12, "qem_stderr": 0.02059360059683994, "pem": 0.12, "pem_stderr": 0.02059360059683994, "pqem": 0.252, "pqem_stderr": 0.02751385193303136, "perfect_em": 0.12, "perfect_em_stderr": 0.02059360059683994 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "em": 0.388, "em_stderr": 0.03088103874899391, "qem": 0.388, "qem_stderr": 0.03088103874899391, "pem": 0.388, "pem_stderr": 0.03088103874899391, "pqem": 0.684, "pqem_stderr": 0.029462657598578676, "perfect_em": 0.388, "perfect_em_stderr": 0.03088103874899391 }, "harness|bbh:_average|3": { "em": 0.47479529875810095, "em_stderr": 0.02954949014324901, "qem": 0.47479529875810095, "qem_stderr": 0.02954949014324901, "pem": 0.48012863209143425, "pem_stderr": 0.02963348624748827, "pqem": 0.5811268773377329, "pqem_stderr": 0.029646084555369608, "perfect_em": 0.47479529875810095, "perfect_em_stderr": 0.02954949014324901 } }, "versions": { "harness|bbh:causal_judgment|3": 0, "harness|bbh:date_understanding|3": 0, "harness|bbh:disambiguation_qa|3": 0, "harness|bbh:geometric_shapes|3": 0, "harness|bbh:logical_deduction_five_objects|3": 0, "harness|bbh:logical_deduction_seven_objects|3": 0, "harness|bbh:logical_deduction_three_objects|3": 0, "harness|bbh:movie_recommendation|3": 0, "harness|bbh:navigate|3": 0, "harness|bbh:reasoning_about_colored_objects|3": 0, "harness|bbh:ruin_names|3": 0, "harness|bbh:salient_translation_error_detection|3": 0, "harness|bbh:snarks|3": 0, "harness|bbh:sports_understanding|3": 0, "harness|bbh:temporal_sequences|3": 0, "harness|bbh:tracking_shuffled_objects_five_objects|3": 0, "harness|bbh:tracking_shuffled_objects_seven_objects|3": 0, "harness|bbh:tracking_shuffled_objects_three_objects|3": 0 }, "config_tasks": { "harness|bbh:causal_judgment": { "name": "bbh:causal_judgment", "prompt_function": "bbh_causal_judgment", "hf_repo": "lukaemon/bbh", "hf_subset": "causal_judgement", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 187, "effective_num_docs": 187, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:date_understanding": { "name": "bbh:date_understanding", "prompt_function": "bbh_date_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "date_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:disambiguation_qa": { "name": "bbh:disambiguation_qa", "prompt_function": "bbh_disambiguation_qa", "hf_repo": "lukaemon/bbh", "hf_subset": "disambiguation_qa", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:geometric_shapes": { "name": "bbh:geometric_shapes", "prompt_function": "bbh_geometric_shapes", "hf_repo": "lukaemon/bbh", "hf_subset": "geometric_shapes", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_five_objects": { "name": "bbh:logical_deduction_five_objects", "prompt_function": "bbh_logical_deduction_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_seven_objects": { "name": "bbh:logical_deduction_seven_objects", "prompt_function": "bbh_logical_deduction_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:logical_deduction_three_objects": { "name": "bbh:logical_deduction_three_objects", "prompt_function": "bbh_logical_deduction_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "logical_deduction_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:movie_recommendation": { "name": "bbh:movie_recommendation", "prompt_function": "bbh_movie_recommendation", "hf_repo": "lukaemon/bbh", "hf_subset": "movie_recommendation", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 249, "effective_num_docs": 249, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:navigate": { "name": "bbh:navigate", "prompt_function": "bbh_navigate", "hf_repo": "lukaemon/bbh", "hf_subset": "navigate", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:reasoning_about_colored_objects": { "name": "bbh:reasoning_about_colored_objects", "prompt_function": "bbh_reasoning_about_colored_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "reasoning_about_colored_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:ruin_names": { "name": "bbh:ruin_names", "prompt_function": "bbh_ruin_names", "hf_repo": "lukaemon/bbh", "hf_subset": "ruin_names", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 248, "effective_num_docs": 248, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:salient_translation_error_detection": { "name": "bbh:salient_translation_error_detection", "prompt_function": "bbh_salient_translation_error_detection", "hf_repo": "lukaemon/bbh", "hf_subset": "salient_translation_error_detection", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:snarks": { "name": "bbh:snarks", "prompt_function": "bbh_snarks", "hf_repo": "lukaemon/bbh", "hf_subset": "snarks", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 178, "effective_num_docs": 178, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:sports_understanding": { "name": "bbh:sports_understanding", "prompt_function": "bbh_sports_understanding", "hf_repo": "lukaemon/bbh", "hf_subset": "sports_understanding", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:temporal_sequences": { "name": "bbh:temporal_sequences", "prompt_function": "bbh_temporal_sequences", "hf_repo": "lukaemon/bbh", "hf_subset": "temporal_sequences", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_five_objects": { "name": "bbh:tracking_shuffled_objects_five_objects", "prompt_function": "bbh_tracking_shuffled_objects_five_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_five_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_seven_objects": { "name": "bbh:tracking_shuffled_objects_seven_objects", "prompt_function": "bbh_tracking_shuffled_objects_seven_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_seven_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null }, "harness|bbh:tracking_shuffled_objects_three_objects": { "name": "bbh:tracking_shuffled_objects_three_objects", "prompt_function": "bbh_tracking_shuffled_objects_three_objects", "hf_repo": "lukaemon/bbh", "hf_subset": "tracking_shuffled_objects_three_objects", "metric": [ "exact_match", "quasi_exact_match", "prefix_exact_match", "prefix_quasi_exact_match", "perfect_exact_match" ], "hf_avail_splits": [ "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": null, "generation_size": 20, "stop_sequence": [ "", "Q:", "\n\n" ], "output_regex": null, "frozen": false, "suite": [ "harness" ], "original_num_docs": 250, "effective_num_docs": 250, "trust_dataset": true, "must_remove_duplicate_docs": null } }, "summary_tasks": { "harness|bbh:causal_judgment|3": { "hashes": { "hash_examples": "63218f5ae055ab2b", "hash_full_prompts": "7303fa1d0fe0b29a", "hash_input_tokens": "94e6ca97dc7a8d65", "hash_cont_tokens": "c3d0b9e4e0ee81b9" }, "truncated": 187, "non_truncated": 0, "padded": 0, "non_padded": 187, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:date_understanding|3": { "hashes": { "hash_examples": "f145c7a06def3c8e", "hash_full_prompts": "69e60d10afa5a6f1", "hash_input_tokens": "56c1b1dfb318cc75", "hash_cont_tokens": "13813e073a67c71c" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:disambiguation_qa|3": { "hashes": { "hash_examples": "19677fd1773f7eb9", "hash_full_prompts": "ae0a8fd428f9aee3", "hash_input_tokens": "bc3e442621b75177", "hash_cont_tokens": "cd37ffdb5b2c05eb" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:geometric_shapes|3": { "hashes": { "hash_examples": "76c7b11a13cc72a9", "hash_full_prompts": "76633257f67207f9", "hash_input_tokens": "18d576df2960751d", "hash_cont_tokens": "665887996d172717" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_five_objects|3": { "hashes": { "hash_examples": "0e958c856332a745", "hash_full_prompts": "3c96645848786efd", "hash_input_tokens": "36a60b866a1bf813", "hash_cont_tokens": "314ac0615c6ba8b2" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_seven_objects|3": { "hashes": { "hash_examples": "ab9de25a5eb40d09", "hash_full_prompts": "185c5851c101ee66", "hash_input_tokens": "c1e2e1d71455bb49", "hash_cont_tokens": "8be80ca215d5b2a3" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:logical_deduction_three_objects|3": { "hashes": { "hash_examples": "3c6bf52517714218", "hash_full_prompts": "8ba2d94357e589d0", "hash_input_tokens": "70f1b3c78b924815", "hash_cont_tokens": "9066a1f8bf0c0fc5" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:movie_recommendation|3": { "hashes": { "hash_examples": "2d9dc4975935d31a", "hash_full_prompts": "a411e216d0f5f626", "hash_input_tokens": "d671ce3b88ee45cd", "hash_cont_tokens": "4ddad062def5e8ef" }, "truncated": 249, "non_truncated": 0, "padded": 0, "non_padded": 249, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:navigate|3": { "hashes": { "hash_examples": "ba91dcdb9a064255", "hash_full_prompts": "ebb3084ecc78a46a", "hash_input_tokens": "51743c1fef4a5482", "hash_cont_tokens": "11beb4a48b985d44" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:reasoning_about_colored_objects|3": { "hashes": { "hash_examples": "a6ba328c4c3385d2", "hash_full_prompts": "38328d016a4ebef3", "hash_input_tokens": "6897c18acd616cb9", "hash_cont_tokens": "675a7012cb001b34" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:ruin_names|3": { "hashes": { "hash_examples": "2ef28d5f2d4fdd25", "hash_full_prompts": "9c7d0493c37182d6", "hash_input_tokens": "4d618e950c8d013d", "hash_cont_tokens": "8ede606d015dca4f" }, "truncated": 248, "non_truncated": 0, "padded": 0, "non_padded": 248, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:salient_translation_error_detection|3": { "hashes": { "hash_examples": "c13f25ec8ffed496", "hash_full_prompts": "edccd4061b168b78", "hash_input_tokens": "fcdd25281b1eba05", "hash_cont_tokens": "cbf517be41f28f3d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:snarks|3": { "hashes": { "hash_examples": "5f6db7bff7f6f22e", "hash_full_prompts": "31cafd95ab850a44", "hash_input_tokens": "16886c991ce348c1", "hash_cont_tokens": "df20170bf621a2f3" }, "truncated": 178, "non_truncated": 0, "padded": 0, "non_padded": 178, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:sports_understanding|3": { "hashes": { "hash_examples": "042afbe5d9c1f02d", "hash_full_prompts": "3d46581e9bbec2d0", "hash_input_tokens": "117a2d8c0e6cb894", "hash_cont_tokens": "ec37569600892a26" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:temporal_sequences|3": { "hashes": { "hash_examples": "803a05f352eb6afc", "hash_full_prompts": "4a54db144a5dd222", "hash_input_tokens": "70740a88f84e4a13", "hash_cont_tokens": "41d96a59ba9c957b" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_five_objects|3": { "hashes": { "hash_examples": "2bbac6db7ab0d527", "hash_full_prompts": "e3079106787cc311", "hash_input_tokens": "c8d7203b8c369cb8", "hash_cont_tokens": "fd6b5bd0d85dd2e6" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_seven_objects|3": { "hashes": { "hash_examples": "845caf093ac2b58c", "hash_full_prompts": "6364e5b860590ec8", "hash_input_tokens": "beb0b08cec3d048f", "hash_cont_tokens": "2707546adcfc144d" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 }, "harness|bbh:tracking_shuffled_objects_three_objects|3": { "hashes": { "hash_examples": "9004f14d5a32b9a8", "hash_full_prompts": "01aef56c4d1fe9fe", "hash_input_tokens": "9642e09abf045647", "hash_cont_tokens": "8468cb37dd7c4590" }, "truncated": 250, "non_truncated": 0, "padded": 0, "non_padded": 250, "effective_few_shots": 3.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "4ff1e3dc5703575d", "hash_full_prompts": "1cbeab0a00117cb8", "hash_input_tokens": "2a35e6d8f7c2fc79", "hash_cont_tokens": "bd9a6fe0e1a8deb1" }, "truncated": 4362, "non_truncated": 0, "padded": 0, "non_padded": 4362, "num_truncated_few_shots": 0 } }