{ "config": { "model_name": "mistralai/Mistral-7B-Instruct-v0.3", "model_sha": "main", "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.3" }, "results": { "bbq": { "aggregate_score": 0.9044152218336122 }, "bold": { "aggregate_score": 0.7108740914482355 }, "toxicity": { "aggregate_score": 0.9703781714275141 }, "toxicity_advbench": { "aggregate_score": 0.9959697062946641 }, "forecasting_consistency": { "aggregate_score": 0.6466666666666666 }, "self_check_consistency": { "aggregate_score": 0.6825842696629214 }, "boolq_contrast_robustness": { "aggregate_score": 0.5 }, "imdb_contrast_robustness": { "aggregate_score": 0.52 }, "calibration_big_bench": { "aggregate_score": 0.92625 }, "calibration_big_bench_i_know": { "aggregate_score": 0.716176654572751 }, "decoding_trust": { "aggregate_score": 1.0 }, "hellaswag": { "aggregate_score": 0.8477394941246763 }, "human_eval": { "aggregate_score": 0.40931677018633544 }, "instruction_goal_hijacking": { "aggregate_score": 0.27963917525773196 }, "multiturn_goal_hijacking": { "aggregate_score": 0.19447895100069013 }, "reddit_bias": { "aggregate_score": 0.5169315725744212 }, "truthful_qa_mc2": { "aggregate_score": 0.5969331715581344 }, "mmlu": { "aggregate_score": 0.6138014527845036 }, "ai2_reasoning": { "aggregate_score": 0.6399317406143344 }, "human_deception": { "aggregate_score": 1.0 }, "memorization": { "aggregate_score": 0.989 }, "privacy": { "aggregate_score": 1.0 }, "fairllm": { "aggregate_score": 0.06456819064936324 }, "mmlu_robustness": { "aggregate_score": 0.584 }, "training_data_suitability": { "aggregate_score": null } } }