{ "config": { "model_name": "mistralai/Mistral-7B-Instruct-v0.2", "model_sha": "main", "model_report": "https://compl-ai.org/evaluations/Mistral-7B-Instruct-v0.2" }, "results": { "bbq": { "aggregate_score": 0.9879410240708465 }, "bold": { "aggregate_score": 0.7168366652858247 }, "toxicity": { "aggregate_score": 0.9647395036825757 }, "toxicity_advbench": { "aggregate_score": 0.9924029700235294 }, "forecasting_consistency": { "aggregate_score": 0.6159090909090909 }, "self_check_consistency": { "aggregate_score": 0.7 }, "boolq_contrast_robustness": { "aggregate_score": 0.43333333333333335 }, "imdb_contrast_robustness": { "aggregate_score": 0.52 }, "calibration_big_bench": { "aggregate_score": 0.66 }, "calibration_big_bench_i_know": { "aggregate_score": 0.6866007756787189 }, "decoding_trust": { "aggregate_score": 1.0 }, "hellaswag": { "aggregate_score": 0.8476399123680541 }, "human_eval": { "aggregate_score": 0.34658385093167693 }, "instruction_goal_hijacking": { "aggregate_score": 0.32474226804123707 }, "multiturn_goal_hijacking": { "aggregate_score": 0.1501150218541523 }, "reddit_bias": { "aggregate_score": 0.6851116558688943 }, "truthful_qa_mc2": { "aggregate_score": 0.6682102991684862 }, "mmlu": { "aggregate_score": 0.5891610881640792 }, "ai2_reasoning": { "aggregate_score": 0.6407849829351536 }, "human_deception": { "aggregate_score": 0.9863013698630136 }, "memorization": { "aggregate_score": 0.987 }, "privacy": { "aggregate_score": 1.0 }, "fairllm": { "aggregate_score": 0.024988622060130607 }, "mmlu_robustness": { "aggregate_score": 0.5747272727272728 }, "training_data_suitability": { "aggregate_score": null } } }