WildBench-V1-legacy / data_dir /elo_ranks.all.L=1.0.jsonl
yuchenlin's picture
remove winrates and update the length penalty method
960f5ed
raw
history blame
8.57 kB
{"model name ": "claude-3-opus-20240229", "elo overall": 1123, "Information seeking": 1146, "Creative Writing": 1120, "Coding & Debugging": 1082, "Reasoning": 1135, "Editing": 1072, "Math": 1136, "Planning": 1146, "Brainstorming": 1165, "Role playing": 1087, "Advice seeking": 1107, "Data Analysis": 1074, "Others": 1031, "average": 1108.4166666666667, "# battles": 3658}
{"model name ": "mistral-large-2402", "elo overall": 1114, "Information seeking": 1118, "Creative Writing": 1098, "Coding & Debugging": 1132, "Reasoning": 1133, "Editing": 1096, "Math": 1091, "Planning": 1131, "Brainstorming": 1076, "Role playing": 1075, "Advice seeking": 1102, "Data Analysis": 1088, "Others": 1016, "average": 1096.3333333333333, "# battles": 2058}
{"model name ": "claude-3-sonnet-20240229", "elo overall": 1114, "Information seeking": 1111, "Creative Writing": 1123, "Coding & Debugging": 1072, "Reasoning": 1100, "Editing": 1075, "Math": 1067, "Planning": 1115, "Brainstorming": 1146, "Role playing": 1085, "Advice seeking": 1087, "Data Analysis": 1080, "Others": 1034, "average": 1091.25, "# battles": 2791}
{"model name ": "gpt-3.5-turbo-0125", "elo overall": 1112, "Information seeking": 1115, "Creative Writing": 1106, "Coding & Debugging": 1158, "Reasoning": 1135, "Editing": 1084, "Math": 1252, "Planning": 1109, "Brainstorming": 1080, "Role playing": 1099, "Advice seeking": 1092, "Data Analysis": 1129, "Others": 1142, "average": 1125.0833333333333, "# battles": 14196}
{"model name ": "gemini-1.0-pro", "elo overall": 1103, "Information seeking": 1121, "Creative Writing": 1085, "Coding & Debugging": 1088, "Reasoning": 1098, "Editing": 1052, "Math": 1079, "Planning": 1131, "Brainstorming": 1073, "Role playing": 1052, "Advice seeking": 1112, "Data Analysis": 1081, "Others": 1008, "average": 1081.6666666666667, "# battles": 2035}
{"model name ": "gpt-4-0125-preview", "elo overall": 1092, "Information seeking": 1065, "Creative Writing": 1095, "Coding & Debugging": 1107, "Reasoning": 1083, "Editing": 1103, "Math": 1085, "Planning": 1098, "Brainstorming": 1134, "Role playing": 1055, "Advice seeking": 1066, "Data Analysis": 1087, "Others": 1064, "average": 1086.8333333333333, "# battles": 5781}
{"model name ": "Mixtral-8x7B-Instruct-v0.1", "elo overall": 1050, "Information seeking": 1063, "Creative Writing": 1042, "Coding & Debugging": 1061, "Reasoning": 1050, "Editing": 1017, "Math": 1047, "Planning": 1034, "Brainstorming": 1043, "Role playing": 1053, "Advice seeking": 1022, "Data Analysis": 1029, "Others": 960, "average": 1035.0833333333333, "# battles": 3619}
{"model name ": "Yi-34B-Chat", "elo overall": 1019, "Information seeking": 1008, "Creative Writing": 1049, "Coding & Debugging": 990, "Reasoning": 985, "Editing": 1046, "Math": 942, "Planning": 1012, "Brainstorming": 1036, "Role playing": 1029, "Advice seeking": 1002, "Data Analysis": 972, "Others": 1000, "average": 1005.9166666666666, "# battles": 2606}
{"model name ": "Mistral-7B-Instruct-v0.2", "elo overall": 1018, "Information seeking": 1010, "Creative Writing": 1039, "Coding & Debugging": 1025, "Reasoning": 995, "Editing": 1051, "Math": 941, "Planning": 1049, "Brainstorming": 1042, "Role playing": 1025, "Advice seeking": 1036, "Data Analysis": 990, "Others": 983, "average": 1015.5, "# battles": 2519}
{"model name ": "Nous-Hermes-2-Mixtral-8x7B-DPO", "elo overall": 1017, "Information seeking": 1019, "Creative Writing": 993, "Coding & Debugging": 1030, "Reasoning": 1030, "Editing": 1006, "Math": 1053, "Planning": 998, "Brainstorming": 996, "Role playing": 997, "Advice seeking": 1002, "Data Analysis": 1006, "Others": 982, "average": 1009.3333333333334, "# battles": 1484}
{"model name ": "gemma-7b-it", "elo overall": 1010, "Information seeking": 1002, "Creative Writing": 1033, "Coding & Debugging": 953, "Reasoning": 1031, "Editing": 1012, "Math": 1038, "Planning": 1074, "Brainstorming": 1094, "Role playing": 984, "Advice seeking": 1046, "Data Analysis": 977, "Others": 990, "average": 1019.5, "# battles": 2728}
{"model name ": "zephyr-7b-gemma-v0.1", "elo overall": 985, "Information seeking": 982, "Creative Writing": 964, "Coding & Debugging": 1011, "Reasoning": 994, "Editing": 989, "Math": 1024, "Planning": 981, "Brainstorming": 963, "Role playing": 991, "Advice seeking": 954, "Data Analysis": 1011, "Others": 1001, "average": 988.75, "# battles": 1532}
{"model name ": "vicuna-13b-v1.5", "elo overall": 981, "Information seeking": 988, "Creative Writing": 986, "Coding & Debugging": 1003, "Reasoning": 977, "Editing": 992, "Math": 953, "Planning": 959, "Brainstorming": 981, "Role playing": 979, "Advice seeking": 949, "Data Analysis": 1008, "Others": 967, "average": 978.5, "# battles": 2715}
{"model name ": "tulu-2-dpo-70b", "elo overall": 967, "Information seeking": 991, "Creative Writing": 965, "Coding & Debugging": 911, "Reasoning": 980, "Editing": 908, "Math": 928, "Planning": 945, "Brainstorming": 940, "Role playing": 1004, "Advice seeking": 969, "Data Analysis": 894, "Others": 951, "average": 948.8333333333334, "# battles": 3630}
{"model name ": "gemma-2b-it", "elo overall": 960, "Information seeking": 934, "Creative Writing": 983, "Coding & Debugging": 935, "Reasoning": 964, "Editing": 962, "Math": 1030, "Planning": 955, "Brainstorming": 1011, "Role playing": 972, "Advice seeking": 972, "Data Analysis": 981, "Others": 984, "average": 973.5833333333334, "# battles": 2689}
{"model name ": "Llama-2-70b-chat-hf.nosp", "elo overall": 957, "Information seeking": 947, "Creative Writing": 939, "Coding & Debugging": 968, "Reasoning": 982, "Editing": 960, "Math": 979, "Planning": 942, "Brainstorming": 946, "Role playing": 962, "Advice seeking": 969, "Data Analysis": 1006, "Others": 1010, "average": 967.5, "# battles": 2144}
{"model name ": "Llama-2-70b-chat-hf", "elo overall": 945, "Information seeking": 934, "Creative Writing": 940, "Coding & Debugging": 987, "Reasoning": 926, "Editing": 968, "Math": 931, "Planning": 933, "Brainstorming": 924, "Role playing": 940, "Advice seeking": 936, "Data Analysis": 1005, "Others": 987, "average": 950.9166666666666, "# battles": 2461}
{"model name ": "command", "elo overall": 944, "Information seeking": 966, "Creative Writing": 964, "Coding & Debugging": 893, "Reasoning": 926, "Editing": 933, "Math": 921, "Planning": 927, "Brainstorming": 981, "Role playing": 981, "Advice seeking": 988, "Data Analysis": 929, "Others": 978, "average": 948.9166666666666, "# battles": 1939}
{"model name ": "Llama-2-13b-chat-hf.nosp", "elo overall": 940, "Information seeking": 921, "Creative Writing": 947, "Coding & Debugging": 945, "Reasoning": 924, "Editing": 958, "Math": 945, "Planning": 923, "Brainstorming": 917, "Role playing": 959, "Advice seeking": 915, "Data Analysis": 960, "Others": 991, "average": 942.0833333333334, "# battles": 2091}
{"model name ": "Llama-2-13b-chat-hf", "elo overall": 932, "Information seeking": 923, "Creative Writing": 931, "Coding & Debugging": 983, "Reasoning": 953, "Editing": 979, "Math": 929, "Planning": 932, "Brainstorming": 908, "Role playing": 912, "Advice seeking": 945, "Data Analysis": 970, "Others": 1008, "average": 947.75, "# battles": 2406}
{"model name ": "zephyr-7b-beta", "elo overall": 919, "Information seeking": 976, "Creative Writing": 917, "Coding & Debugging": 848, "Reasoning": 928, "Editing": 902, "Math": 886, "Planning": 913, "Brainstorming": 883, "Role playing": 978, "Advice seeking": 948, "Data Analysis": 915, "Others": 940, "average": 919.5, "# battles": 3543}
{"model name ": "Llama-2-7b-chat-hf.nosp", "elo overall": 914, "Information seeking": 899, "Creative Writing": 927, "Coding & Debugging": 931, "Reasoning": 885, "Editing": 972, "Math": 905, "Planning": 914, "Brainstorming": 890, "Role playing": 954, "Advice seeking": 931, "Data Analysis": 956, "Others": 1007, "average": 930.9166666666666, "# battles": 2094}
{"model name ": "Llama-2-7b-chat-hf", "elo overall": 899, "Information seeking": 871, "Creative Writing": 907, "Coding & Debugging": 962, "Reasoning": 877, "Editing": 958, "Math": 923, "Planning": 875, "Brainstorming": 893, "Role playing": 916, "Advice seeking": 904, "Data Analysis": 956, "Others": 996, "average": 919.8333333333334, "# battles": 2366}
{"model name ": "Mistral-7B-Instruct-v0.1", "elo overall": 886, "Information seeking": 897, "Creative Writing": 858, "Coding & Debugging": 925, "Reasoning": 910, "Editing": 899, "Math": 907, "Planning": 908, "Brainstorming": 883, "Role playing": 909, "Advice seeking": 939, "Data Analysis": 900, "Others": 977, "average": 909.3333333333334, "# battles": 2659}