jihoo-kim committed on
Commit
fea7ea6
1 Parent(s): 4287161

Initial commit

Browse files
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd
import pingouin as pg  # noqa: F401 -- registers the DataFrame.pairwise_corr accessor used below
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt

# Custom Gradio theme for the Evalverse leaderboard app.
my_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="gray",
    font=[gr.themes.GoogleFont("Source Sans Pro")],
).set(
    # Fix: the original value "#White" is not a valid CSS color ("#" must be
    # followed by hex digits), so the body background was silently ignored.
    body_background_fill="White",
    block_background_fill="White",
    button_primary_background_fill="#8B71FF",
    button_cancel_text_color="White",
)
sns.set(color_codes=True, font_scale=1.2)


# Score database: one row per model, one column per benchmark.
SCORE_PATH = "db/score_240429.csv"
score_df = pd.read_csv(SCORE_PATH)
# H6-Avg = mean of the six Open LLM Leaderboard benchmarks (NaNs are skipped
# by pandas' mean, so models with partial coverage still get a value).
score_df["H6-Avg"] = (
    score_df[["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]]
    .mean(axis=1)
    .round(2)
)
# Benchmark columns start after Organization/Model/Size. "Arena Elo" is kept
# in the dataframe as the reference axis for the analysis tab, but is not a
# user-selectable benchmark.
AVAILABLE_SCORES = score_df.columns[3:].tolist()
AVAILABLE_SCORES.remove("Arena Elo")
AVAILABLE_MODELS = score_df["Model"].to_list()

# Initial dropdown selections shown when the app loads.
DEFAULT_SCORES = ["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]
DEFAULT_MODELS = [
    "SOLAR-10.7B-Instruct-v1.0",
    "Mistral-7B-Instruct-v0.2",
    "Gemma-7B-it",
    "Llama-3-8b-instruct",
]
def get_report(models_list, benchmarks_list):
    """Build the evaluation report for the selected models and benchmarks.

    Args:
        models_list: model names to include (matched against score_df["Model"]).
        benchmarks_list: benchmark column names to aggregate and plot.

    Returns:
        (fig, rank_table, score_table):
            fig -- plotly radar chart of per-benchmark scores per model,
            rank_table -- models ranked by the mean of the selected benchmarks,
            score_table -- raw per-benchmark scores for the selected models.
    """
    # Fix: work on a copy of benchmarks_list. The original implementation
    # mutated the caller's list in place (remove/append below), which would
    # corrupt shared lists such as DEFAULT_SCORES across calls.
    benchmarks_list = list(benchmarks_list)

    report_df = score_df.copy()
    # MT-Bench is scored 0-10; scale to 0-100 so the radar axes are comparable.
    report_df["MT-Bench (x10)"] = report_df["MT-Bench"] * 10
    report_df = report_df[report_df["Model"].isin(models_list)]

    table = report_df[["Organization", "Model", "Size"] + benchmarks_list].copy()
    table["Total_avg"] = table[benchmarks_list].mean(axis=1).round(2)
    table["Ranking"] = table["Total_avg"].rank(ascending=False).astype(int)
    table = table.sort_values("Ranking").reset_index(drop=True)
    rank_table = table[["Organization", "Model", "Size", "Ranking", "Total_avg"]]
    score_table = table[["Model"] + benchmarks_list]

    # The tables above show the raw MT-Bench score; the figure uses the
    # scaled column instead.
    if "MT-Bench" in benchmarks_list:
        benchmarks_list.remove("MT-Bench")
        benchmarks_list.append("MT-Bench (x10)")

    # Long-format rows (model, benchmark, score) for px.line_polar.
    scores = [
        [model, bench, value]
        for bench in benchmarks_list
        for model, value in report_df[["Model", bench]].values
    ]
    figure_df = pd.DataFrame(scores, columns=["model", "benchmark", "score"])

    fig = px.line_polar(
        figure_df,
        r="score",
        theta="benchmark",
        line_close=True,
        category_orders={"benchmark": benchmarks_list},
        color="model",
        markers=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title="LLM Evaluation Report (by Evalverse)",
        width=800,
    )

    return fig, rank_table, score_table
def get_corr_table(benchmarks_list=None):
    """Return pairwise Pearson correlation statistics (via pingouin).

    When benchmarks are given, "Arena Elo" is prepended so every selected
    benchmark is correlated against it; otherwise all benchmark columns
    (everything after Organization/Model/Size) are used.
    """
    if benchmarks_list:
        columns = ["Arena Elo"] + benchmarks_list
    else:
        columns = score_df.columns[3:]

    # pairwise_corr is the pingouin accessor monkeypatched onto DataFrame.
    return score_df[columns].pairwise_corr(method="pearson")
def get_corr_figure(benchmarks_list=None):
    """Render a seaborn heatmap of benchmark correlations on a new figure.

    Returns the matplotlib pyplot module (Gradio's gr.Plot accepts it and
    draws the current figure).
    """
    columns = (
        ["Arena Elo"] + benchmarks_list if benchmarks_list else score_df.columns[3:]
    )

    correlation_matrix = score_df[columns].corr()

    plt.figure(figsize=(16, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap="RdBu", linewidths=3)
    plt.xticks(rotation=45)
    plt.title("Correlation - LLM Benchmarks", size=30)

    return plt
def get_analysis_figure(bench_name):
    """Scatter a benchmark against Arena Elo with marginal histograms.

    An OLS trendline is overlaid so the strength of the relationship between
    the chosen benchmark and Arena Elo is visible at a glance.
    """
    scatter_kwargs = dict(
        x=bench_name,
        y="Arena Elo",
        marginal_x="histogram",
        marginal_y="histogram",
        width=450,
        hover_data=["Organization", "Model"],
        trendline="ols",
        trendline_color_override="#27138F",
    )
    return px.scatter(score_df, **scatter_kwargs)
# Pre-compute the default report and correlation table so the page is
# populated on first load, before any user interaction.
report_plot, rank_table, score_table = get_report(DEFAULT_MODELS, DEFAULT_SCORES)
corr_table = get_corr_table()


with gr.Blocks(theme=my_theme) as demo:
    # --- Header: logo and one-line project intro ---
    with gr.Row():
        gr.Image(
            "asset/evalverse_logo.png",
            show_label=False,
            show_download_button=False,
            scale=0.4,  # NOTE(review): Gradio documents `scale` as an int -- confirm fractional value is intended
        )
    with gr.Row():
        gr.Markdown(
            """
            The Universe of Evaluation. All about the evaluation for LLMs.\n
            Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) • [Paper](https://arxiv.org/abs/2404.00943) • [Docs](https://evalverse.gitbook.io/evalverse-docs)].

            """
        )

    # --- Tab 1: radar-chart report + ranking/score tables for chosen models ---
    with gr.Tab("📊 LLM Evaluation Report"):
        with gr.Row():
            model_list = gr.Dropdown(
                AVAILABLE_MODELS,
                value=DEFAULT_MODELS,
                multiselect=True,
                label="Models",
                info="Select models to evaluate",
            )
            bench_list = gr.Dropdown(
                AVAILABLE_SCORES,
                value=DEFAULT_SCORES,
                multiselect=True,
                label="Benchmarks",
                info="Select benchmarks to evaluate",
            )
            btn = gr.Button("Report!", variant="primary")
        with gr.Row():
            output_figure = gr.Plot(report_plot, label="Report")
        with gr.Row():
            gr.Markdown("## Summary")
        with gr.Row():
            output_rank_table = gr.DataFrame(rank_table)
        with gr.Row():
            gr.Markdown("## Detailed scores")
        with gr.Row():
            output_score_table = gr.DataFrame(score_table)

        # Regenerate figure and both tables whenever the user clicks "Report!".
        btn.click(
            fn=get_report,
            inputs=[model_list, bench_list],
            outputs=[output_figure, output_rank_table, output_score_table],
        )

    # --- Tab 2: correlation of individual benchmarks with Arena Elo ---
    with gr.Tab("🧐 LLM Evaluation Analysis"):
        with gr.Row():
            bench_a = gr.Dropdown(
                AVAILABLE_SCORES,
                value="MT-Bench",
                label="A Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
            bench_b = gr.Dropdown(
                AVAILABLE_SCORES,
                value="H6-Avg",
                label="B Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
        with gr.Row():
            btn_a = gr.Button("Analyze A!", variant="primary")
            btn_b = gr.Button("Analyze B!", variant="primary")
        with gr.Row():
            # Default scatter plots shown before any analysis is requested.
            mtbench_figure = get_analysis_figure("MT-Bench")
            h6avg_figure = get_analysis_figure("H6-Avg")

            figure_a = gr.Plot(mtbench_figure, label="Selected A")
            figure_b = gr.Plot(h6avg_figure, label="Selected B")

        btn_a.click(fn=get_analysis_figure, inputs=bench_a, outputs=figure_a)
        btn_b.click(fn=get_analysis_figure, inputs=bench_b, outputs=figure_b)

        with gr.Row():
            gr.Markdown("## Analysis")
        with gr.Row():
            # Static heatmap + detailed pairwise statistics over all benchmarks.
            corr_figure = get_corr_figure()
            output_corr_figure = gr.Plot(corr_figure, label="Correlations")
        with gr.Row():
            output_corr_table = gr.DataFrame(corr_table, label="Detailed statistics")
        with gr.Row():
            # Legend for the columns of the pingouin pairwise_corr output above.
            gr.Markdown(
                """
                - `X`: Name(s) of first columns.
                - `Y`: Name(s) of second columns.
                - `method`: Correlation type.
                - `alternative`: Tail of the test.
                - `n`: Sample size (after removal of missing values).
                - `r`: Correlation coefficients.
                - `CI95`': 95% parametric confidence intervals.
                - `p-unc`: Uncorrected p-values.
                - `BF10`: Bayes Factor of the alternative hypothesis (only for Pearson correlation)
                - `power`: achieved power of the test (= 1 - type II error).

                Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
                """
            )

    # --- Tab 3: static project documentation ---
    with gr.Tab("🌌 About Evalverse"):
        gr.Markdown(
            """
            ## 🌌 Introduction
            **Evalverse** is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results.
            """
        )
        with gr.Row():
            gr.Image(
                "asset/overview.png",
                show_label=False,
                show_download_button=False,
                scale=0.6,
            )
        gr.Markdown(
            """
            ### With Evalverse, you are empowered to
            - access various evaluation methods without juggling multiple libraries.
            - receive insightful report about the evaluation results that helps you to compare the varied scores across different models.
            - initiate evaluation and generate reports without any code via Slack bot.

            ## 🌌 Architecture of Evalverse
            """
        )
        with gr.Row():
            gr.Image(
                "asset/architecture.png",
                show_label=False,
                show_download_button=False,
                scale=0.8,
            )
        gr.Markdown(
            """
            - `Submodule`. The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date.
            - `Connector`. The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries.
            - `Evaluator`. The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation.
            - `Compute Cluster`. The Compute Cluster is the collection of hardware accelerators needed to execute the LLM evaluation processes. When the Evaluator schedules an evaluation job to be ran, the Compute Cluster fetches the required model and data files from the Database. The results of the evaluation jobs are sent to the Database for storage.
            - `Database`. The Database stores the model files and data needed in the evaluation processes, along with evaluation results. The stored evaluation results are used by the Reporter to create evaluation reports for the user.
            - `Reporter`. The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive table and figure that summarize evaluation results.

            ## 🌌 Key Features of Evalverse
            - **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation.
            - **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an activate Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest.
            - **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database.

            ## 🌌 Supported Evaluations
            We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input!

            | Evaluation | Original Repository |
            |---------------------------|--------------------------------------------|
            | H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)|
            | MT-bench | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)|
            | IFEval | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)|
            | EQ-Bench | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)|

            ## 🌌 Acknowledgements
            Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs).

            ## 🌌 License
            Evalverse is completely freely-accessible open-source and licensed under the Apache License 2.0.

            ## 🌌 Citation
            If you want to cite our 🌌 Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943).

            ```bibtex
            @misc{kim2024evalverse,
                title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation},
                author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park},
                year={2024},
                eprint={2404.00943},
                archivePrefix={arXiv},
                primaryClass={cs.CL}
            }
            ```
            """
        )

if __name__ == "__main__":
    demo.launch()
asset/architecture.png ADDED
asset/evalverse_logo.png ADDED
asset/overview.png ADDED
db/score_240429.csv ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Organization,Model,Size,Arena Elo,MT-Bench,Arena-Hard,EQ-Bench,MAGI-Hard,ARC-c,HellaSwag,MMLU,TruthfulQA,WinoGrande,GSM-8K
2
+ Anthropic,Claude-1,,1150,7.9,,76.83,,,,77,,,
3
+ Anthropic,Claude-2.0,,1132,8.06,24,72.89,,,,78.5,,,
4
+ Anthropic,Claude-2.1,,1119,8.18,22.8,73.96,,,,,,,
5
+ Anthropic,Claude-3-Haiku-20240307,,1181,,41.5,63.65,47.71,89.2,85.9,75.2,,74.2,
6
+ Anthropic,Claude-3-Opus-20240229,,1251,9.43,60.4,82.19,76.55,96.4,95.4,86.8,,88.5,
7
+ Anthropic,Claude-3-Sonnet-20240229,,1202,9.18,46.8,80.45,61.01,93.2,89,79,,75.1,
8
+ Cohere,Command-R-Plus,104B,1192,,33.1,76.11,49.7,70.39,87.96,74.02,56.95,83.82,47.31
9
+ Cohere,Command-R-v01,35B,1148,,17,56.05,43.27,65.53,87,68.2,52.32,81.53,56.63
10
+ Databricks,DBRX-Instruct,132B,1102,8.26,23.9,76.82,57.13,68.9,89,73.7,66.9,81.8,66.9
11
+ DeepSeekAI,DeepSeek-LLM-67B-Chat,67B,1079,,,,,67.75,86.82,72.42,55.85,84.21,63.68
12
+ DeepSeekAI,DeepSeek-LLM-7B-Chat,7B,,,,,,55.8,79.38,51.75,47.98,74.82,46.55
13
+ Google,Gemini-1.0-Pro,,1136,,17.8,,,,,71.8,,,
14
+ Google,Gemini-1.5-Pro-API-0409-Preview,,1248,,,,,,,81.9,,,
15
+ Google,Gemma-1.1-7B-it,7B,1085,,,59.17,38.43,,,,,,
16
+ Google,Gemma-2B-it,2B,,,3,23.26,24.16,43.94,62.7,37.65,45.82,60.93,5.46
17
+ Google,Gemma-7B-it,7B,1043,,7.5,61.72,24.85,53.2,81.2,64.3,31.81,72.3,46.4
18
+ OpenAI,GPT-3.5-Turbo-0125,,1106,,23.3,64.97,42.65,,,,,,
19
+ OpenAI,GPT-3.5-Turbo-0301,,1108,7.94,18.1,70.67,46.66,85.2,85.5,70,47,81.6,57.1
20
+ OpenAI,GPT-3.5-Turbo-0613,,1120,8.39,24.8,69.35,40.55,,,,,,
21
+ OpenAI,GPT-3.5-Turbo-1106,,1072,8.32,18.9,71.74,43.17,,,,,,
22
+ OpenAI,GPT-4-0125-Preview,,1247,,78,83.87,76.83,,,,,,
23
+ OpenAI,GPT-4-0314,,1189,8.96,50,85.73,75.67,96.3,95.3,86.4,59,87.5,92
24
+ OpenAI,GPT-4-0613,,1165,9.18,37.9,84.79,77.85,,,,,,
25
+ OpenAI,GPT-4-1106-Preview,,1253,9.32,,86.05,74.96,,,,,,
26
+ OpenAI,GPT-4-Turbo-2024-04-09,,1257,,82.6,86.35,77.74,,,,,,
27
+ InternLM,InternLM2-Chat-20B,20B,,7.9,,,,,,66.5,,,
28
+ InternLM,InternLM2-Chat-7B,7B,,7.7,,62.61,38.43,,,63.7,,,
29
+ Meta,Llama-2-13b-chat-hf,13B,1054,6.65,,49.12,28.2,59.04,81.94,54.64,44.12,74.51,15.24
30
+ Meta,Llama-2-70b-chat-hf,70B,1088,6.86,11.6,73.59,35.4,64.59,85.88,63.91,52.8,80.51,26.69
31
+ Meta,Llama-2-7b-chat-hf,7B,1040,6.27,4.6,36.32,27.5,52.9,78.55,48.32,45.57,71.74,7.35
32
+ Meta,Llama-3-70b-instruct,70B,1207,,41.1,82.13,67.97,71.42,85.69,80.06,61.81,82.87,85.44
33
+ Meta,Llama-3-8b-instruct,8B,1146,,20.6,68.88,63.84,60.75,78.55,67.07,51.65,74.51,68.69
34
+ Mistral,Mistral-7B-Instruct-v0.1,7B,1011,6.84,,52.15,30.69,54.52,75.63,55.38,56.28,73.72,14.25
35
+ Mistral,Mistral-7B-Instruct-v0.2,7B,1073,7.6,12.6,68.18,34.69,63.14,84.88,60.78,68.26,77.19,40.03
36
+ Mistral,Mistral-large-2402,,1158,8.66,37.7,85.17,67.69,94,89.2,81.2,50.5,86.7,81
37
+ Mistral,Mistral-medium,,1148,8.61,31.9,82.57,62.15,89.9,88,75.3,,88,66.7
38
+ Mistral,Mixtral-8x22B-Instruct-v0.1,141B,1147,,36.4,78.79,62.41,72.7,89.08,77.77,68.14,85.16,82.03
39
+ Mistral,Mixtral-8x7b-Instruct-v0.1,47B,1114,8.3,23.4,72.37,45.74,70.22,87.63,71.16,64.58,81.37,60.73
40
+ OpenChat,OpenChat-3.5-0106,7B,1098,7.8,,,,66.04,82.93,65.04,51.9,81.77,68.16
41
+ OrionStarAI,Orion-14B-Chat,14B,,7.37,,59.71,40.74,,,61.7,,,
42
+ Microsoft,Phi-3-Mini-128k-Instruct,3.8B,1064,,,,,63.14,80.09,68.7,54.12,72.85,69.52
43
+ Microsoft,Phi-3-Mini-4k-Instruct,3.8B,,,,58.15,53.26,62.97,80.6,69.08,59.88,72.38,74.53
44
+ Alibaba,Qwen-14B-Chat,14B,1038,6.96,,63.47,39.74,,,66.5,,,
45
+ Alibaba,Qwen-7B-Chat,7B,,,,50.11,33.44,,,57,,,
46
+ Alibaba,Qwen1.5-1.8B-Chat,1.8B,,,,24.12,31.56,38.74,60.02,45.87,40.62,59.67,19.03
47
+ Alibaba,Qwen1.5-14B-Chat,14B,1119,7.91,,74.99,49.27,58.79,82.33,68.52,60.38,73.32,30.86
48
+ Alibaba,Qwen1.5-32B-Chat,32B,1135,8.3,,75.59,60.72,66.04,85.49,74.99,66.95,77.19,7.05
49
+ Alibaba,Qwen1.5-4B-Chat,4B,,,,28.75,32.66,43.26,69.73,55.55,44.79,64.96,2.43
50
+ Alibaba,Qwen1.5-72B-Chat ,72B,1153,8.61,36.1,82.81,63.47,68.52,86.42,77.44,63.9,79.08,20.39
51
+ Alibaba,Qwen1.5-7B-Chat,7B,1073,7.6,,54.41,41.59,55.89,78.56,61.7,53.65,67.8,13.19
52
+ RekaAI,Reka-Edge,7B,,7.6,,,,,,65.7,,,
53
+ RekaAI,Reka-Flash,21B,1149,8.2,,,,,,73.5,,,
54
+ RekaAI,Reka-Core,,,,,,,,,83.2,,,
55
+ Upstage,SOLAR-10.7B-Instruct-v1.0,10.7B,1065,7.58,,73.53,39.62,71.08,88.16,66.21,71.43,83.58,64.75
56
+ Nexusflow,Starling-LM-7B-alpha,7B,1091,8.09,12.8,73.9,37.06,63.82,84.9,64.67,46.39,80.58,62.4
57
+ Nexusflow,Starling-LM-7B-beta,7B,1119,8.12,23,73.82,40.12,67.24,83.47,65.14,55.47,81.29,66.64
58
+ AllenAI,Tulu-2-DPO-70B,70B,1102,7.89,15,76.63,50.23,72.1,88.99,69.84,65.78,83.27,62.62
59
+ LMSys,Vicuna-13B-v1.5,13B,1047,6.57,,67.39,28.75,57.08,81.24,56.67,51.51,74.66,11.3
60
+ LMSys,Vicuna-33B-v1.3,33B,1093,7.12,8.6,67.07,31.66,,,59.2,,,
61
+ LMSys,Vicuna-7B-v1.1,7B,1009,6.17,,26.12,27.38,53.67,77.46,45.63,48.94,70.96,5.53
62
+ Microsoft,WizardLM-13b-v1.2,13B,1061,7.2,,63.71,29.1,,,52.7,,,
63
+ Microsoft,WizardLM-2-70B,70B,,8.92,,,,,,,,,
64
+ Microsoft,WizardLM-2-7B,7B,,8.28,,69.31,35.4,63.23,83.41,61.75,57.01,73.48,43.59
65
+ Microsoft,WizardLM-2-8x22B,141B,,9.12,,77.91,59.16,72.44,89.05,76.77,60.5,82.24,84.61
66
+ Microsoft,WizardLM-70B-v1.0,70B,1108,7.71,,,,64.52,83.21,63.32,54.6,,
67
+ 01.AI,Yi-34B-Chat,34B,1110,7.88,23.1,71.62,57.1,65.1,84.08,74.87,55.41,79.79,19.79
68
+ 01.AI,Yi-6B-Chat,6B,,,,61.79,38.74,,,60.99,,,
69
+ HuggingFace,Zephyr-7b-alpha,7B,1042,6.88,,56.82,35.15,61.01,84.04,61.39,57.9,78.61,14.03
70
+ HuggingFace,Zephyr-7b-beta,7B,1054,7.34,,58.33,35.97,62.03,84.36,61.07,57.45,77.74,29.04
71
+ HuggingFace,Zephyr-ORPO-141b-A35b-v0.1,141B,1125,8.17,,,,,,,,,
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio==4.27.0
2
+ pandas==2.2.2
3
+ pingouin==0.5.4
4
+ plotly==5.21.0
5
+ seaborn==0.13.2
6
+ matplotlib==3.8.4