File size: 16,150 Bytes
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399f6b4
 
fea7ea6
 
 
 
 
 
399f6b4
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f57813a
fea7ea6
399f6b4
fea7ea6
 
 
 
 
 
 
 
 
f57813a
fea7ea6
399f6b4
fea7ea6
 
 
f57813a
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
f57813a
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399f6b4
 
 
 
 
 
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399f6b4
 
 
 
 
 
 
fea7ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f57813a
 
 
 
 
 
 
 
 
399f6b4
f57813a
 
 
 
fea7ea6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import gradio as gr
import pandas as pd
import pingouin as pg
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt

# Gradio theme for the demo: the Soft base theme with an indigo/gray palette,
# white page and block backgrounds and a custom primary-button colour.
my_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="gray",
    font=[gr.themes.GoogleFont("Source Sans Pro")],
).set(
    # A leading "#" introduces a hex colour in CSS, so "#White" was not a valid
    # value; use the named colour, consistent with block_background_fill.
    body_background_fill="White",
    block_background_fill="White",
    button_primary_background_fill="#8B71FF",
    button_cancel_text_color="White",
)
sns.set(color_codes=True, font_scale=1.2)

# Date tag of the "Arena Elo (<date>)" column used as the reference score.
TARGET_DATE = "240515"
SCORE_PATH = "db/score_240517.csv"

# The six benchmarks averaged into the derived "H6-Avg" column.
H6_BENCHMARKS = ["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]

score_df = pd.read_csv(SCORE_PATH)
score_df["H6-Avg"] = score_df[H6_BENCHMARKS].mean(axis=1).round(2)

# "H6-Avg" is appended before this slice, so it is selectable as well.
# NOTE(review): assumes the first eight CSV columns are not meant to be
# offered in the benchmark dropdowns -- confirm against the CSV layout.
AVAILABLE_SCORES = score_df.columns[8:].tolist()
AVAILABLE_MODELS = score_df["Model"].tolist()

# Initial selections of the report tab. DEFAULT_SCORES is a copy so that a
# change to one list can never leak into the other.
DEFAULT_SCORES = list(H6_BENCHMARKS)
DEFAULT_MODELS = [
    "SOLAR-10.7B-Instruct-v1.0",
    "Mistral-7B-Instruct-v0.2",
    "Gemma-7B-it",
    "Llama-3-8b-instruct",
]


def get_report(models_list, benchmarks_list):
    """Build the radar chart and the summary tables for a model selection.

    Args:
        models_list: Values of the ``Model`` column of ``score_df`` to include.
        benchmarks_list: Names of the score columns to compare. The list is
            not modified.

    Returns:
        A tuple ``(fig, rank_table, score_table)``:

        * ``fig`` -- plotly polar line chart with one trace per model. When
          ``"MT-Bench"`` is selected it is plotted as ``"MT-Bench (x10)"``
          (the score multiplied by 10) and placed last on the axis order.
        * ``rank_table`` -- ``Organization``, ``Model``, ``Size``, ``Ranking``
          and ``Total_avg`` (row mean of the selected benchmarks, rounded to
          two decimals), sorted by ``Ranking``.
        * ``score_table`` -- ``Model`` plus the selected benchmark columns, in
          the same row order.
    """
    # Work on a copy: callers pass module-level defaults and Gradio component
    # values, which must not be altered by the MT-Bench substitution below.
    benchmarks = list(benchmarks_list)

    report_df = score_df[score_df["Model"].isin(models_list)].copy()
    report_df["MT-Bench (x10)"] = report_df["MT-Bench"] * 10

    table = report_df[["Organization", "Model", "Size"] + benchmarks].copy()
    table["Total_avg"] = table[benchmarks].mean(axis=1).round(2)
    # method="min" gives tied models the same, best rank (the default
    # "average" yields fractional ranks that astype(int) would truncate);
    # na_option="bottom" ranks rows without any score last instead of
    # producing NaN, which cannot be cast to int.
    table["Ranking"] = (
        table["Total_avg"]
        .rank(ascending=False, method="min", na_option="bottom")
        .astype(int)
    )
    table = table.sort_values("Ranking").reset_index(drop=True)
    rank_table = table[["Organization", "Model", "Size", "Ranking", "Total_avg"]]
    score_table = table[["Model"] + benchmarks]

    # The chart shows the rescaled MT-Bench column in place of the raw one.
    radar_axes = list(benchmarks)
    if "MT-Bench" in radar_axes:
        radar_axes.remove("MT-Bench")
        radar_axes.append("MT-Bench (x10)")

    # Long format: one (model, benchmark, score) row per plotted point.
    rows = [
        (model, bench, score)
        for bench in radar_axes
        for model, score in zip(report_df["Model"], report_df[bench])
    ]
    figure_df = pd.DataFrame(rows, columns=["model", "benchmark", "score"])

    fig = px.line_polar(
        figure_df,
        r="score",
        theta="benchmark",
        line_close=True,
        category_orders={"benchmark": radar_axes},
        color="model",
        markers=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title="LLM Evaluation Report (by Evalverse)",
        width=800,
    )

    return fig, rank_table, score_table


def get_corr_table(benchmarks_list=None):
    """Return pairwise Pearson correlation statistics between score columns.

    Args:
        benchmarks_list: Optional list of score column names. When given (and
            non-empty) the ``Arena Elo (<TARGET_DATE>)`` column is put in
            front of it; otherwise every column of ``score_df`` from index 4
            onwards is used.

    Returns:
        The result of ``DataFrame.pairwise_corr(method="pearson")`` on the
        selected columns.
    """
    if not benchmarks_list:
        selected_columns = score_df.columns[4:]
    else:
        arena_column = f"Arena Elo ({TARGET_DATE})"
        selected_columns = [arena_column] + benchmarks_list

    return score_df[selected_columns].pairwise_corr(method="pearson")


def get_corr_figure(benchmarks_list=None):
    """Draw an annotated heatmap of the correlations between score columns.

    Args:
        benchmarks_list: Optional list of score column names. When given (and
            non-empty) the ``Arena Elo (<TARGET_DATE>)`` column is put in
            front of it; otherwise every column of ``score_df`` from index 4
            onwards is used.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure holds the
        heatmap drawn by this call.
    """
    selected_columns = (
        [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
        if benchmarks_list
        else score_df.columns[4:]
    )
    corr_matrix = score_df[selected_columns].corr()

    # Start a fresh, large figure so the annotated cells stay readable.
    plt.figure(figsize=(21, 14))
    sns.heatmap(data=corr_matrix, cmap="RdBu", annot=True, linewidths=3)
    plt.xticks(rotation=45)
    plt.title("Correlation - LLM Benchmarks", size=30)

    return plt


def get_analysis_figure(bench_name):
    """Scatter-plot one benchmark against the Arena Elo reference column.

    Args:
        bench_name: Name of the ``score_df`` column to put on the x axis.

    Returns:
        A plotly scatter figure with marginal histograms on both axes and an
        OLS trendline; hovering a point shows its organization and model.
    """
    arena_column = f"Arena Elo ({TARGET_DATE})"
    plot_options = {
        "marginal_x": "histogram",
        "marginal_y": "histogram",
        "width": 450,
        "hover_data": ["Organization", "Model"],
        "trendline": "ols",
        "trendline_color_override": "#27138F",
    }
    return px.scatter(score_df, x=bench_name, y=arena_column, **plot_options)


# Pre-compute what the UI shows before any button is clicked: the report for
# the default models/benchmarks and the correlation table over all columns.
report_plot, rank_table, score_table = get_report(
    models_list=DEFAULT_MODELS,
    benchmarks_list=DEFAULT_SCORES,
)
corr_table = get_corr_table(benchmarks_list=None)


# Assemble the Gradio app: a logo/intro header, four tabs (evaluation report,
# evaluation analysis, full leaderboard, about) and a collapsible list of the
# score sources.
with gr.Blocks(theme=my_theme) as demo:
    # Header: logo followed by the intro / changelog text.
    with gr.Row():
        # NOTE(review): Gradio documents `scale` as an integer; confirm that the
        # fractional values used for the images in this block behave as intended.
        gr.Image(
            "asset/evalverse_logo.png",
            show_label=False,
            show_download_button=False,
            scale=0.4,
        )
    with gr.Row():
        gr.Markdown(
            """
        The Universe of Evaluation. All about the evaluation for LLMs.\n
        Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) • [Paper](https://arxiv.org/abs/2404.00943) • [Docs](https://evalverse.gitbook.io/evalverse-docs)].
        
        ### 🚀 Newly updated
        [2024.05.17]
        - Weekly scores: `Arena Elo (240515)`, `Arena Elo (240508)`, `Arena Elo (240501)`
        - New benchmarks: [`AlpacaEval 2.0`](https://tatsu-lab.github.io/alpaca_eval/), [`MMLU-Pro`](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro) 
        - New models: `GPT-4o-0513`, `Grok-1`, `OpenELM`, `Qwen-Max-0428`, `Snowflake-Arctic-Instruct`, `Yi-Large`
        - New tab: `🏆 Full leaderboard`
        """
        )

    # Tab 1: radar chart and tables for a user-chosen set of models/benchmarks.
    with gr.Tab("📊 LLM Evaluation Report"):
        with gr.Row():
            model_list = gr.Dropdown(
                AVAILABLE_MODELS,
                value=DEFAULT_MODELS,
                multiselect=True,
                label="Models",
                info="Select models to evaluate",
            )
            bench_list = gr.Dropdown(
                AVAILABLE_SCORES,
                value=DEFAULT_SCORES,
                multiselect=True,
                label="Benchmarks",
                info="Select benchmarks to evaluate",
            )
        btn = gr.Button("Report!", variant="primary")
        with gr.Row():
            output_figure = gr.Plot(report_plot, label="Report")
        with gr.Row():
            gr.Markdown("## Summary")
        with gr.Row():
            output_rank_table = gr.DataFrame(rank_table)
        with gr.Row():
            gr.Markdown("## Detailed scores")
        with gr.Row():
            output_score_table = gr.DataFrame(score_table)

        # Recompute all three outputs from the current dropdown selections.
        btn.click(
            fn=get_report,
            inputs=[model_list, bench_list],
            outputs=[output_figure, output_rank_table, output_score_table],
        )

    # Tab 2: per-benchmark scatter plots against Arena Elo plus the
    # correlation heatmap and statistics table.
    with gr.Tab("🧐 LLM Evaluation Analysis"):
        # Shared by the dropdowns and the figures rendered at start-up, so the
        # initial plots always match the initially selected benchmarks.
        default_bench_a = "MT-Bench"
        default_bench_b = "H6-Avg"

        with gr.Row():
            bench_a = gr.Dropdown(
                AVAILABLE_SCORES,
                value=default_bench_a,
                label="A Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
            bench_b = gr.Dropdown(
                AVAILABLE_SCORES,
                value=default_bench_b,
                label="B Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
        with gr.Row():
            btn_a = gr.Button("Analyze A!", variant="primary")
            btn_b = gr.Button("Analyze B!", variant="primary")
        with gr.Row():
            mtbench_figure = get_analysis_figure(default_bench_a)
            h6avg_figure = get_analysis_figure(default_bench_b)

            figure_a = gr.Plot(mtbench_figure, label="Selected A")
            figure_b = gr.Plot(h6avg_figure, label="Selected B")

            btn_a.click(fn=get_analysis_figure, inputs=bench_a, outputs=figure_a)
            btn_b.click(fn=get_analysis_figure, inputs=bench_b, outputs=figure_b)

        with gr.Row():
            gr.Markdown("## Analysis")
        with gr.Row():
            corr_figure = get_corr_figure()
            output_corr_figure = gr.Plot(corr_figure, label="Correlations")
        with gr.Row():
            output_corr_table = gr.DataFrame(corr_table, label="Detailed statistics")
        with gr.Row():
            # Legend for the columns of the statistics table above.
            gr.Markdown(
                """
                - `X`: Name(s) of first columns.
                - `Y`: Name(s) of second columns.
                - `method`: Correlation type.
                - `alternative`: Tail of the test.
                - `n`: Sample size (after removal of missing values).
                - `r`: Correlation coefficients.
                - `CI95%`: 95% parametric confidence intervals.
                - `p-unc`: Uncorrected p-values.
                - `BF10`: Bayes Factor of the alternative hypothesis (only for Pearson correlation)
                - `power`: achieved power of the test (= 1 - type II error).

                Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
            """
            )

    # Tab 3: every model, sorted in descending order by the selected columns
    # (earlier columns take precedence).
    with gr.Tab("🏆 Full leaderboard"):
        lb_selected = [
            f"Arena Elo ({TARGET_DATE})",
            "MT-Bench",
            "MMLU",
            "Arena-Hard",
            "EQ-Bench",
            "MAGI-Hard",
            "LC-AlpacaEval-2.0",
            "MMLU-Pro",
            "H6-Avg",
        ]
        lb = score_df[["Organization", "Model", "Size"] + lb_selected]
        lb = lb.sort_values(lb_selected, ascending=False)
        gr.DataFrame(lb)

    # Tab 4: static project description.
    with gr.Tab("🌌 About Evalverse"):
        gr.Markdown(
            """
            ## 🌌 Introduction
            **Evalverse** is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results.
        """
        )
        with gr.Row():
            gr.Image(
                "asset/overview.png",
                show_label=False,
                show_download_button=False,
                scale=0.6,
            )
        gr.Markdown(
            """
            ### With Evalverse, you are empowered to
            - access various evaluation methods without juggling multiple libraries.
            - receive insightful report about the evaluation results that helps you to compare the varied scores across different models.
            - initiate evaluation and generate reports without any code via Slack bot.

            ## 🌌 Architecture of Evalverse
        """
        )
        with gr.Row():
            gr.Image(
                "asset/architecture.png",
                show_label=False,
                show_download_button=False,
                scale=0.8,
            )
        gr.Markdown(
            """
            - `Submodule`. The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date. 
            - `Connector`. The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries.
            - `Evaluator`. The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation.
            - `Compute Cluster`. The Compute Cluster is the collection of hardware accelerators needed to execute the LLM evaluation processes. When the Evaluator schedules an evaluation job to be ran, the Compute Cluster fetches the required model and data files from the Database. The results of the evaluation jobs are sent to the Database for storage.
            - `Database`. The Database stores the model files and data needed in the evaluation processes, along with evaluation results. The stored evaluation results are used by the Reporter to create evaluation reports for the user.
            - `Reporter`. The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive table and figure that summarize evaluation results.

            ## 🌌 Key Features of Evalverse
            - **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation.
            - **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an activate Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest.
            - **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database.

            ## 🌌 Supported Evaluations
            We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input!

            | Evaluation                | Original Repository                        |
            |---------------------------|--------------------------------------------|
            | H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)|
            | MT-bench                  | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)|
            | IFEval                    | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)|
            | EQ-Bench                  | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)|

            ## 🌌 Acknowledgements
            Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs).

            ## 🌌 License
            Evalverse is completely freely-accessible open-source and licensed under the Apache License 2.0.

            ## 🌌 Citation
            If you want to cite our 🌌 Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943).

            ```bibtex
            @misc{kim2024evalverse,
                title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation}, 
                author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park},
                year={2024},
                eprint={2404.00943},
                archivePrefix={arXiv},
                primaryClass={cs.CL}
            }
            ```
        """
        )

    # Footer shown below every tab: where the scores were collected from.
    with gr.Row():
        with gr.Accordion("The scores are collected from ...", open=False):
            gr.Markdown(
                """                
                - [HuggingFace Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
                - [LMSYS Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)
                - [EQ-Bench Leaderboard](https://eqbench.com/)
                - [Arena-Hard Leaderboard](https://lmsys.org/blog/2024-04-19-arena-hard/#full-leaderboard-with-gpt-4-turbo-as-judge)
                - [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/)
                - [MMLU-Pro Leaderboard](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro#4-leaderboard)
                - Results from [Evalverse](https://github.com/UpstageAI/evalverse)

            """
            )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()