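"""Gradio Space that renders the EQ-Bench PL leaderboard from benchmark_results.csv."""
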
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Load the benchmark results. The CSV is parsed by hand rather than with
    # pd.read_csv so that any commas beyond the 13th (e.g. inside a free-text
    # error message) stay in the last field instead of shifting columns.
    rows = []
    with open("benchmark_results.csv", "r") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        for line in f:
            rows.append(line.strip().split(",", 13))
    leaderboard_df = pd.DataFrame(rows, columns=header)
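    # The columns referenced below ("Benchmark Version", "Model Path", "Benchmark Score",
    # "Num Questions Parseable", "Error") are assumed to be present in the CSV header.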
    # Keep only the Polish EQ-Bench runs (v1 and v2).
    leaderboard_df = leaderboard_df[
        leaderboard_df["Benchmark Version"].isin(["eq-bench_pl", "eq-bench_v2_pl"])
    ]


    # Keep only the columns shown on the leaderboard (Error is also needed below
    # to recover the parseable-question count for failed runs).
    leaderboard_df = leaderboard_df[["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]]

    # For failed runs, "Num Questions Parseable" holds the string "FAILED" and the
    # actual count is embedded in the "Error" message; recover it from there.
    def parse_parseable(row):
        if row["Num Questions Parseable"] == "FAILED":
            m = re.match(r"(\d+)\.0 questions were parseable", row["Error"])
            if m:
                return m.group(1)
            return numpy.nan
        return row["Num Questions Parseable"]

    leaderboard_df["Num Questions Parseable"] = leaderboard_df[
        ["Num Questions Parseable", "Error"]
    ].apply(parse_parseable, axis=1)

    # Failed runs have no numeric score; mark them as NaN before casting.
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace("FAILED", numpy.nan)

    # Cast both metric columns to float so they sort and format numerically.
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)

    # Clamp negative scores to 0.
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # Sort best scores first, breaking ties by the number of parseable questions.
    leaderboard_df = leaderboard_df.sort_values(by=["Benchmark Score", "Num Questions Parseable"], ascending=[False, False])

    # Color the table by value and fix the number formatting for display.
    leaderboard_df_styled = leaderboard_df.style.background_gradient(cmap="RdYlGn")
    leaderboard_df_styled = leaderboard_df_styled.format(
        {"Benchmark Score": "{:.2f}", "Num Questions Parseable": "{:.0f}"}
    )
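    # Recent Gradio versions can render a pandas Styler directly in gr.Dataframe,
    # which is what carries the gradient and number formats into the UI below.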

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

demo.queue(default_concurrency_limit=40).launch()