File size: 3,962 Bytes
c3dcec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c4c73c
c3dcec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af9d904
c3dcec1
 
 
 
 
 
 
 
 
 
af9d904
c3dcec1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# matplotlib.use('macosx')
import gradio as gr
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from get_data_info import plot_data, tab_data


def create_data_interface(df):
    """Render *df* as a Gradio Dataframe component.

    The first column is displayed as text (model name) and every
    remaining column as a number (score).
    """
    column_names = list(df.columns)
    # One "str" for the leading label column, "number" for the rest.
    column_types = ["str"] + ["number"] * (len(column_names) - 1)

    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=column_names,
        datatype=column_types,
    )


def plot_radar_chart(df, attributes, category_name):
    """Build a radar (spider) chart with one trace per model.

    Args:
        df: DataFrame with a 'Model' column plus one column per attribute.
            Scores are plotted on a fixed 0-100 radial axis, so they are
            assumed to be percentages — TODO confirm with data source.
        attributes: the column names used as the radar axes.
        category_name: used as the figure title.

    Returns:
        A plotly ``Figure`` containing one filled polygon per row of *df*.
    """
    fig = go.Figure()

    # One polygon per model; the row index is not needed.
    for _, row in df.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row[attributes].tolist(),
            theta=attributes,
            fill='toself',
            name=row['Model'],
        ))

    fig.update_layout(
        title=f"{category_name}",
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],  # scores treated as percentages
            )),
        showlegend=True,
    )

    return fig


def create_data_interface_for_aggregated(df, category_name):
    """Return a radar chart for *df*, using every column after the first as an axis."""
    radar_axes = df.columns[1:]
    print(f"attributes: {radar_axes}")
    return plot_radar_chart(df, radar_axes, category_name)


def reindex_cols(fix_cols, df):
    """Project *df* onto the columns listed in *fix_cols*.

    Columns absent from *df* are skipped; the survivors keep the
    order given by *fix_cols*.
    """
    available = df.columns.values.tolist()
    ordered = [name for name in fix_cols if name in available]
    return df[ordered]


def _render_radar_row(df1, keys):
    """Render one Gradio Row holding a radar plot for each category in *keys*."""
    with gr.Row():
        for key, df in df1.items():
            if key in keys:
                # Empty-string cells mean "no score recorded"; plot them as 0.
                cleaned = df.replace('', 0)
                print(f"{key}: \n{cleaned}")
                plot = create_data_interface_for_aggregated(cleaned, key)
                gr.Plot(plot)


def launch_gradio(df1, df2):
    """Build the leaderboard UI and launch it (blocks until the server stops).

    Args:
        df1: mapping of category name -> aggregated-score DataFrame,
            rendered as radar plots in two rows.
        df2: mapping of tab name -> detail DataFrame, rendered as tables.

    NOTE(review): reads module-level TITLE and INTRODUCTION_TEXT, which are
    assigned in the ``__main__`` block before this is called — confirm no
    other entry point imports this module and calls launch_gradio directly.
    """
    demo = gr.Blocks()

    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        _render_radar_row(df1, ("Overall", "Basic Information Retrieval"))
        _render_radar_row(df1, ("Legal Foundation Inference", "Complex Legal Application"))

        # One tab per detail table.
        for key, df in df2.items():
            with gr.Tab(key):
                create_data_interface(df)

    demo.launch()


if __name__ == "__main__":
    df1 = plot_data()
    df2 = tab_data()

    # Constants
    TITLE = '<h1 align="center" id="space-title">βš–οΈ LAiW Leaderboard</h1>'
    INTRODUCTION_TEXT = """πŸ† The LAiW Leaderboard is designed to rigorously track, rank, and evaluate state-of-the-art Large Language Models in Legal.

    πŸ’‘ Our leaderboard not only covers basic Legal NLP tasks but also incorporates Legal practice tasks such as similar case matching, offering a more comprehensive evaluation for real-world Legal applications.

    🌟 Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance.

    πŸ”— For more details, refer to our GitHub page [here](https://github.com/Dai-shen/LAiW).
    """

    # BUG FIX: add_job expects a *callable*. The original passed
    # launch_gradio(df1=df1, df2=df2) — which invoked it immediately
    # (blocking on demo.launch()) and registered its None return value
    # as the job. Pass the function plus kwargs instead.
    scheduler = BackgroundScheduler()
    scheduler.add_job(
        launch_gradio,
        "interval",
        seconds=3600,
        kwargs={"df1": df1, "df2": df2},
    )
    scheduler.start()

    # Launch immediately
    launch_gradio(df1=df1, df2=df2)