# ⚖️ LAiW Leaderboard

# matplotlib.use('macosx')
import gradio as gr
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from get_data_info import plot_data, tab_data


def create_data_interface(df):
    """Build a Gradio ``Dataframe`` component showing the contents of ``df``.

    The first column is declared with datatype ``"str"`` and every remaining
    column with datatype ``"number"``.

    Args:
        df: Table to display; its column labels become the headers and its
            rows become the component's value.

    Returns:
        The ``gr.components.Dataframe`` instance that was created.
    """
    column_names = list(df.columns)

    # One leading text column, then numeric columns for everything after it.
    column_types = ["str"]
    column_types.extend("number" for _ in column_names[1:])

    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=column_names,
        datatype=column_types,
    )


def plot_radar_chart(df, attributes, category_name):
    """Draw a radar (polar) chart with one filled trace per row of ``df``.

    Args:
        df: Table with a ``'Model'`` column (used as the trace name) plus the
            columns listed in ``attributes``.
        attributes: Column labels to plot; they are used both to select the
            radial values from each row and as the angular axis labels.
        category_name: Text used as the figure title.

    Returns:
        The populated ``plotly.graph_objects.Figure``.
    """
    chart = go.Figure()

    for _, record in df.iterrows():
        model_name = record['Model']
        radial_values = record[attributes].tolist()
        trace = go.Scatterpolar(
            r=radial_values,
            theta=attributes,
            fill='toself',
            name=model_name,
        )
        chart.add_trace(trace)

    # The radial axis is pinned to 0..100 regardless of the data
    # (presumably scores are percentages -- confirm against the data source).
    radial_axis = dict(visible=True, range=[0, 100])
    chart.update_layout(
        title=f"{category_name}",
        polar=dict(radialaxis=radial_axis),
        showlegend=True,
    )

    return chart


def create_data_interface_for_aggregated(df, category_name):
    """Return a radar chart of ``df`` titled ``category_name``.

    Every column after the first is treated as a plotted attribute; the
    selected attribute labels are printed to stdout before plotting.

    Args:
        df: Table passed through to ``plot_radar_chart``.
        category_name: Title for the resulting figure.

    Returns:
        The figure produced by ``plot_radar_chart``.
    """
    # Skip the first column; the remaining ones are the radar axes.
    attributes = df.columns[1:]
    print(f"attributes: {attributes}")
    return plot_radar_chart(df, attributes, category_name)


def reindex_cols(fix_cols, df):
    """Return ``df`` restricted to the columns named in ``fix_cols``.

    Columns appear in the order given by ``fix_cols``; names that are not
    present in ``df`` are silently skipped.

    Args:
        fix_cols: Iterable of column labels in the desired output order.
        df: Table to select columns from.

    Returns:
        A DataFrame containing only the matching columns.
    """
    # Materialise the column labels once rather than once per candidate name.
    available = df.columns.values.tolist()
    task_col = [subtask for subtask in fix_cols if subtask in available]
    return df[task_col]


def _render_plot_row(frames, row_keys):
    """Render one ``gr.Row`` holding a radar chart for each selected category.

    Args:
        frames: Mapping of category name to DataFrame.
        row_keys: Category names to include in this row. Categories are drawn
            in the iteration order of ``frames``; names absent from ``frames``
            are simply not drawn.
    """
    with gr.Row():
        for key, df in frames.items():
            if key not in row_keys:
                continue
            # replace() returns a new frame, so the caller's data is untouched.
            # Empty-string cells are plotted as 0.
            plot_df = df.replace('', 0)
            print(f"{key}: \n{plot_df}")
            gr.Plot(create_data_interface_for_aggregated(plot_df, key))


def launch_gradio(df1, df2):
    """Build the leaderboard page and start the Gradio server.

    The page consists of the title and introduction (read from the module
    globals ``TITLE`` and ``INTRODUCTION_TEXT``, which must be defined before
    this function is called), two rows of radar charts built from ``df1``,
    and one tab per entry of ``df2`` containing a data table.

    Args:
        df1: Mapping of category name to DataFrame used for the radar charts.
        df2: Mapping of tab name to DataFrame shown as a table.
    """
    demo = gr.Blocks()

    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

        # Radar charts are laid out as two rows of two categories each.
        _render_plot_row(df1, ("Overall", "Basic Information Retrieval"))
        _render_plot_row(
            df1, ("Legal Foundation Inference", "Complex Legal Application")
        )

        # One tab per table, in the iteration order of df2.
        for key, df in df2.items():
            with gr.Tab(key):
                create_data_interface(df)

    demo.launch()


# Page header and introduction text. They are defined at module level because
# launch_gradio() reads them as globals; keeping them inside the __main__ guard
# would make launch_gradio() raise NameError when this module is imported.
TITLE = '<h1 align="center" id="space-title">⚖️ LAiW Leaderboard</h1>'
INTRODUCTION_TEXT = """🏆 The LAiW Leaderboard is designed to rigorously track, rank, and evaluate state-of-the-art Large Language Models in Legal.

    💡 Our leaderboard not only covers basic Legal NLP tasks but also incorporates Legal practice tasks such as similar case matching, offering a more comprehensive evaluation for real-world Legal applications.

    🌟 Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance.

    🔗 For more details, refer to our GitHub page [here](https://github.com/Dai-shen/LAiW).
    """


if __name__ == "__main__":
    df1 = plot_data()
    df2 = tab_data()

    scheduler = BackgroundScheduler()
    # add_job() expects the callable itself plus its arguments. Writing
    # launch_gradio(...) here would run it on the spot and register its
    # return value (None) as the job.
    # NOTE(review): each scheduled run builds and launches another app from
    # the data loaded at startup -- confirm whether a data refresh or a
    # process restart is what was actually intended.
    scheduler.add_job(
        launch_gradio,
        "interval",
        seconds=3600,
        kwargs={"df1": df1, "df2": df2},
    )
    scheduler.start()

    # Launch immediately
    launch_gradio(df1=df1, df2=df2)