Jimin Huang
feat: modify leaderboard
b44eb8b
raw
history blame
4.6 kB
# matplotlib.use('macosx')
import gradio as gr
import matplotlib
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
# Per-task leaderboard schemas, written as (column name, type tag) pairs.
# Only the names are consumed below; the type tags are not read anywhere in
# the visible code.
TASK1_COLS = [
    ("Model", "str"),
    ("Acc", "number"),
    ("F1", "number"),
    ("MCC", "number"),
]
TASK2_COLS = [
    ("Model", "str"),
    ("Rouge-1", "number"),
    ("Rouge-2", "number"),
    ("Rouge-L", "number"),
    ("BertScore", "number"),
    ("BartScore", "number"),
]
TASK3_COLS = [
    ("Model", "str"),
    ("Sharpe Ratio", "number"),
    ("Sharpe Ratio - DRIV", "number"),
    ("Sharpe Ratio - FORM", "number"),
    ("Sharpe Ratio - JNJ", "number"),
    ("Sharpe Ratio - MSFT", "number"),
]

# Bare column names for each task, in declaration order.
task1_cols, task2_cols, task3_cols = (
    [name for name, _kind in schema]
    for schema in (TASK1_COLS, TASK2_COLS, TASK3_COLS)
)
def create_df_dict(lang, lang_cols):
    """Load one result CSV and wrap it in a one-entry dict of DataFrames.

    Args:
        lang: Prefix of the file to read; the path opened is
            ``f"{lang}_result.csv"``.
        lang_cols: Column names assigned to the CSV columns, in order.
            Because names are passed explicitly, pandas treats every row of
            the file as data (no header row). Must contain ``"Model"``.

    Returns:
        ``{"overall": df}`` where ``df`` has ``"Model"`` as its first column,
        followed by the remaining columns sorted by name.

    Raises:
        KeyError: If ``"Model"`` is not among ``lang_cols``.
    """
    leaderboard_df = pd.read_csv(f"{lang}_result.csv", names=lang_cols)

    # Order the columns by name, then pull "Model" back to the front so the
    # identifier column always leads the table.
    leaderboard_df = leaderboard_df.sort_index(axis=1)
    metric_cols = [col for col in leaderboard_df.columns if col != "Model"]
    leaderboard_df = leaderboard_df[["Model", *metric_cols]]

    return {"overall": leaderboard_df}
# Leaderboard tables keyed by task tab label ("Task 1" .. "Task 3").
# Built once at import time: each entry reads "task<N>_result.csv" relative to
# the current working directory via create_df_dict.
df_lang = {
    f"Task {task_no}": create_df_dict(f"task{task_no}", col_names)
    for task_no, col_names in enumerate((task1_cols, task2_cols, task3_cols), start=1)
}
# Constants
# Page heading, rendered as raw HTML at the top of the app.
TITLE = '<h1 align="center" id="space-title">🐲 IJCAI 2024 FinLLM Challenge Leaderboard</h1>'
# Markdown shown under the heading. The section-heading emoji were previously
# stored as mojibake ("πŸ“Š" etc.); they are restored to the intended characters.
INTRODUCTION_TEXT = """📊 Introduction
The FinLLM Challenge rigorously evaluates state-of-the-art models in financial text analysis, generation, and decision-making tasks. These tasks include financial classification, financial text summarization, and single stock trading.
📈 Unique Evaluation Metrics
Our leaderboard incorporates a comprehensive evaluation using diverse metrics like Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.
📚 Task Details
**Task 1: Financial Classification**
- **Objective:** Classify sentences as claims or premises.
- **Dataset:** 7.75k training data, 969 test data.
- **Evaluation Metrics:** F1 Score (final ranking metric) and Accuracy.
**Task 2: Financial Text Summarization**
- **Objective:** Summarize financial news articles into concise texts.
- **Dataset:** 8k training data, 2k test data.
- **Evaluation Metrics:** ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).
**Task 3: Single Stock Trading**
- **Objective:** Make stock trading decisions (buy, sell, hold) with reasonings.
- **Dataset:** 291 data points.
- **Evaluation Metrics:** Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, Maximum Drawdown.
For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).
"""
def create_data_interface(df):
    """Build a Gradio ``Dataframe`` component showing ``df``.

    The first column is declared with datatype ``"str"`` and every remaining
    column with ``"number"``.

    Args:
        df: DataFrame whose rows and column labels populate the component.

    Returns:
        The constructed ``gr.components.Dataframe``.
    """
    column_names = list(df.columns)
    column_types = ["str"] + ["number"] * (len(column_names) - 1)
    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=column_names,
        datatype=column_types,
    )
def plot_radar_chart(df, attributes, category_name):
    """Draw one filled polar trace per row of ``df`` and return the figure.

    NOTE(review): ``go`` is not bound by any import visible in this file
    (presumably ``plotly.graph_objects``). Unless it is provided elsewhere,
    calling this function raises ``NameError`` -- confirm and add the import.

    Args:
        df: DataFrame with a ``"Model"`` column (used as the trace name) and
            the columns named in ``attributes``.
        attributes: Column labels selected from each row for the radial
            values; also passed as the angular axis labels.
        category_name: Accepted but not used by this function.

    Returns:
        The figure, titled "FLARE", with the radial axis fixed to [0, 0.9]
        and the legend shown.
    """
    figure = go.Figure()
    for _, record in df.iterrows():
        trace = go.Scatterpolar(
            r=record[attributes].tolist(),
            theta=attributes,
            fill="toself",
            name=record["Model"],
        )
        figure.add_trace(trace)

    radial_axis = {"visible": True, "range": [0, 0.9]}
    figure.update_layout(
        title="FLARE",
        polar={"radialaxis": radial_axis},
        showlegend=True,
    )
    return figure
def create_data_interface_for_aggregated(df, category_name):
    """Return a radar chart over every column of ``df`` after the first.

    Args:
        df: DataFrame forwarded to ``plot_radar_chart``; its first column is
            excluded from the plotted attributes.
        category_name: Forwarded unchanged to ``plot_radar_chart``.

    Returns:
        Whatever ``plot_radar_chart`` returns for these arguments.
    """
    metric_columns = df.columns[1:]
    # Echo the selected columns to stdout (kept from the original behavior).
    print(metric_columns)
    return plot_radar_chart(df, metric_columns, category_name)
def create_lang_leaderboard(df_dict):
    """Render one Gradio tab per entry of ``df_dict``.

    Must be called inside an active Gradio layout context, since it creates
    ``gr.Tab`` containers.

    Args:
        df_dict: Mapping of tab label to the DataFrame displayed in that tab.
    """
    for tab_label, table in df_dict.items():
        with gr.Tab(tab_label):
            create_data_interface(table)
def launch_gradio():
    """Assemble the leaderboard UI and start the Gradio app.

    The layout is the HTML title, the Markdown introduction, then one tab per
    entry of the module-level ``df_lang``, each filled by
    ``create_lang_leaderboard``.
    """
    with gr.Blocks() as demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        for task_label, task_tables in df_lang.items():
            with gr.Tab(task_label):
                create_lang_leaderboard(task_tables)

    demo.launch()
# Re-run launch_gradio every 3600 seconds on a background scheduler.
# NOTE(review): the scheduled job is launch_gradio itself, so a new Blocks app
# is built and launched each hour, while df_lang (loaded once at import) is
# never re-read -- confirm this is the intended refresh mechanism.
scheduler = BackgroundScheduler()
scheduler.add_job(func=launch_gradio, trigger="interval", seconds=3600)
scheduler.start()

# Launch immediately
launch_gradio()