kexinhuang12345 committed · Commit aae1219 · Parent(s): df330ee

minor fix

Files changed:
- app.py +9 -4
- src/about.py +1 -1
- src/populate.py +30 -5
app.py CHANGED

@@ -134,7 +134,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 …
+        with gr.TabItem("🏅 Entity Classification Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             COLS = COLS_NC
             AutoEvalColumn = AutoEvalColumn_NodeClassification
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Node Classification")

@@ -206,8 +206,10 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: AUROC ⬆️")
+
 
-        with gr.TabItem("🏅 …
+        with gr.TabItem("🏅 Entity Regression Leaderboard", elem_id="llm-benchmark-tab-table", id=1):
             COLS = COLS_NR
             AutoEvalColumn = AutoEvalColumn_NodeRegression
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Node Regression")

@@ -278,8 +280,9 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: MAE ⬇️")
 
-        with gr.TabItem("🏅 …
+        with gr.TabItem("🏅 Recommendation Leaderboard", elem_id="llm-benchmark-tab-table", id=2):
             COLS = COLS_LP
             AutoEvalColumn = AutoEvalColumn_LinkPrediction
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Link Prediction")

@@ -350,6 +353,8 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: MAE ⬇️")
+
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():

@@ -388,7 +393,7 @@ with demo:
                 github_url_textbox = gr.Textbox(label="GitHub URL Link")
                 #parameters_textbox = gr.Textbox(label="Number of parameters")
                 task_track = gr.Dropdown(
-                    choices=['…
+                    choices=['Entity Classification', 'Entity Regression', 'Recommendation'],
                     label="Choose the task track",
                     multiselect=False,
                     value=None,
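For orientation, the sketch below is a minimal, self-contained Gradio layout mirroring what app.py builds after this commit: three leaderboard tabs, each with an evaluation-metric note under its table. It is illustrative only; fake_leaderboard and its placeholder data stand in for the repo's get_leaderboard_df(EVAL_REQUESTS_PATH, task_type).

# Illustrative sketch; fake_leaderboard is a placeholder, not repository code.
import gradio as gr
import pandas as pd

def fake_leaderboard(task_type: str) -> pd.DataFrame:
    # Placeholder data in lieu of parsing evaluation requests from disk.
    return pd.DataFrame({"Model": ["example-model"], "Average Rank⬆️": [1.0]})

TRACKS = [
    ("🏅 Entity Classification Leaderboard", "Entity Classification", "AUROC ⬆️"),
    ("🏅 Entity Regression Leaderboard", "Entity Regression", "MAE ⬇️"),
    ("🏅 Recommendation Leaderboard", "Recommendation", "MAE ⬇️"),
]

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        for i, (title, task_type, metric) in enumerate(TRACKS):
            with gr.TabItem(title, elem_id="llm-benchmark-tab-table", id=i):
                gr.Dataframe(value=fake_leaderboard(task_type))
                gr.Markdown(f"Evaluation metric: {metric}")

if __name__ == "__main__":
    demo.launch()

Note that after the commit the tab labels use the entity-level names while app.py still calls get_leaderboard_df with the original strings ("Node Classification", "Node Regression", "Link Prediction"); src/populate.py is updated below so the function accepts either spelling.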
src/about.py CHANGED

@@ -79,7 +79,7 @@ Once you have developed your model and got results, you can submit your test res
 - **Is it an official submission**: Whether the implementation is official (implementation by the authors who proposed the method) or unofficial (re-implementation of the method by non-authors).
 - **Paper URL Link**: The original paper describing the method (an arXiv link is recommended; the paper need not be peer-reviewed). If your method has any original component (e.g., even just combining existing methods XXX and YYY), you have to write a technical report describing it (e.g., how exactly you combined XXX and YYY).
 - **GitHub URL Link**: The GitHub repository or directory containing all code to reproduce the result. A placeholder repository is not allowed.
-- **Task Track**: Choose the task you submit to, from
+- **Task Track**: Choose the task you submit to, from entity classification, entity regression, and recommendation.
 - **Honor code**: Please acknowledge that your submission adheres to all the ethical policies and that your result is reproducible.
 - **Test performance**: Raw test performance output by the RelBench model evaluators, where the average and unbiased standard deviation must be taken over 5 different random seeds. You can either not fix random seeds at all, or use random seeds 0 to 4. We highly discourage tuning the random seeds.
 - **Validation performance**: Validation performance of the model used to report the test performance above.
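As a concrete illustration of the "Test performance" requirement above, the snippet below is a minimal sketch (not part of the commit) of how the reported mean and unbiased standard deviation over 5 seeds could be computed; the per-seed scores are hypothetical.

# Hypothetical per-seed test scores (e.g., AUROC for one task), seeds 0-4.
import numpy as np

scores = np.array([0.812, 0.805, 0.821, 0.809, 0.816])
mean = scores.mean()
std = scores.std(ddof=1)  # ddof=1 gives the unbiased (Bessel-corrected) standard deviation
print(f"{mean:.3f} ± {std:.3f}")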
src/populate.py CHANGED

@@ -35,18 +35,41 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 '''
+
+# Function to extract the numerical part before '+'
+def extract_x(value):
+    return float(value.split('+')[0])
+
+# Function to highlight the highest (or lowest) value based on X
+def make_bold(df, cols, ascending):
+    df_highlight = df.copy()
+
+    def apply_highlight(s):
+        if ascending:
+            max_idx = s.apply(extract_x).idxmin()
+        else:
+            max_idx = s.apply(extract_x).idxmax()
+
+        return ['font-weight: bold' if i == max_idx else '' for i in range(len(s))]
+
+    styler = df_highlight.style.apply(lambda x: apply_highlight(x) if x.name in cols else ['']*len(x), axis=0)
+    return styler
+
 def format_number(num):
     return f"{num:.3f}"
 def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
-    if task_type …
+    if task_type in ['Node Classification', 'Entity Classification']:
         ascending = False
         tasks = nc_tasks
-    …
+        task_type = ['Node Classification', 'Entity Classification']
+    elif task_type in ['Node Regression', 'Entity Regression']:
         ascending = True
         tasks = nr_tasks
-    …
+        task_type = ['Node Regression', 'Entity Regression']
+    elif task_type in ['Link Prediction', 'Recommendation']:
         ascending = False
         tasks = lp_tasks
+        task_type = ['Link Prediction', 'Recommendation']
 
     model_result_filepaths = []
     for root,_, files in os.walk(EVAL_REQUESTS_PATH):

@@ -60,7 +83,7 @@ def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
         import json
         with open(model) as f:
             out = json.load(f)
-        if ('task' in out) and (out['task'] …
+        if ('task' in out) and (out['task'] in task_type):
             model_res.append(out)
 
     for model in model_res:

@@ -87,11 +110,13 @@ def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
     df_res = pd.DataFrame(columns=columns_to_show)
 
     #df_res = pd.DataFrame([{col: model[col] for col in columns_to_show} for model in model_res])
-    …
+
     ranks = df_res[list(name2short_name.values())].rank(ascending = ascending)
     df_res.rename(columns={'model': 'Model', 'author': 'Author', 'email': 'Email', 'paper_url': 'Paper URL', 'github_url': 'Github URL', 'submitted_time': 'Time'}, inplace=True)
     df_res['Average Rank⬆️'] = ranks.mean(axis=1)
     df_res.sort_values(by='Average Rank⬆️', ascending=True, inplace=True)
+    #df_res = make_bold(df_res, list(name2short_name.values()), ascending = ascending)
+    print(df_res)
     return df_res
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
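To make the intended score format concrete, here is a small usage sketch (not part of the commit) for the new extract_x / make_bold helpers. It assumes leaderboard cells are stored as "mean+std" strings, as implied by extract_x splitting on '+', and that the DataFrame keeps a default RangeIndex so the positional comparison inside apply_highlight lines up with idxmax/idxmin labels; the import path, column names, and scores are illustrative.

# Illustrative only: the 'src.populate' import path, column names, and scores are assumptions.
import pandas as pd
from src.populate import extract_x, make_bold

df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "task-1": ["0.812+0.004", "0.834+0.006"],  # higher mean is better here
    "task-2": ["0.778+0.010", "0.761+0.008"],
})

print(extract_x("0.834+0.006"))  # -> 0.834, the mean part before the '+'

# ascending=False bolds the largest mean in each listed column.
styler = make_bold(df, ["task-1", "task-2"], ascending=False)
print(styler.to_html())  # the winning cell per column carries 'font-weight: bold'

Note that the make_bold call inside get_leaderboard_df is still commented out after this commit, so the bolding is not yet applied to the rendered leaderboard.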