kexinhuang12345 committed
Commit: aae1219
1 Parent(s): df330ee
Files changed (3):
  1. app.py +9 -4
  2. src/about.py +1 -1
  3. src/populate.py +30 -5
app.py CHANGED

@@ -134,7 +134,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 Node Classification Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Entity Classification Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             COLS = COLS_NC
             AutoEvalColumn = AutoEvalColumn_NodeClassification
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Node Classification")
@@ -206,8 +206,10 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: AUROC ⬆️")
+
 
-        with gr.TabItem("🏅 Node Regression Leaderboard", elem_id="llm-benchmark-tab-table", id=1):
+        with gr.TabItem("🏅 Entity Regression Leaderboard", elem_id="llm-benchmark-tab-table", id=1):
             COLS = COLS_NR
             AutoEvalColumn = AutoEvalColumn_NodeRegression
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Node Regression")
@@ -278,8 +280,9 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: MAE ⬇️")
 
-        with gr.TabItem("🏅 Link Prediction Leaderboard", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("🏅 Recommendation Leaderboard", elem_id="llm-benchmark-tab-table", id=2):
             COLS = COLS_LP
             AutoEvalColumn = AutoEvalColumn_LinkPrediction
             original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, "Link Prediction")
@@ -350,6 +353,8 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
+            gr.Markdown("Evaluation metric: MAE ⬇️")
+
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
             with gr.Column():
@@ -388,7 +393,7 @@ with demo:
                 github_url_textbox = gr.Textbox(label="GitHub URL Link")
                 #parameters_textbox = gr.Textbox(label="Number of parameters")
                 task_track = gr.Dropdown(
-                    choices=['Node Classification', 'Node Regression', 'Link Prediction'],
+                    choices=['Entity Classification', 'Entity Regression', 'Recommendation'],
                     label="Choose the task track",
                     multiselect=False,
                     value=None,
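For orientation, a minimal sketch of the tab layout this commit produces. This is not the full app.py: the leaderboard tables, search widgets, and event wiring are omitted, and the surrounding structure is assumed from the context lines above.

# Sketch only: renamed tabs plus the new per-tab metric notes.
import gradio as gr

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Entity Classification Leaderboard", id=0):
            # ... leaderboard table for the classification track ...
            gr.Markdown("Evaluation metric: AUROC ⬆️")
        with gr.TabItem("🏅 Entity Regression Leaderboard", id=1):
            # ... leaderboard table for the regression track ...
            gr.Markdown("Evaluation metric: MAE ⬇️")
        with gr.TabItem("🏅 Recommendation Leaderboard", id=2):
            # ... leaderboard table for the recommendation track ...
            gr.Markdown("Evaluation metric: MAE ⬇️")

if __name__ == "__main__":
    demo.launch()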
src/about.py CHANGED

@@ -79,7 +79,7 @@ Once you have developed your model and got results, you can submit your test res
 - **Is it an official submission**: Whether the implementation is official (implementation by authors who proposed the method) or unofficial (re-implementation of the method by non-authors).
 - **Paper URL Link**: The original paper describing the method (arXiv link is recommended. paper needs not be peer-reviewed). If your method has any original component (e.g., even just combining existing methods XXX and YYY), you have to write a technical report describing it (e.g., how you exactly combined XXX and YYY).
 - **GitHub URL Link**: The Github repository or directory containining all code to reproduce the result. A placeholder repository is not allowed.
-- **Task Track**: Choose the task you submit to, from node classification, node regression, and link prediction.
+- **Task Track**: Choose the task you submit to, from entity classification, entity regression, and recommendation.
 - **Honor code**: Please acknowledge that your submission adheres to all the ethical policies and your result is reproducible.
 - **Test performance**: Raw test performance output by RelBench model evaluators, where average and unbiased standard deviation must be taken over 5 different random seeds. You can either not fix random seeds at all, or use the random seeds from 0 to 4. We highly discourage you to tune the random seeds.
 - **Validation performance**: Validation performance of the model that is used to report the test performance above.
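As a hedged illustration of the "Test performance" field described above, a small sketch of computing the average and unbiased standard deviation over 5 runs; the metric values and the "mean +- std" formatting are hypothetical, not the exact RelBench evaluator output.

import statistics

# Hypothetical raw test metrics from 5 runs (random seeds 0-4).
test_scores = [0.842, 0.851, 0.848, 0.839, 0.845]

mean = statistics.mean(test_scores)
std = statistics.stdev(test_scores)  # unbiased (n-1) sample standard deviation

print(f"{mean:.3f} +- {std:.3f}")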
src/populate.py CHANGED

@@ -35,18 +35,41 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
     '''
+
+# Function to extract the numerical part before '+'
+def extract_x(value):
+    return float(value.split('+')[0])
+
+# Function to highlight the highest (or lowest) value based on X
+def make_bold(df, cols, ascending):
+    df_highlight = df.copy()
+
+    def apply_highlight(s):
+        if ascending:
+            max_idx = s.apply(extract_x).idxmin()
+        else:
+            max_idx = s.apply(extract_x).idxmax()
+
+        return ['font-weight: bold' if i == max_idx else '' for i in range(len(s))]
+
+    styler = df_highlight.style.apply(lambda x: apply_highlight(x) if x.name in cols else ['']*len(x), axis=0)
+    return styler
+
 def format_number(num):
     return f"{num:.3f}"
 def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
-    if task_type == 'Node Classification':
+    if task_type in ['Node Classification', 'Entity Classification']:
         ascending = False
         tasks = nc_tasks
-    elif task_type == 'Node Regression':
+        task_type = ['Node Classification', 'Entity Classification']
+    elif task_type in ['Node Regression', 'Entity Regression']:
         ascending = True
         tasks = nr_tasks
-    elif task_type == 'Link Prediction':
+        task_type = ['Node Regression', 'Entity Regression']
+    elif task_type in ['Link Prediction', 'Recommendation']:
         ascending = False
         tasks = lp_tasks
+        task_type = ['Link Prediction', 'Recommendation']
 
     model_result_filepaths = []
     for root,_, files in os.walk(EVAL_REQUESTS_PATH):
@@ -60,7 +83,7 @@ def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
         import json
         with open(model) as f:
             out = json.load(f)
-            if ('task' in out) and (out['task'] == task_type):
+            if ('task' in out) and (out['task'] in task_type):
                 model_res.append(out)
 
     for model in model_res:
@@ -87,11 +110,13 @@ def get_leaderboard_df(EVAL_REQUESTS_PATH, task_type) -> pd.DataFrame:
     df_res = pd.DataFrame(columns=columns_to_show)
 
     #df_res = pd.DataFrame([{col: model[col] for col in columns_to_show} for model in model_res])
-    print(df_res)
+
     ranks = df_res[list(name2short_name.values())].rank(ascending = ascending)
     df_res.rename(columns={'model': 'Model', 'author': 'Author', 'email': 'Email', 'paper_url': 'Paper URL', 'github_url': 'Github URL', 'submitted_time': 'Time'}, inplace=True)
     df_res['Average Rank⬆️'] = ranks.mean(axis=1)
     df_res.sort_values(by='Average Rank⬆️', ascending=True, inplace=True)
+    #df_res = make_bold(df_res, list(name2short_name.values()), ascending = ascending)
+    print(df_res)
     return df_res
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
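A hypothetical usage sketch of the helpers touched in this file: extract_x parses the numeric part of a "value + std" score string, the rank/mean logic produces the average-rank column, and make_bold (currently commented out) would return a pandas Styler that bolds the best entry per task column. The model names, task columns, and score strings below are made up; the real columns come from name2short_name.

import pandas as pd

def extract_x(value):
    # numerical part before the '+' separator, e.g. "0.842 + 0.005" -> 0.842
    return float(value.split('+')[0])

# Toy leaderboard with two models and two hypothetical task columns.
df = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "task-1": ["0.842 + 0.005", "0.861 + 0.004"],
    "task-2": ["0.790 + 0.010", "0.775 + 0.008"],
})

task_cols = ["task-1", "task-2"]

# Higher is better here (ascending=False), mirroring the classification track.
ranks = df[task_cols].apply(lambda s: s.map(extract_x)).rank(ascending=False)
df["Average Rank⬆️"] = ranks.mean(axis=1)
print(df.sort_values("Average Rank⬆️"))

# make_bold(df, task_cols, ascending=False) would bold the best score in each
# task column; note that its positional comparison against idxmax()/idxmin()
# assumes the DataFrame keeps a default RangeIndex.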