sxie78-dd committed
Commit 8110fce · unverified · 1 parent: e17e9c6

initial attempt to make leaderboard working

Files changed (6)
  1. app.py +15 -134
  2. src/about.py +18 -5
  3. src/display/formatting.py +2 -0
  4. src/display/utils.py +36 -25
  5. src/envs.py +5 -3
  6. src/populate.py +52 -47
app.py CHANGED
@@ -1,13 +1,9 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
 
 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -16,46 +12,23 @@ from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
-    ModelType,
     fields,
-    WeightType,
-    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.populate import get_leaderboard_df
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/ARFBench_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -68,20 +41,15 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
             ColumnFilter(
-                AutoEvalColumn.params.name,
+                AutoEvalColumn.pass_at_1.name,
                 type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+                min=0,
+                max=100,
+                label="pass@1 score",
             ),
         ],
         bool_checkboxgroup_label="Hide models",
@@ -95,99 +63,12 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 ARFBench Leaderboard", elem_id="arfbench-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="about-tab-table", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
@@ -198,7 +79,7 @@ with demo:
                 show_copy_button=True,
             )
 
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+scheduler = None
+demo.queue(default_concurrency_limit=40)
+if __name__ == "__main__":
+    demo.launch()
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,13 +12,13 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
 
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
 
 
 # Your leaderboard name
@@ -25,8 +26,20 @@ TITLE = """<h1 align="center" id="space-title">ARFBench Multimodal Time Series R
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-**ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a multimodal time-series reasoning benchmark consisting of 550 question-answer (QA) pairs composed from
-real-world incident data collected at Datadog, a leading observability platform.
+**ARF**Bench (**A**nomaly **R**easoning **F**ramework Benchmark) is a
+multimodal time-series reasoning benchmark consisting of 550 question-answer
+(QA) pairs composed from real-world incident data collected at Datadog,
+a leading observability platform.
+
+The benchmark evaluates models across various aspects of time-series anomaly reasoning:
+- **Presence**: Detecting if anomalies exist in the data
+- **Identification**: Identifying specific anomalous metrics
+- **Start Time**: Determining when anomalies began
+- **End Time**: Determining when anomalies ended
+- **Magnitude**: Assessing the severity of anomalies
+- **Categorization**: Classifying anomaly types
+- **Correlation**: Understanding relationships between anomalies
+- **Indicator**: Identifying leading indicators
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/display/formatting.py CHANGED
@@ -1,4 +1,6 @@
 def model_hyperlink(link, model_name):
+    if model_name == "":
+        return model_name
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
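Reviewer note (not part of the commit): the added guard makes the helper a no-op for empty model names. A minimal sketch of the resulting behaviour, run from the Space root with a made-up model name:

```python
# Illustration only: behaviour of model_hyperlink after this change.
from src.display.formatting import model_hyperlink

# A normal entry still renders as a dotted-underline link.
print(model_hyperlink("https://huggingface.co/org/model", "org/model"))

# An empty model name is now returned unchanged instead of producing an empty <a> tag.
print(repr(model_hyperlink("https://huggingface.co/", "")))  # -> ''
```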
src/display/utils.py CHANGED
@@ -1,9 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
-import pandas as pd
-
-from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -20,29 +17,28 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
-## Leaderboard columns
+
+# ARFBench Leaderboard columns
 auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+# Model column (always displayed)
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-#auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-#for task in Tasks:
-#    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-#auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-#auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-#auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-#auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-#auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-#auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-#auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-#auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
+# Performance metrics
+auto_eval_column_dict.append(["pass_at_1", ColumnContent, ColumnContent("pass@1", "number", True)])
+auto_eval_column_dict.append(["pass_at_5", ColumnContent, ColumnContent("pass@5", "number", True)])
+# Specific benchmark metrics
+auto_eval_column_dict.append(["presence", ColumnContent, ColumnContent("Presence", "number", True)])
+auto_eval_column_dict.append(["identification", ColumnContent, ColumnContent("Identification", "number", True)])
+auto_eval_column_dict.append(["start_time", ColumnContent, ColumnContent("Start Time", "number", True)])
+auto_eval_column_dict.append(["end_time", ColumnContent, ColumnContent("End Time", "number", True)])
+auto_eval_column_dict.append(["magnitude", ColumnContent, ColumnContent("Magnitude", "number", True)])
+auto_eval_column_dict.append(["categorization", ColumnContent, ColumnContent("Categorization", "number", True)])
+auto_eval_column_dict.append(["correlation", ColumnContent, ColumnContent("Correlation", "number", True)])
+auto_eval_column_dict.append(["indicator", ColumnContent, ColumnContent("Indicator", "number", True)])
+
+# We use make dataclass to dynamically fill the scores
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -53,12 +49,13 @@ class EvalQueueColumn:  # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = "" # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +80,13 @@ class ModelType(Enum):
             return ModelType.TSFM
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +99,23 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+# Define the benchmark columns for ARFBench
+BENCHMARK_COLS = [
+    "pass_at_1",
+    "pass_at_5",
+    "presence",
+    "identification",
+    "start_time",
+    "end_time",
+    "magnitude",
+    "categorization",
+    "correlation",
+    "indicator",
+]
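Reviewer note (not part of the commit): a quick sketch of what these column helpers resolve to at import time, assuming the column set and visibility flags stay exactly as defined above:

```python
# Illustration only: how the dynamically built AutoEvalColumn is consumed elsewhere.
from src.display.utils import AutoEvalColumn, BENCHMARK_COLS, COLS, fields

# fields() yields the ColumnContent defaults registered in auto_eval_column_dict above.
for col in fields(AutoEvalColumn):
    print(col.name, col.type, col.displayed_by_default)
# e.g. "Model markdown True", "pass@1 number True", "Presence number True", ...

# COLS holds the display names used by the leaderboard table ...
print(COLS)            # ["Model", "pass@1", "pass@5", "Presence", ...]
# ... while BENCHMARK_COLS holds the snake_case keys expected from the results file.
print(BENCHMARK_COLS)  # ["pass_at_1", "pass_at_5", "presence", ...]
```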
src/envs.py CHANGED
@@ -4,9 +4,11 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "Datadog" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "Datadog" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/ARFBench"
@@ -14,7 +16,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/populate.py CHANGED
@@ -1,58 +1,63 @@
-import json
 import os
-
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.formatting import make_clickable_model
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    """Creates a dataframe from the static CSV file"""
+    # Read the static CSV file
+    csv_path = os.path.join("results", "ARFBench_leaderboard.csv")
+
+    if not os.path.exists(csv_path):
+        raise FileNotFoundError(f"CSV file not found at {csv_path}")
+
+    df = pd.read_csv(csv_path)
+
+    # Clean up column names to match our schema
+    df.columns = df.columns.str.replace("pass@1", "pass_at_1")
+    df.columns = df.columns.str.replace("pass@5", "pass_at_5")
+    df.columns = df.columns.str.replace("Start Time", "start_time")
+    df.columns = df.columns.str.replace("End Time", "end_time")
+    df.columns = df.columns.str.lower()
+
+    # Make model names clickable
+    df["model"] = df["model"].apply(make_clickable_model)
+
+    # Sort by pass@1 performance (descending)
+    df = df.sort_values(by=["pass_at_1"], ascending=False)
+
+    # Round numeric columns to 2 decimal places
+    numeric_cols = [
+        "pass_at_1",
+        "pass_at_5",
+        "presence",
+        "identification",
+        "start_time",
+        "end_time",
+        "magnitude",
+        "categorization",
+        "correlation",
+        "indicator",
+    ]
+
+    for col in numeric_cols:
+        if col in df.columns:
+            df[col] = df[col].round(2)
+
+    # Handle missing values - replace with 0 or appropriate value
+    df = df.fillna(0)
+
+    # Select only the columns we need
+    available_cols = [col for col in cols if col in df.columns]
+    df = df[available_cols]
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
-    """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    """Creates empty dataframes for evaluation queues since we're using
+    static data"""
+    # Return empty dataframes for the queue system
+    empty_df = pd.DataFrame(columns=cols)
+    return empty_df, empty_df, empty_df
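Reviewer note (not part of the commit): the rewritten get_leaderboard_df assumes results/ARFBench_leaderboard.csv carries display-style headers that are then normalised to snake_case. A small sketch of that normalisation step, using a made-up row rather than a real result:

```python
# Illustration only: the header normalisation performed in get_leaderboard_df.
# The single data row is a placeholder, not an actual ARFBench result.
import pandas as pd

raw = pd.DataFrame(
    [{"Model": "org/model", "pass@1": 41.2, "pass@5": 57.9, "Start Time": 63.0, "End Time": 58.4}]
)

raw.columns = raw.columns.str.replace("pass@1", "pass_at_1")
raw.columns = raw.columns.str.replace("pass@5", "pass_at_5")
raw.columns = raw.columns.str.replace("Start Time", "start_time")
raw.columns = raw.columns.str.replace("End Time", "end_time")
raw.columns = raw.columns.str.lower()

print(list(raw.columns))  # ['model', 'pass_at_1', 'pass_at_5', 'start_time', 'end_time']
```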