annamonica committed
Commit cebed79 · verified · 1 Parent(s): 34b0224

update-leaderboard (#2)

- add toy results (caa5e2c7a15b22fffec51a9b2125523e08aba0b3)
- update column names (492f4350b7cca4b995eb44cf6af09ea22e5b8cd3)
- update about and env (d4cb50ad7a467c731f3363b755c3ab6264a03b75)
- add basic static leaderboard (bca96936ac7b6b6d1a21189155c05ca2340329da)

Files changed (6)
  1. app.py +30 -106
  2. results/BOOM_leaderboard.csv +15 -0
  3. src/about.py +15 -5
  4. src/display/utils.py +33 -23
  5. src/envs.py +8 -5
  6. src/populate.py +54 -10
app.py CHANGED
@@ -22,7 +22,7 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,24 +32,40 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
+LEADERBOARD_DF_DOMAIN = get_leaderboard_df(
+    EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv", EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
 
 (
     finished_eval_queue_df,
@@ -57,7 +73,9 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
+    # TODO: merge results df with model info df
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -68,21 +86,10 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -95,98 +102,15 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF)
 
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        # TODO - add other tabs if needed
+        with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN)  # TODO - update table data
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        with gr.TabItem("📝 About - TODO", elem_id="boom-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -201,4 +125,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
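
For reference, a minimal sketch (not part of the commit) of the startup path app.py now follows; both tabs are fed from the same static CSV for now, as the TODO comments note, and only names introduced in this diff are used.

```python
# Sketch of the simplified boot sequence in the new app.py (illustrative only).
from src.display.utils import BENCHMARK_COLS, COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

csv_path = EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv"
# The "Overall" and "By Domain" tabs currently share this one dataframe;
# a per-domain results file is still a TODO in the commit.
LEADERBOARD_DF = get_leaderboard_df(csv_path, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
LEADERBOARD_DF_DOMAIN = get_leaderboard_df(csv_path, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
```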
results/BOOM_leaderboard.csv ADDED
@@ -0,0 +1,15 @@
+model,model_type,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled,MAE_663_unscaled,CRPS_663_unscaled,Rank_663_unscaled
+Toto-Open-Base-1.0,pretrained,0.617,0.375,2.351,0.001,0.025,7.549
+moirai_1.1_base,pretrained,0.710,0.428,4.278,0.000,0.003,5.644
+moirai_1.1_large,pretrained,0.720,0.436,4.499,0.001,0.005,6.707
+moirai_1.1_small,pretrained,0.738,0.447,4.796,0.001,0.009,7.404
+timesfm_2_0_500m,pretrained,0.725,0.447,5.153,0.014,0.091,10.029
+chronos_bolt_base,pretrained,0.726,0.451,5.446,0.003,0.019,7.682
+chronos_bolt_small,pretrained,0.733,0.455,5.793,0.003,0.022,8.140
+autoarima,statistical,0.824,0.736,9.171,0.000,0.001,5.496
+timer,pretrained,0.796,0.639,9.356,0.001,0.005,6.474
+time-moe,pretrained,0.806,0.649,9.369,0.001,0.005,8.505
+visionts,pretrained,0.991,0.675,10.336,0.001,0.009,8.538
+autoets,statistical,0.842,1.975,10.956,0.000,0.030,6.992
+autotheta,statistical,1.123,1.018,11.712,0.001,0.002,6.513
+naive,statistical,1.000,1.000,11.783,0.000,0.006,9.326
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,24 +12,25 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
 
+NUM_FEWSHOT = 0  # Change with your few shot
+# ---------------------------------------------------
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">BOOM 💥 Time Series Forecasting Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+BOOM (Benchmark of Observability Metrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Composed of real-world metrics data collected from Datadog, a leading observability platform, the benchmark captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data. For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics)
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
+# TODO
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
@@ -69,4 +71,12 @@ If everything is done, check you can launch the EleutherAIHarness on your model
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
+@misc{toto2025,
+    title={This Time is Different: An Observability Perspective on Time Series Foundation Models},
+    author={TODO},
+    year={2025},
+    eprint={arXiv:TODO},
+    archivePrefix={arXiv},
+    primaryClass={cs.LG}
+}
 """
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,29 +21,34 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Scores
+auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE_scaled", "number", True)])
+auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS_scaled", "number", True)])
+auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank_scaled", "number", True)])
+auto_eval_column_dict.append(["MAE_663_unscaled", ColumnContent, ColumnContent("MAE[0.5]_unscaled", "number", True)])
+auto_eval_column_dict.append(["CRPS_663_unscaled", ColumnContent, ColumnContent("CRPS_unscaled", "number", True)])
+auto_eval_column_dict.append(["Rank_663_unscaled", ColumnContent, ColumnContent("Rank_unscaled", "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -53,19 +59,21 @@ class EvalQueueColumn:  # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
+    FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
+    DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
+    ST = ModelDetails(name="🟣 statistical", symbol="🟣")
+
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -77,17 +85,19 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
+        if "deep-learning" in type or "🟦" in type:
+            return ModelType.DL
+        if "statistical" in type or "🟣" in type:
+            return ModelType.ST
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +110,7 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +118,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/envs.py CHANGED
@@ -4,21 +4,24 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "Datadog"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
+REPO_ID = f"{OWNER}/BOOM-Leaderboard"  # The repo id of your space
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
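One way to read these env changes (a sketch, assuming HF_HOME is unset so CACHE_PATH falls back to "."): EVAL_RESULTS_PATH now resolves to ./results, which is exactly where this commit adds BOOM_leaderboard.csv, so the path app.py builds points at the checked-in file.

```python
# Sketch only: how the new EVAL_RESULTS_PATH lines up with the committed CSV
# when HF_HOME is not set (assumption) and CACHE_PATH defaults to ".".
import os

CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
print(EVAL_RESULTS_PATH + "/" + "BOOM_leaderboard.csv")  # ./results/BOOM_leaderboard.csv
```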
src/populate.py CHANGED
@@ -2,23 +2,65 @@ import json
 import os
 
 import pandas as pd
-
+from dataclasses import fields
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import ModelType
+
+
+# def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+#     """Creates a dataframe from all the individual experiment results"""
+#     raw_data = get_raw_eval_results(results_path, requests_path)
+#     all_data_json = [v.to_dict() for v in raw_data]
+
+#     df = pd.DataFrame.from_records(all_data_json)
+#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+#     df = df[cols].round(decimals=2)
+
+#     # filter out if any of the benchmarks have not been produced
+#     df = df[has_no_nan_values(df, benchmark_cols)]
+#     return df
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    """
+    Processes a STATIC results CSV file to generate a leaderboard DataFrame with formatted columns and sorted values.
+    Args:
+        results_path (str): The file path to the results CSV file.
+    Returns:
+        pd.DataFrame: A processed DataFrame with renamed columns, additional formatting, and sorted values.
+    Notes:
+        - The function reads a CSV file from the given `results_path`.
+        - Internal column names are mapped to display names using `AutoEvalColumn`.
+        - A new column for model type symbols is created by parsing the `model_type` column.
+        - The `model_type` column is updated to prepend the model type symbol.
+        - The DataFrame is sorted by the `Rank_6750_scaled` column in ascending order.
+    """
+
+    df = pd.read_csv(results_path)
+    # Create the mapping from internal column name to display name
+
+    column_mapping = {field.name: getattr(AutoEvalColumn, field.name).name for field in fields(AutoEvalColumn)}
+    # Assuming `df` is your DataFrame:
+    df.rename(columns=column_mapping, inplace=True)
+
+    # Create a new column for model type symbol by parsing the model_type column
+    df[AutoEvalColumn.model_type_symbol.name] = df[AutoEvalColumn.model_type.name].apply(
+        lambda x: ModelType.from_str(x).value.symbol
+    )
+    # Prepend the value of model_type_symbol to the value of model_type
+    df[AutoEvalColumn.model_type.name] = (
+        df[AutoEvalColumn.model_type_symbol.name] + " " + df[AutoEvalColumn.model_type.name]
+    )
 
-    df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=2)
+    # Move the model_type_symbol column to the beginning
+    cols = [AutoEvalColumn.model_type_symbol.name] + [
+        col for col in df.columns if col != AutoEvalColumn.model_type_symbol.name
+    ]
+    df = df[cols]
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    df = df.sort_values(by=[AutoEvalColumn.Rank_6750_scaled.name], ascending=True)
     return df
 
 
@@ -39,7 +81,9 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            sub_entries = [
+                e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")
+            ]
            for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
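
Finally, a hedged usage sketch of the rewritten get_leaderboard_df: only results_path is consulted, while the remaining arguments are kept for signature compatibility (the placeholder values below are illustrative).

```python
# Illustrative call; with the toy CSV above, the expected outputs are noted inline.
from src.populate import get_leaderboard_df

df = get_leaderboard_df(
    "results/BOOM_leaderboard.csv",  # static CSV added in this commit
    requests_path="",                # ignored by the new implementation
    cols=[],                         # ignored
    benchmark_cols=[],               # ignored
)
print(list(df.columns)[:3])  # ['T', 'Model', 'Type'] -- display names from AutoEvalColumn
print(df.iloc[0]["Model"])   # Toto-Open-Base-1.0, the row with the lowest (best) Rank_scaled
```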