annamonica committed
Commit 8c93523 · verified · 1 Parent(s): d472b41

some-updates (#3)

- remove unused cols (02b73122a4d3837840957be33e56d53da75ac3f4)
- rename Type column and update About section (55a94df69759b8d20626024453688ef70b50cd91)

Files changed (4)
  1. app.py +1 -1
  2. results/BOOM_leaderboard.csv +15 -15
  3. src/about.py +9 -6
  4. src/display/utils.py +7 -8
app.py CHANGED
@@ -109,7 +109,7 @@ with demo:
         with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN) # TODO - update table data
 
-        with gr.TabItem("📝 About - TODO", elem_id="boom-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
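For context, the renamed tab sits inside a standard Gradio `Blocks`/`Tabs` layout. Below is a minimal sketch of that structure, assuming the usual Hugging Face leaderboard template; the placeholder DataFrame and the simplified `init_leaderboard` helper are hypothetical stand-ins for the repo's real ones.

```python
# Minimal sketch of the tab layout around the renamed "About" tab.
# Assumes the standard HF leaderboard template; the DataFrame and
# init_leaderboard below are simplified stand-ins, not the repo's code.
import gradio as gr
import pandas as pd

LLM_BENCHMARKS_TEXT = "About text defined in src/about.py"
LEADERBOARD_DF = pd.DataFrame(
    {"Model": ["Toto-Open-Base-1.0", "naive"], "Rank": [2.351, 11.783]}
)

def init_leaderboard(df: pd.DataFrame) -> gr.DataFrame:
    # The real helper wraps a richer leaderboard component;
    # a plain DataFrame component stands in here.
    return gr.DataFrame(value=df)

with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🏅 Overall", elem_id="boom-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)
        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```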
results/BOOM_leaderboard.csv CHANGED
@@ -1,15 +1,15 @@
-model,model_type,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled,MAE_663_unscaled,CRPS_663_unscaled,Rank_663_unscaled
-Toto-Open-Base-1.0,pretrained,0.617,0.375,2.351,0.001,0.025,7.549
-moirai_1.1_base,pretrained,0.710,0.428,4.278,0.000,0.003,5.644
-moirai_1.1_large,pretrained,0.720,0.436,4.499,0.001,0.005,6.707
-moirai_1.1_small,pretrained,0.738,0.447,4.796,0.001,0.009,7.404
-timesfm_2_0_500m,pretrained,0.725,0.447,5.153,0.014,0.091,10.029
-chronos_bolt_base,pretrained,0.726,0.451,5.446,0.003,0.019,7.682
-chronos_bolt_small,pretrained,0.733,0.455,5.793,0.003,0.022,8.140
-autoarima,statistical,0.824,0.736,9.171,0.000,0.001,5.496
-timer,pretrained,0.796,0.639,9.356,0.001,0.005,6.474
-time-moe,pretrained,0.806,0.649,9.369,0.001,0.005,8.505
-visionts,pretrained,0.991,0.675,10.336,0.001,0.009,8.538
-autoets,statistical,0.842,1.975,10.956,0.000,0.030,6.992
-autotheta,statistical,1.123,1.018,11.712,0.001,0.002,6.513
-naive,statistical,1.000,1.000,11.783,0.000,0.006,9.326
+model,model_type,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled
+Toto-Open-Base-1.0,pretrained,0.617,0.375,2.351
+moirai_1.1_base,pretrained,0.710,0.428,4.278
+moirai_1.1_large,pretrained,0.720,0.436,4.499
+moirai_1.1_small,pretrained,0.738,0.447,4.796
+timesfm_2_0_500m,pretrained,0.725,0.447,5.153
+chronos_bolt_base,pretrained,0.726,0.451,5.446
+chronos_bolt_small,pretrained,0.733,0.455,5.793
+autoarima,statistical,0.824,0.736,9.171
+timer,pretrained,0.796,0.639,9.356
+time-moe,pretrained,0.806,0.649,9.369
+visionts,pretrained,0.991,0.675,10.336
+autoets,statistical,0.842,1.975,10.956
+autotheta,statistical,1.123,1.018,11.712
+naive,statistical,1.000,1.000,11.783
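With the unscaled MAE/CRPS/Rank columns removed, downstream code only needs the three remaining score columns. A hedged sketch of how the trimmed CSV might be loaded and ordered for display (column names are taken from the diff; sorting by Rank follows the About text, which says Rank orders the leaderboard):

```python
# Load the trimmed leaderboard CSV and order rows by scaled Rank
# (lower is better), matching the ordering described in the About text.
import pandas as pd

df = pd.read_csv("results/BOOM_leaderboard.csv")
# Columns after this commit:
# model, model_type, MASE_6750_scaled, CRPS_6750_scaled, Rank_6750_scaled
df = df.sort_values("Rank_6750_scaled")
print(df[["model", "model_type", "Rank_6750_scaled"]].to_string(index=False))
```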
src/about.py CHANGED
@@ -22,20 +22,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">BOOM 💥 Time Series Forecasting Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">BOOM 💥 Observability Time-Series Forecasting Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-BOOM (Benchmark of Observability Metrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Composed of real-world metrics data collected from Datadog, a leading observability platform, the benchmark captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data. For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics)
+BOOM (**B**enchmark **o**f **O**bservability **M**etrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Consisting of around 350 million time-series data points spanning 32,887 variables, the benchmark is derived from real-world metrics collected via Datadog, a leading observability platform. It therefore captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data.
+For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics)
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-# TODO
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
 
-## Reproducibility
-To reproduce our results, here are the commands you can run:
+BOOM is a large-scale, real-world time series dataset designed for benchmarking forecasting models in observability environments. The dataset captures the complexity and irregularity of production observability data, making it a challenging and realistic testbed for time series forecasting. BOOM consists of approximately 350 million time-series points across 32,887 variates. The dataset is split into 2,807 individual time series with one or multiple variates.
+For more details and dataset structure, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM).
+
+The evaluation procedure is inspired by [Gift-Eval](https://github.com/SalesforceAIResearch/gift-eval): We evaluate models using **MASE (Mean Absolute Scaled Error)** for forecast accuracy, **CRPS (Continuous Ranked Probability Score)** for probabilistic forecast quality, and **Rank**, which determines overall performance and is used to order models on the leaderboard.
+
+To reproduce our results, we provide a guide in the [BOOM GitHub repository](https://github.com/DataDog/toto/tree/main/boom) that explains how to install the required dependencies and includes example notebooks demonstrating how to evaluate both foundation models and statistical baselines on BOOM.
 
 """
 
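The new About text names MASE for accuracy and CRPS for probabilistic quality. For reference, here is a minimal NumPy sketch of the standard MASE definition (forecast MAE scaled by the in-sample MAE of a seasonal-naive forecast); this is a generic illustration, not the evaluation code from the BOOM repository:

```python
# Standard MASE: forecast MAE divided by the in-sample MAE of a
# seasonal-naive forecast. Generic reference, not the repo's eval code.
import numpy as np

def mase(y_true: np.ndarray, y_pred: np.ndarray,
         y_train: np.ndarray, season: int = 1) -> float:
    forecast_mae = np.mean(np.abs(y_true - y_pred))
    # Seasonal-naive errors on the training series set the scale.
    naive_mae = np.mean(np.abs(y_train[season:] - y_train[:-season]))
    return float(forecast_mae / naive_mae)

y_train = np.array([10.0, 12.0, 11.0, 13.0, 12.0])
y_true = np.array([12.5, 13.0])
y_pred = np.array([12.0, 12.8])
print(mase(y_true, y_pred, y_train))  # ~0.23: well under the naive scale of 1.0
```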
src/display/utils.py CHANGED
@@ -25,17 +25,16 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(
+    ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
+)
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 # Scores
-auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE_scaled", "number", True)])
-auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS_scaled", "number", True)])
-auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank_scaled", "number", True)])
-auto_eval_column_dict.append(["MAE_663_unscaled", ColumnContent, ColumnContent("MAE[0.5]_unscaled", "number", True)])
-auto_eval_column_dict.append(["CRPS_663_unscaled", ColumnContent, ColumnContent("CRPS_unscaled", "number", True)])
-auto_eval_column_dict.append(["Rank_663_unscaled", ColumnContent, ColumnContent("Rank_unscaled", "number", True)])
+auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])
+auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS", "number", True)])
+auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank", "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model Type", "str", False, hidden=True)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])