some-updates (#3)

Commits:
- remove unused cols (02b73122a4d3837840957be33e56d53da75ac3f4)
- rename Type column and update About section (55a94df69759b8d20626024453688ef70b50cd91)

Files changed:
- app.py (+1 -1)
- results/BOOM_leaderboard.csv (+15 -15)
- src/about.py (+9 -6)
- src/display/utils.py (+7 -8)
app.py
CHANGED
@@ -109,7 +109,7 @@ with demo:
         with gr.TabItem("π By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN)  # TODO - update table data
 
-        with gr.TabItem("π About
+        with gr.TabItem("π About", elem_id="boom-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     with gr.Row():
results/BOOM_leaderboard.csv
CHANGED
@@ -1,15 +1,15 @@
-model,model_type,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled
-Toto-Open-Base-1.0,pretrained,0.617,0.375,2.351
-moirai_1.1_base,pretrained,0.710,0.428,4.278
-moirai_1.1_large,pretrained,0.720,0.436,4.499
-moirai_1.1_small,pretrained,0.738,0.447,4.796
-timesfm_2_0_500m,pretrained,0.725,0.447,5.153
-chronos_bolt_base,pretrained,0.726,0.451,5.446
-chronos_bolt_small,pretrained,0.733,0.455,5.793
-autoarima,statistical,0.824,0.736,9.171
-timer,pretrained,0.796,0.639,9.356
-time-moe,pretrained,0.806,0.649,9.369
-visionts,pretrained,0.991,0.675,10.336
-autoets,statistical,0.842,1.975,10.956
-autotheta,statistical,1.123,1.018,11.712
-naive,statistical,1.000,1.000,11.783
+model,model_type,MASE_6750_scaled,CRPS_6750_scaled,Rank_6750_scaled
+Toto-Open-Base-1.0,pretrained,0.617,0.375,2.351
+moirai_1.1_base,pretrained,0.710,0.428,4.278
+moirai_1.1_large,pretrained,0.720,0.436,4.499
+moirai_1.1_small,pretrained,0.738,0.447,4.796
+timesfm_2_0_500m,pretrained,0.725,0.447,5.153
+chronos_bolt_base,pretrained,0.726,0.451,5.446
+chronos_bolt_small,pretrained,0.733,0.455,5.793
+autoarima,statistical,0.824,0.736,9.171
+timer,pretrained,0.796,0.639,9.356
+time-moe,pretrained,0.806,0.649,9.369
+visionts,pretrained,0.991,0.675,10.336
+autoets,statistical,0.842,1.975,10.956
+autotheta,statistical,1.123,1.018,11.712
+naive,statistical,1.000,1.000,11.783
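Not part of the PR, but for orientation: the leaderboard table above can be inspected directly. The following is a minimal sketch (assuming pandas is available and the file is read from the repo root) that loads the CSV and orders models by the Rank column, which the About text says drives the leaderboard ordering.

import pandas as pd

# Load the leaderboard results and sort by overall Rank (lower is better).
df = pd.read_csv("results/BOOM_leaderboard.csv")
df = df.sort_values("Rank_6750_scaled", ascending=True)
print(df[["model", "model_type", "MASE_6750_scaled", "CRPS_6750_scaled", "Rank_6750_scaled"]].head())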
src/about.py
CHANGED
@@ -22,20 +22,23 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">BOOM 🔥 Time
+TITLE = """<h1 align="center" id="space-title">BOOM 🔥 Observability Time-Series Forecasting Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-BOOM (
+BOOM (**B**enchmark **o**f **O**bservability **M**etrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Consisting of around 350 million time-series data points spanning 32,887 variables, the benchmark is derived from real-world metrics collected via Datadog, a leading observability platform. It therefore captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data.
+For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics).
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-# TODO
 LLM_BENCHMARKS_TEXT = f"""
-## How it works
 
-
-
+BOOM is a large-scale, real-world time series dataset designed for benchmarking forecasting models in observability environments. The dataset captures the complexity and irregularity of production observability data, making it a challenging and realistic testbed for time series forecasting. BOOM consists of approximately 350 million time-series points across 32,887 variates. The dataset is split into 2,807 individual time series with one or more variates.
+For more details and the dataset structure, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM).
+
+The evaluation procedure is inspired by [Gift-Eval](https://github.com/SalesforceAIResearch/gift-eval): we evaluate models using **MASE (Mean Absolute Scaled Error)** for forecast accuracy, **CRPS (Continuous Ranked Probability Score)** for probabilistic forecast quality, and **Rank**, which determines overall performance and is used to order models on the leaderboard.
+
+To reproduce our results, we provide a guide in the [BOOM GitHub repository](https://github.com/DataDog/toto/tree/main/boom) that explains how to install the required dependencies and includes example notebooks demonstrating how to evaluate both foundation models and statistical baselines on BOOM.
 
 """
 
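The About text above names MASE, CRPS, and Rank as the evaluation metrics. As illustration only, here is a small sketch of the two error metrics in their generic form: MASE scaled by an in-sample (seasonal) naive forecast, and CRPS approximated from predicted quantiles via the pinball loss. This is a textbook formulation, not necessarily the exact implementation used to produce the leaderboard numbers; the toy inputs are made up.

import numpy as np

def mase(y_true, y_pred, y_train, season=1):
    # Mean Absolute Scaled Error: forecast MAE divided by the in-sample MAE
    # of a (seasonal) naive forecast on the training series.
    naive_mae = np.mean(np.abs(y_train[season:] - y_train[:-season]))
    return np.mean(np.abs(y_true - y_pred)) / naive_mae

def crps_from_quantiles(y_true, q_preds, q_levels):
    # Common approximation of CRPS from a finite set of predicted quantiles:
    # 2 x the average pinball (quantile) loss over the quantile levels.
    losses = []
    for q, f in zip(q_levels, q_preds):
        err = y_true - f
        losses.append(np.mean(np.maximum(q * err, (q - 1) * err)))
    return 2 * np.mean(losses)

# Toy usage with hypothetical numbers.
y_train = np.array([10.0, 12.0, 11.0, 13.0, 12.0])
y_true = np.array([14.0, 13.0])
y_pred = np.array([13.5, 12.5])
print("MASE:", mase(y_true, y_pred, y_train))
q_levels = [0.1, 0.5, 0.9]
q_preds = [y_pred - 1.0, y_pred, y_pred + 1.0]  # hypothetical quantile forecasts
print("CRPS (quantile approx.):", crps_from_quantiles(y_true, q_preds, q_levels))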
src/display/utils.py
CHANGED
@@ -25,17 +25,16 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(
+auto_eval_column_dict.append(
+    ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
+)
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 # Scores
-auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["MAE_663_unscaled", ColumnContent, ColumnContent("MAE[0.5]_unscaled", "number", True)])
-auto_eval_column_dict.append(["CRPS_663_unscaled", ColumnContent, ColumnContent("CRPS_unscaled", "number", True)])
-auto_eval_column_dict.append(["Rank_663_unscaled", ColumnContent, ColumnContent("Rank_unscaled", "number", True)])
+auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])
+auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS", "number", True)])
+auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank", "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model Type", "str", False, hidden=True)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
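For context on why these append(...) entries matter: in the standard Hugging Face leaderboard template, auto_eval_column_dict is converted into a frozen dataclass whose defaults drive which columns the leaderboard displays and how they are labeled. The sketch below shows that pattern; the ColumnContent field layout is an assumption that mirrors the positional arguments used above, and only a few columns are included for brevity.

from dataclasses import dataclass, fields, make_dataclass

@dataclass
class ColumnContent:
    # Assumed field layout, mirroring ColumnContent("display name", "type", displayed_by_default, ...).
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)],
]

# Each [field_name, field_type, default] entry becomes a field on a frozen dataclass,
# as in the template's AutoEvalColumn definition.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

# Downstream code can then enumerate display names, e.g. the default-visible columns.
cols = [getattr(AutoEvalColumn, f.name) for f in fields(AutoEvalColumn)]
print([c.name for c in cols if c.displayed_by_default])  # ['Type', 'Model', 'MASE']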