annamonica committed
Commit 55a94df · 1 Parent(s): 02b7312

rename Type column and update About section

Files changed (3):
  1. app.py +1 -1
  2. src/about.py +9 -6
  3. src/display/utils.py +7 -8
app.py CHANGED
@@ -109,7 +109,7 @@ with demo:
         with gr.TabItem("🏅 By Domain - TODO", elem_id="boom-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF_DOMAIN) # TODO - update table data

-        with gr.TabItem("📝 About - TODO", elem_id="boom-benchmark-tab-table", id=2):
+        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

     with gr.Row():
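For orientation, the app.py hunk above sits inside a Gradio Blocks/Tabs layout. The following is a minimal, runnable sketch of that pattern only; the surrounding structure and placeholder values are assumptions, not copied from this repo's app.py.

import gradio as gr

# Stand-in for the markdown text imported from src/about.py (assumed placeholder).
LLM_BENCHMARKS_TEXT = "Placeholder for the About markdown."

demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("🏅 By Domain", elem_id="boom-benchmark-tab-table", id=1):
            gr.Markdown("Leaderboard table placeholder")  # init_leaderboard(...) in the real app
        with gr.TabItem("📝 About", elem_id="boom-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()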
src/about.py CHANGED
@@ -22,20 +22,23 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">BOOM 💥 Time Series Forecasting Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">BOOM 💥 Observability Time-Series Forecasting Leaderboard</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-BOOM (Benchmark of Observability Metrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Composed of real-world metrics data collected from Datadog, a leading observability platform, the benchmark captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data. For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics)
+BOOM (**B**enchmark **o**f **O**bservability **M**etrics) is a large-scale, real-world time series dataset designed for evaluating models on forecasting tasks in complex observability environments. Consisting of around 350 million time-series data points spanning 32,887 variables, the benchmark is derived from real-world metrics collected via Datadog, a leading observability platform. It therefore captures the irregularity, structural complexity, and heavy-tailed statistics typical of production observability data.
+For more information, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM) and the [BOOM GitHub repository](https://github.com/DataDog/toto?tab=readme-ov-file#boom-benchmark-of-observability-metrics).
 """

 # Which evaluations are you running? how can people reproduce what you have?
-# TODO
 LLM_BENCHMARKS_TEXT = f"""
-## How it works

-## Reproducibility
-To reproduce our results, here is the commands you can run:
+BOOM is a large-scale, real-world time series dataset designed for benchmarking forecasting models in observability environments. The dataset captures the complexity and irregularity of production observability data, making it a challenging and realistic testbed for time series forecasting. BOOM consists of approximately 350 million time-series points across 32,887 variates. The dataset is split into 2,807 individual time series, each with one or more variates.
+For more details on the dataset and its structure, please refer to the [BOOM Dataset Card](https://huggingface.co/datasets/Datadog/BOOM).
+
+The evaluation procedure is inspired by [Gift-Eval](https://github.com/SalesforceAIResearch/gift-eval): we evaluate models using **MASE (Mean Absolute Scaled Error)** for forecast accuracy, **CRPS (Continuous Ranked Probability Score)** for probabilistic forecast quality, and **Rank**, which determines overall performance and is used to order models on the leaderboard.
+
+To reproduce our results, we provide a guide in the [BOOM GitHub repository](https://github.com/DataDog/toto/tree/main/boom) that explains how to install the required dependencies and includes example notebooks demonstrating how to evaluate both foundation models and statistical baselines on BOOM.

 """

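The added LLM_BENCHMARKS_TEXT names MASE and CRPS without spelling out their definitions. As a rough, self-contained illustration only (not the leaderboard's actual Gift-Eval-style evaluation code, and the Rank aggregation is not sketched here), the two metrics can be written in NumPy as below; the function names and the quantile-based CRPS approximation are our own choices.

import numpy as np

def mase(y_true, y_pred, y_context, season=1):
    # Mean Absolute Scaled Error: forecast MAE divided by the in-context MAE
    # of a seasonal-naive forecast (season=1 means previous-value naive).
    naive_mae = np.mean(np.abs(y_context[season:] - y_context[:-season]))
    return np.mean(np.abs(y_true - y_pred)) / naive_mae

def crps_from_quantiles(y_true, q_levels, q_preds):
    # Approximate CRPS as twice the average pinball (quantile) loss over a
    # grid of predicted quantiles, a common approximation for quantile forecasters.
    losses = []
    for q, pred in zip(q_levels, q_preds):
        diff = y_true - pred
        losses.append(np.mean(np.maximum(q * diff, (q - 1) * diff)))
    return 2 * np.mean(losses)

# Toy example: noisy sine wave, 100-point context, 12-step forecast horizon.
rng = np.random.default_rng(0)
series = np.sin(np.arange(112) / 5.0) + 0.05 * rng.standard_normal(112)
context, target = series[:100], series[100:]
point_forecast = np.sin(np.arange(100, 112) / 5.0)               # pretend model output
q_levels = np.arange(0.1, 1.0, 0.1)
q_preds = [point_forecast + (q - 0.5) * 0.2 for q in q_levels]   # crude quantile band

print("MASE:", mase(target, point_forecast, context))
print("CRPS:", crps_from_quantiles(target, q_levels, q_preds))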
 
src/display/utils.py CHANGED
@@ -25,17 +25,16 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(
+    ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
+)
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 # Scores
-auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE_scaled", "number", True)])
-auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS_scaled", "number", True)])
-auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank_scaled", "number", True)])
-auto_eval_column_dict.append(["MAE_663_unscaled", ColumnContent, ColumnContent("MAE[0.5]_unscaled", "number", True)])
-auto_eval_column_dict.append(["CRPS_663_unscaled", ColumnContent, ColumnContent("CRPS_unscaled", "number", True)])
-auto_eval_column_dict.append(["Rank_663_unscaled", ColumnContent, ColumnContent("Rank_unscaled", "number", True)])
+auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])
+auto_eval_column_dict.append(["CRPS_6750_scaled", ColumnContent, ColumnContent("CRPS", "number", True)])
+auto_eval_column_dict.append(["Rank_6750_scaled", ColumnContent, ColumnContent("Rank", "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Model Type", "str", False, hidden=True)])
 # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
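In the standard Hugging Face leaderboard template that this file follows, each [attribute_name, annotation, default] entry in auto_eval_column_dict is typically folded into a frozen dataclass via make_dataclass so that columns can be referenced by attribute elsewhere in the app. The sketch below illustrates that pattern; the ColumnContent definition is a plausible stand-in inferred from the calls in the diff, not necessarily this repo's exact class.

from dataclasses import dataclass, fields, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Assumed field order, inferred from ColumnContent(...) calls in the diff above.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# A few entries rebuilt from the new side of the diff.
auto_eval_column_dict = []
auto_eval_column_dict.append(
    ["model_type_symbol", ColumnContent, ColumnContent("Type", "str", True, never_hidden=True)]
)
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["MASE_6750_scaled", ColumnContent, ColumnContent("MASE", "number", True)])

# Each triple becomes a field of a frozen dataclass; the ColumnContent default is
# stored as a class attribute, so columns can be read as AutoEvalColumn.<attr>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model_type_symbol.name)             # -> Type
print([f.default.name for f in fields(AutoEvalColumn)])  # -> ['Type', 'Model', 'MASE']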