Oleksandr Shchur committed on
Commit: d8795b1
Parent: ec1ab4f

Update leaderboard UI

Files changed (5):
  1. app.py +36 -47
  2. src/__init__.py +0 -0
  3. src/about.py +50 -0
  4. src/custom_html_js.py +99 -0
  5. src/formatting.py +28 -0
app.py CHANGED
@@ -2,18 +2,15 @@ import fev
 import gradio as gr
 import pandas as pd
 
+from src import about
+from src.custom_html_js import custom_css
+from src.formatting import make_clickable_model
+
 # Load the CSV data into a pandas DataFrame
 df = pd.read_csv(
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv"
 )
 
-markdown_text = """
-This space hosts evaluation results for time series forecasting models.
-
-Benchmark definitions, implementations of models, as well as the evaluation results for individual tasks are available under https://github.com/autogluon/fev.
-
-Currently, the results in this space are a minimal proof of concept. Stay tuned for more benchmarks, results for new models and instructions on how to contribute your results.
-"""
 
 summary_urls = [
     "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
@@ -62,46 +59,38 @@ for metric in ["WQL", "MASE"]:
     format_dict = {}
     for col in lb.columns:
         format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
-    leaderboards[metric] = highlight_zeroshot(lb.reset_index().style.format(format_dict))
-
-
-with gr.Blocks() as demo:
-    with gr.Tab("Chronos Benchmark II"):
-        gr.Markdown("""
-## Chronos Benchmark II results
-
-This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
-
-These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
-
-Each table contains the following information:
-
-* **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
-* **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
-* **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
-* **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus. Zero-shot models are highlighted in <span style="color:green; font-weight:bold;">green</span>.
-
-Lower values are better for all of the above metrics.
-
-Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
-
-        """)
-        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
-        gr.Dataframe(
-            value=leaderboards["WQL"],
-            datatype=["str", "number", "number", "number"],
-            interactive=False,
-        )
-
-        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
-        """)
-        gr.Dataframe(
-            value=leaderboards["MASE"],
-            interactive=False,
-        )
-
-    with gr.Tab("About"):
-        gr.Markdown(markdown_text)
+    lb = lb.reset_index()
+    lb["model_name"] = lb["model_name"].apply(make_clickable_model)
+    leaderboards[metric] = highlight_zeroshot(lb.style.format(format_dict))
+
+
+with gr.Blocks(css=custom_css) as demo:
+    gr.HTML(about.TITLE)
+    gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons"):
+        with gr.Tab("🏅 Chronos Benchmark II", id=0):
+            with gr.Column():
+                gr.Markdown(about.CHRONOS_BENCHMARK, elem_classes="markdown-text")
+            with gr.Tabs():
+                with gr.Tab("📊 Probabilistic forecast (WQL)"):
+                    gr.Markdown("""Forecast accuracy measured by Weighted Quantile Loss.""")
+                    gr.Dataframe(
+                        value=leaderboards["WQL"],
+                        datatype=["markdown", "number", "number", "number"],
+                        interactive=False,
+                    )
+
+                with gr.Tab("📈 Point forecast (MASE)"):
+                    gr.Markdown("""Forecast accuracy measured by Mean Absolute Scaled Error.""")
+                    gr.Dataframe(
+                        value=leaderboards["MASE"],
+                        datatype=["markdown", "number", "number", "number"],
+                        interactive=False,
+                    )
+
+        with gr.Tab("📝 About", id=1):
+            gr.Markdown(about.ABOUT_LEADERBOARD)
 
 if __name__ == "__main__":
     demo.launch()
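Note: `highlight_zeroshot` is defined elsewhere in app.py and is not touched by this commit. For readers following along, a minimal sketch of what such a helper could look like, assuming zero-shot models are the rows whose "Training corpus overlap (%)" is zero (the column name comes from the table description in src/about.py below; everything else here is hypothetical):

```python
import pandas as pd
from pandas.io.formats.style import Styler


def highlight_zeroshot(styler: Styler) -> Styler:
    """Render the names of zero-shot models in bold green (hypothetical sketch)."""

    def color_row(row: pd.Series) -> list[str]:
        # Assumes a "Training corpus overlap (%)" column where 0.0 means zero-shot.
        is_zeroshot = row.get("Training corpus overlap (%)", 1.0) == 0.0
        style = "color: green; font-weight: bold" if is_zeroshot else ""
        # Style only the model name cell; leave the numeric columns untouched.
        return [style if col == "model_name" else "" for col in row.index]

    return styler.apply(color_row, axis=1)
```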
src/__init__.py ADDED
File without changes
src/about.py ADDED
@@ -0,0 +1,50 @@
+TITLE = """<h1 align="center" id="space-title">Forecast evaluation leaderboard</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+This space hosts evaluation results for time series forecasting models.
+
+The results are obtained using [fev](https://github.com/autogluon/fev) - a lightweight library for evaluating time series forecasting models.
+"""
+
+ABOUT_LEADERBOARD = """
+## What is `fev`?
+
+[`fev`](https://github.com/autogluon/fev) is a lightweight wrapper around the 🤗 [`datasets`](https://huggingface.co/docs/datasets/en/index) library that makes it easy to benchmark time series forecasting models.
+
+For more information about `fev`, please check out [github.com/autogluon/fev](https://github.com/autogluon/fev).
+
+Currently, the results in this space are a minimal proof of concept. We plan to add new benchmark datasets and tasks in the future.
+
+## How is `fev` different from other benchmarking tools?
+Existing forecasting benchmarks usually fall into one of two categories:
+
+- Standalone datasets without any supporting infrastructure. These provide no guarantees that the results obtained by different users are comparable. For example, changing the start date or duration of the forecast horizon totally changes the meaning of the scores.
+- Bespoke end-to-end systems that combine models, datasets and forecasting tasks. Such packages usually come with lots of dependencies and assumptions, which makes extending or integrating these libraries into existing systems difficult.
+
+`fev` aims for the middle ground - it provides the core benchmarking functionality without introducing unnecessary constraints or bloated dependencies. The library supports point & probabilistic forecasting, different types of covariates, as well as all popular forecasting metrics.
+
+
+## Submitting your model
+For instructions on how to evaluate your model using `fev` and contribute your results to the leaderboard, please follow the [instructions in the GitHub repo](https://github.com/autogluon/fev/blob/main/docs/04-models.ipynb).
+"""
+
+CHRONOS_BENCHMARK = """
+## Chronos Benchmark II results
+
+This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
+
+These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.
+
+Each table contains the following information:
+
+* **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
+* **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
+* **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
+* **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus. Zero-shot models are highlighted in <span style="color:green; font-weight:bold;">green</span>.
+
+Lower values are better for all of the above metrics.
+
+Task definitions and the detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information for the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
+
+"""
src/custom_html_js.py ADDED
@@ -0,0 +1,99 @@
+custom_css = """
+
+.markdown-text {
+    font-size: 20px !important;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+"""
+
+
+# #citation-button span {
+#     font-size: 16px !important;
+# }
+
+# #citation-button textarea {
+#     font-size: 16px !important;
+# }
+
+# #citation-button > label > button {
+#     margin: 6px;
+#     transform: scale(1.3);
+# }
+
+
+# #leaderboard-table-lite {
+#     margin-top: 15px
+# }
+
+# #search-bar-table-box > div:first-child {
+#     background: none;
+#     border: none;
+# }
+
+# #search-bar {
+#     padding: 0px;
+# }
+
+# /* Hides the final AutoEvalColumn */
+# #llm-benchmark-tab-table table td:last-child,
+# #llm-benchmark-tab-table table th:last-child {
+#     display: none;
+# }
+
+# /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+# table td:first-child,
+# table th:first-child {
+#     max-width: 400px;
+#     overflow: auto;
+#     white-space: nowrap;
+# }
+
+
+# #scale-logo {
+#     border-style: none !important;
+#     box-shadow: none;
+#     display: block;
+#     margin-left: auto;
+#     margin-right: auto;
+#     max-width: 600px;
+# }
+
+# #scale-logo .download {
+#     display: none;
+# }
+# #filter_type{
+#     border: 0;
+#     padding-left: 0;
+#     padding-top: 0;
+# }
+# #filter_type label {
+#     display: flex;
+# }
+# #filter_type label > span{
+#     margin-top: var(--spacing-lg);
+#     margin-right: 0.5em;
+# }
+# #filter_type label > .wrap{
+#     width: 103px;
+# }
+# #filter_type label > .wrap .wrap-inner{
+#     padding: 2px;
+# }
+# #filter_type label > .wrap .wrap-inner input{
+#     width: 1px
+# }
+# #filter-columns-type{
+#     border:0;
+#     padding:0.5;
+# }
+# #filter-columns-size{
+#     border:0;
+#     padding:0.5;
+# }
+# #box-filter > .form{
+#     border: 0
+# }
src/formatting.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
+
4
+
5
+ MODEL_URLS = {
6
+ "chronos_tiny": "amazon/chronos-t5-tiny",
7
+ "chronos_mini": "amazon/chronos-t5-mini",
8
+ "chronos_small": "amazon/chronos-t5-small",
9
+ "chronos_base": "amazon/chronos-t5-base",
10
+ "chronos_large": "amazon/chronos-t5-large",
11
+ "chronos_bolt_tiny": "amazon/chronos-bolt-tiny",
12
+ "chronos_bolt_mini": "amazon/chronos-bolt-mini",
13
+ "chronos_bolt_small": "amazon/chronos-bolt-small",
14
+ "chronos_bolt_base": "amazon/chronos-bolt-base",
15
+ "moirai_large": "Salesforce/moirai-1.1-R-large",
16
+ "moirai_base": "Salesforce/moirai-1.1-R-base",
17
+ "moirai_small": "Salesforce/moirai-1.1-R-small",
18
+ "timesfm": "google/timesfm-1.0-200m",
19
+ }
20
+
21
+
22
+ def make_clickable_model(model_name):
23
+ if model_name in MODEL_URLS:
24
+ model_path = MODEL_URLS.get(model_name)
25
+ link = f"https://huggingface.co/{model_path}"
26
+ return model_hyperlink(link, model_name)
27
+ else:
28
+ return model_name
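Usage sketch (not part of the commit): `make_clickable_model` turns known model names into Hugging Face links and passes unknown names through unchanged, which is why app.py now renders the first leaderboard column with `datatype=["markdown", ...]`. The style attributes of the generated anchor tag are elided here for brevity.

```python
>>> from src.formatting import make_clickable_model
>>> make_clickable_model("chronos_bolt_base")
'<a target="_blank" href="https://huggingface.co/amazon/chronos-bolt-base" style="...">chronos_bolt_base</a>'
>>> make_clickable_model("auto_arima")  # not listed in MODEL_URLS, returned unchanged
'auto_arima'
```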