Commit b1a1395
Clémentine committed
1 Parent(s): ccefec9

Refactor 2 - added plotting back
Only the last submissions are taken into account; we have no way to go back to the eval date apart from loading the info from the git commit of the results files.
Also updated speed with the Gradio concurrency_limit param.
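For reference, the speed change relies on the per-listener concurrency_limit argument that Gradio exposes on event listeners; a minimal sketch of the pattern applied in app.py below (the components and callback here are illustrative placeholders, not the leaderboard's real ones):

import gradio as gr

def update_table(query: str) -> str:
    # Stand-in for the real table-filtering callback.
    return f"filtered by: {query}"

with gr.Blocks() as demo:
    search_bar = gr.Textbox(label="Search")
    table = gr.Textbox(label="Table")
    # concurrency_limit=None lifts the per-event concurrency cap, so the queue
    # no longer serializes these lightweight filtering calls.
    search_bar.change(update_table, search_bar, table, concurrency_limit=None)

demo.queue().launch()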
- app.py +24 -44
- src/display/formatting.py +3 -0
- src/display/utils.py +32 -8
- src/leaderboard/read_evals.py +5 -6
- src/populate.py +7 -6
- src/submission/check_validity.py +1 -1
- src/tools/plots.py +49 -120
app.py CHANGED

@@ -31,18 +31,15 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
 from src.tools.plots import (
-    HUMAN_BASELINES,
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
-    join_model_info_with_results,
 )


 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)

-
 try:
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
@@ -57,13 +54,11 @@ except Exception:
     restart_space()


-original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
 update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

-
-# plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
-# to_be_dumped = f"models = {repr(models)}\n"
+plot_df = create_plot_df(create_scores_df(raw_data))

 (
     finished_eval_queue_df,
@@ -72,16 +67,6 @@ leaderboard_df = original_df.copy()
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


-# Basics
-#def change_tab(query_param: str):
-#    query_param = query_param.replace("'", '"')
-#    query_param = json.loads(query_param)
-#    if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "evaluation":
-#        return gr.Tabs.update(selected=1)
-#    else:
-#        return gr.Tabs.update(selected=0)
-
-
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -247,6 +232,7 @@ with demo:
                     search_bar,
                 ],
                 leaderboard_table,
+                concurrency_limit=None,
             )
             shown_columns.change(
                 update_table,
@@ -261,6 +247,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_type.change(
                 update_table,
@@ -275,6 +262,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_precision.change(
                 update_table,
@@ -289,6 +277,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             filter_columns_size.change(
                 update_table,
@@ -303,6 +292,7 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )
             deleted_models_visibility.change(
                 update_table,
@@ -317,27 +307,25 @@ with demo:
                 ],
                 leaderboard_table,
                 queue=True,
+                concurrency_limit=None,
             )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #         title="Top Scores and Human Baseline Over Time",
-    #     )
-    #     gr.Plot(value=chart, interactive=False, width=500, height=500)
+        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+            with gr.Row():
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        [AutoEvalColumn.average.name],
+                        title="Average of Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
+                with gr.Column():
+                    chart = create_metric_plot_obj(
+                        plot_df,
+                        BENCHMARK_COLS,
+                        title="Top Scores and Human Baseline Over Time (from last update)",
+                    )
+                    gr.Plot(value=chart, min_width=500)
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
         gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

@@ -440,14 +428,6 @@ with demo:
         show_copy_button=True,
     )

-    #dummy = gr.Textbox(visible=False)
-    #demo.load(
-    #    change_tab,
-    #    dummy,
-    #    tabs,
-    #    js=get_window_url_params,
-    #)
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
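The new tab hands a Plotly figure object straight to gr.Plot. A self-contained sketch of that hand-off with toy data (this is not the leaderboard's plot_df; it only mirrors the date/score/task columns used above):

import gradio as gr
import pandas as pd
import plotly.express as px

# Toy stand-in for plot_df: best score so far per (date, task).
toy_df = pd.DataFrame(
    {
        "date": ["2023-07-01", "2023-08-01", "2023-09-01"] * 2,
        "score": [60, 64, 68, 55, 59, 66],
        "task": ["Average ⬆️"] * 3 + ["MMLU"] * 3,
    }
)

fig = px.line(toy_df, x="date", y="score", color="task", markers=True, title="Top scores over time")

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Metrics through time"):
            gr.Plot(value=fig, min_width=500)  # same call shape as the new tab above

demo.launch()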
src/display/formatting.py CHANGED

@@ -1,6 +1,9 @@
 import os
+from datetime import datetime, timezone

 from huggingface_hub import HfApi
+from huggingface_hub.hf_api import ModelInfo
+

 API = HfApi()

src/display/utils.py CHANGED

@@ -60,7 +60,7 @@ baseline_row = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
     AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name:
+    AutoEvalColumn.average.name: 31.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
@@ -72,19 +72,43 @@ baseline_row = {
     AutoEvalColumn.model_type.name: "",
 }

+# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
+# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
+# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
+# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
+# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
+# Drop: https://leaderboard.allenai.org/drop/submissions/public
+# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
+# GSM8K: paper
+# Define the human baselines
+human_baseline_row = {
+    AutoEvalColumn.model.name: "<p>Human performance</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.precision.name: None,
+    AutoEvalColumn.average.name: 92.75,
+    AutoEvalColumn.arc.name: 80.0,
+    AutoEvalColumn.hellaswag.name: 95.0,
+    AutoEvalColumn.mmlu.name: 89.8,
+    AutoEvalColumn.truthfulqa.name: 94.0,
+    AutoEvalColumn.winogrande.name: 94.0,
+    AutoEvalColumn.gsm8k.name: 100,
+    AutoEvalColumn.drop.name: 96.42,
+    AutoEvalColumn.dummy.name: "human_baseline",
+    AutoEvalColumn.model_type.name: "",
+}

 @dataclass
-class
+class ModelTypeDetails:
     name: str
     symbol: str # emoji


 class ModelType(Enum):
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+    PT = ModelTypeDetails(name="pretrained", symbol="🟢")
+    FT = ModelTypeDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelTypeDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelTypeDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelTypeDetails(name="", symbol="?")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -128,7 +152,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

-BENCHMARK_COLS = [t.value.col_name for t in Tasks
+BENCHMARK_COLS = [t.value.col_name for t in Tasks]

 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
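As a quick illustration of how these pieces fit together, each ModelType member wraps a ModelTypeDetails value and to_str produces the emoji-prefixed labels shown in the UI; a standalone sketch re-declaring the two classes from the diff (trimmed to two members):

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelTypeDetails:
    name: str
    symbol: str  # emoji

class ModelType(Enum):
    PT = ModelTypeDetails(name="pretrained", symbol="🟢")
    FT = ModelTypeDetails(name="fine-tuned", symbol="🔶")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

print(ModelType.PT.to_str())       # 🟢 pretrained
print(ModelType.FT.to_str(" : "))  # 🔶 : fine-tuned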
src/leaderboard/read_evals.py CHANGED

@@ -3,9 +3,9 @@ import json
 import math
 import os
 from dataclasses import dataclass
-from typing import Dict, List, Tuple

 import dateutil
+from datetime import datetime
 import numpy as np

 from src.display.formatting import make_clickable_model
@@ -61,8 +61,6 @@ class EvalResult:
         still_on_hub, error = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True
         )
-        if not still_on_hub:
-            print(full_model, error)

         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -100,7 +98,6 @@ class EvalResult:
             results=results,
             precision=precision,  # todo model_type=, weight_type=
             revision=config.get("model_sha", ""),
-            date=config.get("submission_date", ""),
             still_on_hub=still_on_hub,
         )

@@ -114,6 +111,7 @@ class EvalResult:
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")

@@ -162,7 +160,7 @@ def get_request_file_for_model(model_name, precision):
     return request_file


-def get_eval_results(results_path: str) -> List[EvalResult]:
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     json_filepaths = []

     for root, _, files in os.walk(results_path):
@@ -196,7 +194,8 @@ def get_eval_results(results_path: str) -> List[EvalResult]:
     results = []
     for v in eval_results.values():
         try:
-
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
         except KeyError:  # not all eval values present
             continue

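The practical effect of the read_evals changes is that an eval's date now comes from its request file's submitted_time field rather than a submission_date in the results file; a toy sketch of that lookup (the file path and JSON contents are hypothetical, only the request.get keys mirror the diff):

import json

request_path = "eval-queue/some-org/some-model_eval_request_float16.json"  # hypothetical path
with open(request_path) as f:
    request = json.load(f)

license_ = request.get("license", "?")
likes = request.get("likes", 0)
num_params = request.get("params", 0)
date = request.get("submitted_time", "")  # becomes EvalResult.date, the x-axis for the plots
print(date, license_, likes, num_params)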
src/populate.py CHANGED

@@ -6,21 +6,22 @@ import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
-from src.leaderboard.read_evals import
+from src.leaderboard.read_evals import get_raw_eval_results


 def get_leaderboard_df(results_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-
-
-
+    raw_data = get_raw_eval_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json.append(baseline_row)
+    filter_models(all_data_json)

-    df = pd.DataFrame.from_records(
+    df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return raw_data, df


 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
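One detail worth calling out in get_leaderboard_df is the final filter: any model missing a benchmark column is dropped before display. A toy pandas illustration of that check (plain notna, standing in for the repo's has_no_nan_values helper):

import pandas as pd

benchmark_cols = ["ARC", "MMLU"]
df = pd.DataFrame(
    [
        {"model": "model-a", "ARC": 61.2, "MMLU": 58.0},
        {"model": "model-b", "ARC": 70.1, "MMLU": None},  # incomplete eval, dropped below
    ]
)

# Keep only rows where every benchmark column has a value.
complete = df[df[benchmark_cols].notna().all(axis=1)]
print(complete)  # only model-a remains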
src/submission/check_validity.py CHANGED

@@ -55,7 +55,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
     size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
+    except (AttributeError, TypeError ):
         try:
             size_match = re.search(size_pattern, model_info.modelId.lower())
             model_size = size_match.group(0)
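When a model exposes no safetensors metadata, get_model_size falls back to pulling the parameter count out of the model id with the regex above; a standalone illustration with made-up model ids:

import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

for model_id in ["some-org/llama-2-7b-hf", "some-org/tiny-gpt-350m"]:
    size_match = re.search(size_pattern, model_id.lower())
    print(model_id, "->", size_match.group(0) if size_match else "unknown")
# some-org/llama-2-7b-hf -> 7b
# some-org/tiny-gpt-350m -> 350m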
src/tools/plots.py CHANGED

@@ -1,153 +1,84 @@
-import pickle
-from datetime import datetime, timezone
-from typing import Any, Dict, List, Tuple
-
 import pandas as pd
+import numpy as np
 import plotly.express as px
 from plotly.graph_objs import Figure

 from src.leaderboard.filter_models import FLAGGED_MODELS
+from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
+from src.leaderboard.read_evals import EvalResult

-# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
-# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
-# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
-# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
-# TruthfulQA human baseline is 0.94(source: https://arxiv.org/pdf/2109.07958.pdf)
-# Define the human baselines
-HUMAN_BASELINES = {
-    "Average ⬆️": 0.897 * 100,
-    "ARC": 0.80 * 100,
-    "HellaSwag": 0.95 * 100,
-    "MMLU": 0.898 * 100,
-    "TruthfulQA": 0.94 * 100,
-}
-
-
-def to_datetime(model_info: Tuple[str, Any]) -> datetime:
-    """
-    Converts the lastModified attribute of the object to datetime.
-
-    :param model_info: A tuple containing the name and object.
-                       The object must have a lastModified attribute
-                       with a string representing the date and time.
-    :return: A datetime object converted from the lastModified attribute of the input object.
-    """
-    name, obj = model_info
-    return datetime.strptime(obj.lastModified, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc)


-def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Integrates model information with the results DataFrame by matching 'Model sha'.
-    :param results_df: A DataFrame containing results information including 'Model sha' column.
-    :return: A DataFrame with updated 'Results Date' columns, which are synchronized with model information.
+def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
     """
-
-    df = results_df.copy(deep=True)
-
-    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
-    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)
-
-    # load cache from disk
-    try:
-        with open("model_info_cache.pkl", "rb") as f:
-            model_info_cache = pickle.load(f)
-    except (EOFError, FileNotFoundError):
-        model_info_cache = {}
-
-    # Sort date strings using datetime objects as keys
-    sorted_dates = sorted(list(model_info_cache.items()), key=to_datetime, reverse=True)
-    df["Results Date"] = datetime.now().replace(tzinfo=timezone.utc)
-
-    # Define the date format string
-    date_format = "%Y-%m-%dT%H:%M:%S.%fZ"
-
-    # Iterate over sorted_dates and update the dataframe
-    for name, obj in sorted_dates:
-        # Convert the lastModified string to a datetime object
-        last_modified_datetime = datetime.strptime(obj.lastModified, date_format).replace(tzinfo=timezone.utc)
-
-        # Update the "Results Date" column where "Model sha" equals obj.sha
-        df.loc[df["Model sha"] == obj.sha, "Results Date"] = last_modified_datetime
-    return df
+    Generates a DataFrame containing the maximum scores until each date.

-
-
-    """
-    Generates a DataFrame containing the maximum scores until each result date.
-
-    :param results_df: A DataFrame containing result information including metric scores and result dates.
-    :return: A new DataFrame containing the maximum scores until each result date for every metric.
+    :param results_df: A DataFrame containing result information including metric scores and dates.
+    :return: A new DataFrame containing the maximum scores until each date for every metric.
     """
-    # Step 1: Ensure '
-    results_df
-    results_df.
+    # Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
+    results_df = pd.DataFrame(raw_data)
+    #results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
+    results_df.sort_values(by="date", inplace=True)

     # Step 2: Initialize the scores dictionary
-    scores = {
-        "Average ⬆️": [],
-        "ARC": [],
-        "HellaSwag": [],
-        "MMLU": [],
-        "TruthfulQA": [],
-        "Result Date": [],
-        "Model Name": [],
-    }
+    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

     # Step 3: Iterate over the rows of the DataFrame and update the scores dictionary
-    for
-
-
-
-
-
+    for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]:
+        current_max = 0
+        last_date = ""
+        column = task.col_name
+        for _, row in results_df.iterrows():
+            current_model = row["full_model"]
+            if current_model in FLAGGED_MODELS:
                 continue
-        if column == "Model Name":
-            scores[column].append(row["model_name_for_query"])
-            continue
-        current_max = scores[column][-1] if scores[column] else float("-inf")
-        scores[column].append(max(current_max, row[column]))

-
-
+            current_date = row["date"]
+            if task.benchmark == "Average":
+                current_score = np.mean(list(row["results"].values()))
+            else:
+                current_score = row["results"][task.benchmark]
+
+            if current_score > current_max:
+                if current_date == last_date and len(scores[column]) > 0:
+                    scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score}
+                else:
+                    scores[column].append({"model": current_model, "date": current_date, "score": current_score})
+                current_max = current_score
+                last_date = current_date

+    # Step 4: Return all dictionaries as DataFrames
+    return {k: pd.DataFrame(v) for k, v in scores.items()}

-
+
+def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame:
     """
     Transforms the scores DataFrame into a new format suitable for plotting.

-    :param scores_df: A DataFrame containing metric scores and
+    :param scores_df: A DataFrame containing metric scores and dates.
     :return: A new DataFrame reshaped for plotting purposes.
     """
-    # Sample columns
-    cols = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]
-
     # Initialize the list to store DataFrames
     dfs = []

     # Iterate over the cols and create a new DataFrame for each column
-    for col in
-        d = scores_df[
-        d["
-        d.rename(columns={col: "Metric Value"}, inplace=True)
+    for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]:
+        d = scores_df[col].reset_index(drop=True)
+        d["task"] = col
         dfs.append(d)

     # Concatenate all the created DataFrames
     concat_df = pd.concat(dfs, ignore_index=True)

-    # Sort values by '
-    concat_df.sort_values(by="
-    concat_df.reset_index(drop=True, inplace=True)
-
-    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
-    concat_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first", inplace=True)
-
+    # Sort values by 'date'
+    concat_df.sort_values(by="date", inplace=True)
     concat_df.reset_index(drop=True, inplace=True)
     return concat_df


 def create_metric_plot_obj(
-    df: pd.DataFrame, metrics:
+    df: pd.DataFrame, metrics: list[str], title: str
 ) -> Figure:
     """
     Create a Plotly figure object with lines representing different metrics
@@ -156,27 +87,25 @@ def create_metric_plot_obj(
     :param df: The DataFrame containing the metric values, names, and dates.
     :param metrics: A list of strings representing the names of the metrics
                     to be included in the plot.
-    :param human_baselines: A dictionary where keys are metric names
-                            and values are human baseline values for the metrics.
     :param title: A string representing the title of the plot.
     :return: A Plotly figure object with lines representing metrics and
              horizontal dotted lines representing human baselines.
     """

     # Filter the DataFrame based on the specified metrics
-    df = df[df["
+    df = df[df["task"].isin(metrics)]

     # Filter the human baselines based on the specified metrics
-    filtered_human_baselines = {k: v for k, v in
+    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}

     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
-        x="
-        y="
-        color="
+        x="date",
+        y="score",
+        color="task",
         markers=True,
-        custom_data=["
+        custom_data=["task", "score", "model"],
         title=title,
     )

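The heart of the rewritten plotting path is create_scores_df's running maximum per benchmark over submission dates. A self-contained toy version of that idea for a single benchmark (a simplified re-implementation for illustration, not the function above):

import pandas as pd

# Toy submissions for one benchmark: (model, date, score).
rows = pd.DataFrame(
    [
        {"model": "model-a", "date": "2023-07-01", "score": 44.0},
        {"model": "model-b", "date": "2023-07-05", "score": 41.0},  # not a new best, ignored
        {"model": "model-c", "date": "2023-07-09", "score": 52.5},
    ]
).sort_values(by="date")

current_max, points = 0.0, []
for _, row in rows.iterrows():
    if row["score"] > current_max:  # keep only submissions that raise the top score
        current_max = row["score"]
        points.append({"model": row["model"], "date": row["date"], "score": current_max})

print(pd.DataFrame(points))  # the per-benchmark series that create_plot_df concatenates and sorts by date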