File size: 8,838 Bytes
df66f6e
 
 
 
f2bc0a5
 
 
df66f6e
2a5f9fb
f2bc0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a5f9fb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
import pickle
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple

import pandas as pd
import plotly.express as px
from plotly.graph_objs import Figure

from src.leaderboard.filter_models import FLAGGED_MODELS

# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
# TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
# Define the human baselines
# Baselines are listed as fractions and stored on the 0-100 scale (fraction * 100).
HUMAN_BASELINES = {
    metric: fraction * 100
    for metric, fraction in (
        ("Average ⬆️", 0.897),
        ("ARC", 0.80),
        ("HellaSwag", 0.95),
        ("MMLU", 0.898),
        ("TruthfulQA", 0.94),
    )
}


def to_datetime(model_info: Tuple[str, Any]) -> datetime:
    """
    Parse the lastModified timestamp of a (name, object) pair into a UTC-aware datetime.

    :param model_info: A two-element tuple of a name and an object. Only the object is used;
                       it must expose a lastModified string such as "2023-08-01T12:34:56.000Z".
    :return: The parsed timestamp with its tzinfo set to UTC.
    """
    # The name half of the pair is irrelevant for the conversion.
    _, info = model_info
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    parsed = datetime.strptime(info.lastModified, timestamp_format)
    return parsed.replace(tzinfo=timezone.utc)


def join_model_info_with_results(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Integrates model information with the results DataFrame by matching 'Model sha'.

    Rows whose 'model_name_for_query' is a key of FLAGGED_MODELS are dropped. Every remaining
    row gets a 'Results Date' that defaults to the current UTC time and is replaced by the
    lastModified timestamp of any cached model-info entry whose sha equals the row's 'Model sha'.
    Cache entries are applied newest-first, so when several entries share a sha the one with
    the oldest lastModified is the value that remains.

    :param results_df: A DataFrame containing results information including the
                       'model_name_for_query' and 'Model sha' columns. It is not modified.
    :return: A filtered copy of results_df with a timezone-aware (UTC) 'Results Date' column.
    """
    # copy dataframe to avoid modifying the original
    df = results_df.copy(deep=True)

    # Filter out FLAGGED_MODELS to ensure graph is not skewed by mistakes
    df = df[~df["model_name_for_query"].isin(FLAGGED_MODELS.keys())].reset_index(drop=True)

    # Load the model-info cache from disk; a missing or empty file means "no cache".
    # NOTE(security): pickle.load can execute arbitrary code, so this file must only ever be
    # produced by this application and never come from an untrusted source.
    try:
        with open("model_info_cache.pkl", "rb") as f:
            model_info_cache = pickle.load(f)
    except (EOFError, FileNotFoundError):
        model_info_cache = {}

    # Order cache entries by their lastModified timestamp, newest first
    sorted_dates = sorted(model_info_cache.items(), key=to_datetime, reverse=True)

    # Default date for models without cache information. Ask for the current time in UTC
    # directly: datetime.now().replace(tzinfo=utc) would mislabel local wall-clock time as UTC.
    df["Results Date"] = datetime.now(timezone.utc)

    # Overwrite the default with the cached lastModified wherever the sha matches
    for model_info in sorted_dates:
        _, obj = model_info
        df.loc[df["Model sha"] == obj.sha, "Results Date"] = to_datetime(model_info)
    return df


def create_scores_df(results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates a DataFrame containing the maximum scores until each result date.

    The input is ordered by 'Results Date' (rows with equal dates keep their input order) and,
    for every metric, each output row holds the best value seen up to and including that row.
    The input DataFrame is left untouched.

    :param results_df: A DataFrame containing the metric columns ('Average ⬆️', 'ARC',
                       'HellaSwag', 'MMLU', 'TruthfulQA'), a 'Results Date' column and a
                       'model_name_for_query' column.
    :return: A new DataFrame with one row per input row, holding the running maximum of every
             metric plus 'Result Date' and 'Model Name' columns.
    """
    metric_columns = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]

    # Step 1: Work on a date-sorted copy so the caller's DataFrame is neither re-typed nor
    # re-ordered. A stable sort keeps the order deterministic when dates are tied.
    ordered = results_df.assign(**{"Results Date": pd.to_datetime(results_df["Results Date"])})
    ordered = ordered.sort_values(by="Results Date", kind="stable")

    # Step 2: Compute the running maximum of every metric
    scores = {}
    for column in metric_columns:
        running_max = float("-inf")
        best_so_far = []
        for value in ordered[column]:
            running_max = max(running_max, value)
            best_so_far.append(running_max)
        scores[column] = best_so_far

    # Step 3: Carry over every date and model name unconditionally. Skipping a row here
    # (e.g. a NaT date, which never compares <=) would leave columns of unequal length.
    scores["Result Date"] = list(ordered["Results Date"])
    scores["Model Name"] = list(ordered["model_name_for_query"])

    # Step 4: Convert the dictionary to a DataFrame
    return pd.DataFrame(scores)


def create_plot_df(scores_df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms the scores DataFrame into a new format suitable for plotting.

    Each metric column is unpivoted into rows of ('Metric Value', 'Model Name', 'Result Date',
    'Metric Name'). Rows are ordered by 'Result Date' and, for every (metric, value) pair,
    only the first row is kept, i.e. the row where that value was first reached.

    :param scores_df: A DataFrame containing the metric columns ('Average ⬆️', 'ARC',
                      'HellaSwag', 'MMLU', 'TruthfulQA') plus 'Model Name' and 'Result Date'.
    :return: A new long-format DataFrame with a fresh 0..n-1 index.
    """
    metric_columns = ["Average ⬆️", "ARC", "HellaSwag", "MMLU", "TruthfulQA"]

    # Build one long-format frame per metric
    per_metric_frames = [
        scores_df[[metric, "Model Name", "Result Date"]]
        .rename(columns={metric: "Metric Value"})
        .assign(**{"Metric Name": metric})
        for metric in metric_columns
    ]

    # Concatenate all the created DataFrames
    long_df = pd.concat(per_metric_frames, ignore_index=True)

    # Sort by 'Result Date' with a stable algorithm: many rows can share the same date, and
    # an unstable sort could reorder them so that the de-duplication below keeps a later row
    # (and therefore the wrong model name) for a given value.
    long_df = long_df.sort_values(by="Result Date", kind="stable")

    # Drop duplicates based on 'Metric Name' and 'Metric Value' and keep the first (earliest) occurrence
    long_df = long_df.drop_duplicates(subset=["Metric Name", "Metric Value"], keep="first")

    return long_df.reset_index(drop=True)


def create_metric_plot_obj(
    df: pd.DataFrame, metrics: List[str], human_baselines: Dict[str, float], title: str
) -> Figure:
    """
    Build a Plotly line chart of the requested metrics over time, with one dotted
    horizontal line per human baseline.

    :param df: The DataFrame holding 'Metric Name', 'Metric Value', 'Model Name'
               and 'Result Date' columns.
    :param metrics: The names of the metrics to draw; rows for other metrics are ignored.
    :param human_baselines: A mapping from metric name to its human baseline value;
                            only entries whose key is in metrics are drawn.
    :param title: The title of the plot.
    :return: A Plotly figure with one line per metric and a dotted horizontal line
             per selected human baseline.
    """
    # Keep only the rows and baselines that belong to the requested metrics
    selected_rows = df[df["Metric Name"].isin(metrics)]
    selected_baselines = {name: score for name, score in human_baselines.items() if name in metrics}

    # Lines shown in the hover box, joined with HTML line breaks
    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]

    # One line (with markers) per metric
    fig = px.line(
        selected_rows,
        x="Result Date",
        y="Metric Value",
        color="Metric Name",
        markers=True,
        custom_data=["Metric Name", "Metric Value", "Model Name"],
        title=title,
    )
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    # Fix the y-axis to the 0-100 range
    fig.update_layout(yaxis_range=[0, 100])

    # Remember the colour of each metric's trace so its baseline can reuse it
    colors_by_metric = {trace.name: trace.line.color for trace in fig.data}

    for name, score in selected_baselines.items():
        # Baselines without a matching trace fall back to blue
        line_color = colors_by_metric.get(name, "blue")

        # The HellaSwag label goes above its line; every other label goes below
        if name == "HellaSwag":
            label_position = "top left"
        else:
            label_position = "bottom left"

        fig.add_hline(
            y=score,
            line_dash="dot",
            annotation_text=f"{name} human baseline",
            annotation_position=label_position,
            annotation_font_size=10,
            annotation_font_color=line_color,
            line_color=line_color,
        )

    return fig


# Example usage:
# plot_df is the long-format DataFrame returned by create_plot_df, and HUMAN_BASELINES is defined above.
# chart = create_metric_plot_obj(plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], HUMAN_BASELINES, "Graph Title")