andrewrreed's picture
bug fix + update new model releases dates
33eb9c4
import json
from datetime import datetime
from typing import Literal, List
import pandas as pd
import plotly.express as px
from huggingface_hub import HfFileSystem, hf_hub_download
# from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/monitor.py#L389
KEY_TO_CATEGORY_NAME = {
"full": "Overall",
"dedup": "De-duplicate Top Redundant Queries (soon to be default)",
"math": "Math",
"if": "Instruction Following",
"multiturn": "Multi-Turn",
"coding": "Coding",
"hard_6": "Hard Prompts (Overall)",
"hard_english_6": "Hard Prompts (English)",
"long_user": "Longer Query",
"english": "English",
"chinese": "Chinese",
"french": "French",
"german": "German",
"spanish": "Spanish",
"russian": "Russian",
"japanese": "Japanese",
"korean": "Korean",
"no_tie": "Exclude Ties",
"no_short": "Exclude Short Query (< 5 tokens)",
"no_refusal": "Exclude Refusal",
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"full_old": "Overall (Deprecated)",
}
CAT_NAME_TO_EXPLANATION = {
"Overall": "Overall Questions",
"De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
"Math": "Math",
"Instruction Following": "Instruction Following",
"Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
"Coding": "Coding: whether conversation contains code snippets",
"Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
"Longer Query": "Longer Query (>= 500 tokens)",
"English": "English Prompts",
"Chinese": "Chinese Prompts",
"French": "French Prompts",
"German": "German Prompts",
"Spanish": "Spanish Prompts",
"Russian": "Russian Prompts",
"Japanese": "Japanese Prompts",
"Korean": "Korean Prompts",
"Exclude Ties": "Exclude Ties and Bothbad",
"Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
"Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
"overall_limit_5_user_vote": "overall_limit_5_user_vote",
"Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
}
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]
def download_latest_data_from_space(
repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
"""
Downloads the latest data file of the specified file type from the given repository space.
Args:
repo_id (str): The ID of the repository space.
file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".
Returns:
str: The local file path of the downloaded data file.
"""
def extract_date(filename):
return filename.split("/")[-1].split(".")[0].split("_")[-1]
fs = HfFileSystem()
data_file_path = f"spaces/{repo_id}/*.{file_type}"
files = fs.glob(data_file_path)
files = [
file for file in files if "leaderboard_table" in file or "elo_results" in file
]
latest_file = sorted(files, key=extract_date, reverse=True)[0]
latest_filepath_local = hf_hub_download(
repo_id=repo_id,
filename=latest_file.split("/")[-1],
repo_type="space",
)
print(latest_file.split("/")[-1])
return latest_filepath_local
def get_constants(dfs):
"""
Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.
Parameters:
- dfs (dict): A dictionary containing DataFrames for different categories.
Returns:
- min_elo_score (float): The minimum Elo score across all DataFrames.
- max_elo_score (float): The maximum Elo score across all DataFrames.
- upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
"""
filter_ranges = {}
for k, df in dfs.items():
filter_ranges[k] = {
"min_elo_score": df["rating"].min().round(),
"max_elo_score": df["rating"].max().round(),
"upper_models_per_month": int(
df.groupby(["Month-Year", "License"])["rating"]
.apply(lambda x: x.count())
.max()
),
}
min_elo_score = float("inf")
max_elo_score = float("-inf")
upper_models_per_month = 0
for _, value in filter_ranges.items():
min_elo_score = min(min_elo_score, value["min_elo_score"])
max_elo_score = max(max_elo_score, value["max_elo_score"])
upper_models_per_month = max(
upper_models_per_month, value["upper_models_per_month"]
)
return min_elo_score, max_elo_score, upper_models_per_month
def update_release_date_mapping(
new_model_keys_to_add: List[str],
leaderboard_df: pd.DataFrame,
release_date_mapping: pd.DataFrame,
) -> pd.DataFrame:
"""
Update the release date mapping with new model keys.
Args:
new_model_keys_to_add (List[str]): A list of new model keys to add to the release date mapping.
leaderboard_df (pd.DataFrame): The leaderboard DataFrame containing the model information.
release_date_mapping (pd.DataFrame): The current release date mapping DataFrame.
Returns:
pd.DataFrame: The updated release date mapping DataFrame.
"""
# if any, add those to the release date mapping
if new_model_keys_to_add:
for key in new_model_keys_to_add:
new_entry = {
"key": key,
"Model": leaderboard_df[leaderboard_df["key"] == key]["Model"].values[
0
],
"Release Date": datetime.today().strftime("%Y-%m-%d"),
}
with open("release_date_mapping.json", "r") as file:
data = json.load(file)
data.append(new_entry)
with open("release_date_mapping.json", "w") as file:
json.dump(data, file, indent=4)
print(f"Added {key} to release_date_mapping.json")
# reload the release date mapping
release_date_mapping = pd.read_json(
"release_date_mapping.json", orient="records"
)
return release_date_mapping
def format_data(df):
"""
Formats the given DataFrame by performing the following operations:
- Converts the 'License' column values to 'Proprietary LLM' if they are in PROPRIETARY_LICENSES, otherwise 'Open LLM'.
- Converts the 'Release Date' column to datetime format.
- Adds a new 'Month-Year' column by extracting the month and year from the 'Release Date' column.
- Rounds the 'rating' column to the nearest integer.
- Resets the index of the DataFrame.
Args:
df (pandas.DataFrame): The DataFrame to be formatted.
Returns:
pandas.DataFrame: The formatted DataFrame.
"""
df["License"] = df["License"].apply(
lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
)
df["Release Date"] = pd.to_datetime(df["Release Date"])
df["Month-Year"] = df["Release Date"].dt.to_period("M")
df["rating"] = df["rating"].round()
return df.reset_index(drop=True)
def get_trendlines(fig):
trend_lines = px.get_trendline_results(fig)
return [
trend_lines.iloc[i]["px_fit_results"].params.tolist()
for i in range(len(trend_lines))
]
def find_crossover_point(b1, m1, b2, m2):
"""
Determine the X value at which two trendlines will cross over.
Parameters:
m1 (float): Slope of the first trendline.
b1 (float): Intercept of the first trendline.
m2 (float): Slope of the second trendline.
b2 (float): Intercept of the second trendline.
Returns:
float: The X value where the two trendlines cross.
"""
if m1 == m2:
raise ValueError("The trendlines are parallel and do not cross.")
x_crossover = (b2 - b1) / (m1 - m2)
return x_crossover