# Spaces: Running on CPU Upgrade
import json | |
from datetime import datetime | |
from typing import Literal, List | |
import pandas as pd | |
from huggingface_hub import HfFileSystem, hf_hub_download | |
# Short category keys mapped to the category names used for display.
# Insertion order is kept as-is in case callers rely on it for ordering.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}

# Longer explanation for each category name. The keys are looked up through
# KEY_TO_CATEGORY_NAME so a category name is only ever spelled out once.
CAT_NAME_TO_EXPLANATION = {
    KEY_TO_CATEGORY_NAME["full"]: "Overall Questions",
    KEY_TO_CATEGORY_NAME["coding"]: "Coding: whether conversation contains code snippets",
    KEY_TO_CATEGORY_NAME["long_user"]: "Longer Query (>= 500 tokens)",
    KEY_TO_CATEGORY_NAME["english"]: "English Prompts",
    KEY_TO_CATEGORY_NAME["chinese"]: "Chinese Prompts",
    KEY_TO_CATEGORY_NAME["french"]: "French Prompts",
    KEY_TO_CATEGORY_NAME["no_tie"]: "Exclude Ties and Bothbad",
    KEY_TO_CATEGORY_NAME["no_short"]: "Exclude Short User Query (< 5 tokens)",
    KEY_TO_CATEGORY_NAME["no_refusal"]: (
        'Exclude model responses with refusal (e.g., "I cannot answer")'
    ),
}

# License values that format_data labels "Proprietary LLM"; any other value
# is labelled "Open LLM".
PROPRIETARY_LICENSES = ["Proprietary"]
def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Download the latest data file of the given type from a Hugging Face Space.

    Candidate files are found with the glob ``spaces/{repo_id}/*.{file_type}``.
    Each candidate is ranked by its "date token": the text after the last
    underscore of the file's base name, up to the first dot. The candidate
    with the highest token wins and is fetched with ``hf_hub_download``.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): Extension of the data file to
            download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If the Space has no file with the requested
            extension.
    """

    def extract_date(filename):
        # e.g. "spaces/org/name/results_20240101.pkl" -> "20240101"
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    files = fs.glob(f"spaces/{repo_id}/*.{file_type}")

    # Fail with an explicit message instead of an opaque IndexError when the
    # glob matches nothing (wrong repo id, wrong extension, empty Space).
    if not files:
        raise FileNotFoundError(
            f"No '*.{file_type}' file found in space '{repo_id}'"
        )

    # Tokens are compared as strings, so this presumably relies on a
    # fixed-width, zero-padded date format (e.g. YYYYMMDD) -- TODO confirm.
    # max() returns the first file among equal tokens, which is the same file
    # a stable descending sort followed by [0] would pick.
    latest_file = max(files, key=extract_date)

    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
def get_constants(dfs): | |
""" | |
Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month. | |
Parameters: | |
- dfs (dict): A dictionary containing DataFrames for different categories. | |
Returns: | |
- min_elo_score (float): The minimum Elo score across all DataFrames. | |
- max_elo_score (float): The maximum Elo score across all DataFrames. | |
- upper_models_per_month (int): The maximum number of models per month per license across all DataFrames. | |
""" | |
filter_ranges = {} | |
for k, df in dfs.items(): | |
filter_ranges[k] = { | |
"min_elo_score": df["rating"].min().round(), | |
"max_elo_score": df["rating"].max().round(), | |
"upper_models_per_month": int( | |
df.groupby(["Month-Year", "License"])["rating"] | |
.apply(lambda x: x.count()) | |
.max() | |
), | |
} | |
min_elo_score = float("inf") | |
max_elo_score = float("-inf") | |
upper_models_per_month = 0 | |
for _, value in filter_ranges.items(): | |
min_elo_score = min(min_elo_score, value["min_elo_score"]) | |
max_elo_score = max(max_elo_score, value["max_elo_score"]) | |
upper_models_per_month = max( | |
upper_models_per_month, value["upper_models_per_month"] | |
) | |
return min_elo_score, max_elo_score, upper_models_per_month | |
def update_release_date_mapping(
    new_model_keys_to_add: List[str],
    leaderboard_df: pd.DataFrame,
    release_date_mapping: pd.DataFrame,
    mapping_path: str = "release_date_mapping.json",
) -> pd.DataFrame:
    """
    Add new model keys to the release date mapping file and reload it.

    Every new key gets an entry holding the key, the model name looked up in
    ``leaderboard_df`` and today's date as its release date. The JSON file is
    read once, extended with all new entries and written back once; entries
    are built before anything is written, so a failed lookup leaves the file
    untouched.

    Args:
        new_model_keys_to_add (List[str]): A list of new model keys to add to
            the release date mapping.
        leaderboard_df (pd.DataFrame): The leaderboard DataFrame; must have
            the columns "key" and "Model".
        release_date_mapping (pd.DataFrame): The current release date mapping
            DataFrame. Returned as-is when there is nothing to add.
        mapping_path (str): Path of the JSON file (a list of records) that
            stores the mapping. Defaults to "release_date_mapping.json".

    Returns:
        pd.DataFrame: The updated release date mapping, reloaded from
        ``mapping_path``; or ``release_date_mapping`` itself when
        ``new_model_keys_to_add`` is empty.

    Raises:
        IndexError: If a key has no matching row in ``leaderboard_df``.
    """
    # Nothing to add: skip all file I/O and hand back the mapping unchanged.
    if not new_model_keys_to_add:
        return release_date_mapping

    today = datetime.today().strftime("%Y-%m-%d")

    # Build every entry up front so that a missing key aborts before the
    # mapping file is modified.
    new_entries = [
        {
            "key": key,
            # First leaderboard row with this key supplies the model name.
            "Model": leaderboard_df.loc[
                leaderboard_df["key"] == key, "Model"
            ].values[0],
            "Release Date": today,
        }
        for key in new_model_keys_to_add
    ]

    # Single read-modify-write cycle instead of one per key.
    with open(mapping_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    data.extend(new_entries)
    with open(mapping_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    for key in new_model_keys_to_add:
        print(f"Added {key} to {mapping_path}")

    # Reload so the returned DataFrame reflects exactly what is on disk.
    return pd.read_json(mapping_path, orient="records")
def format_data(df):
    """
    Normalise a leaderboard DataFrame for display and filtering.

    Steps, in order:
    - 'License' becomes 'Proprietary LLM' when the value is listed in
      PROPRIETARY_LICENSES, otherwise 'Open LLM'.
    - 'Release Date' is parsed to datetime.
    - 'Month-Year' is added as the monthly period of 'Release Date'.
    - 'rating' is rounded to zero decimal places.
    - The index is reset (old index dropped) on the returned DataFrame.

    Note: the column changes are written into ``df`` itself; only the index
    reset is applied to a new object. Passing an already formatted frame
    again re-applies the license mapping to the already mapped labels.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted. Must contain
            the columns 'License', 'Release Date' and 'rating'.

    Returns:
        pandas.DataFrame: The formatted DataFrame with a fresh index.
    """

    def license_group(license_name):
        # Collapse the raw license value into one of two display groups.
        if license_name in PROPRIETARY_LICENSES:
            return "Proprietary LLM"
        return "Open LLM"

    df["License"] = df["License"].apply(license_group)

    release_dates = pd.to_datetime(df["Release Date"])
    df["Release Date"] = release_dates
    df["Month-Year"] = release_dates.dt.to_period("M")

    df["rating"] = df["rating"].round()

    formatted = df.reset_index(drop=True)
    return formatted