import json
from datetime import datetime
from typing import Literal, List

import pandas as pd
import plotly.express as px
from huggingface_hub import HfFileSystem, hf_hub_download

# from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/monitor.py#L389
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "dedup": "De-duplicate Top Redundant Queries (soon to be default)",
    "math": "Math",
    "if": "Instruction Following",
    "multiturn": "Multi-Turn",
    "coding": "Coding",
    "hard_6": "Hard Prompts (Overall)",
    "hard_english_6": "Hard Prompts (English)",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "german": "German",
    "spanish": "Spanish",
    "russian": "Russian",
    "japanese": "Japanese",
    "korean": "Korean",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "full_old": "Overall (Deprecated)",
}

CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "De-duplicate Top Redundant Queries (soon to be default)": "De-duplicate top redundant queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
    "Math": "Math",
    "Instruction Following": "Instruction Following",
    "Multi-Turn": "Multi-Turn Conversation (>= 2 turns)",
    "Coding": "Coding: whether conversation contains code snippets",
    "Hard Prompts (Overall)": "Hard Prompts (Overall): details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Hard Prompts (English)": "Hard Prompts (English), note: the delta is to English Category. details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/)",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "German": "German Prompts",
    "Spanish": "Spanish Prompts",
    "Russian": "Russian Prompts",
    "Japanese": "Japanese Prompts",
    "Korean": "Korean Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
    "overall_limit_5_user_vote": "overall_limit_5_user_vote",
    "Overall (Deprecated)": "Overall without De-duplicating Top Redundant Queries (top 0.1%). See details in [blog post](https://lmsys.org/blog/2024-05-17-category-hard/#note-enhancing-quality-through-de-duplication).",
}

# Both spellings are matched when normalizing the "License" column in format_data.
PROPRIETARY_LICENSES = ["Proprietary", "Proprietory"]


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given Space.

    Args:
        repo_id (str): The ID of the Hugging Face Space.
        file_type (Literal["pkl", "csv"]): The type of data file to download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.
    """

    def extract_date(filename):
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    files = [
        file for file in files if "leaderboard_table" in file or "elo_results" in file
    ]
    # Filenames end in a date token (see extract_date), so a reverse sort puts the newest file first.
    latest_file = sorted(files, key=extract_date, reverse=True)[0]
    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
    print(latest_file.split("/")[-1])
    return latest_filepath_local
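
# Example usage (illustrative sketch; the Space id below is an assumption about
# the caller, not something this module defines):
#
#   elo_results_path = download_latest_data_from_space(
#       repo_id="lmsys/chatbot-arena-leaderboard", file_type="pkl"
#   )
#   leaderboard_csv_path = download_latest_data_from_space(
#       repo_id="lmsys/chatbot-arena-leaderboard", file_type="csv"
#   )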


def get_constants(dfs):
    """
    Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.

    Parameters:
    - dfs (dict): A dictionary containing DataFrames for different categories.

    Returns:
    - min_elo_score (float): The minimum Elo score across all DataFrames.
    - max_elo_score (float): The maximum Elo score across all DataFrames.
    - upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
    """
    filter_ranges = {}
    for k, df in dfs.items():
        filter_ranges[k] = {
            "min_elo_score": df["rating"].min().round(),
            "max_elo_score": df["rating"].max().round(),
            "upper_models_per_month": int(
                df.groupby(["Month-Year", "License"])["rating"]
                .apply(lambda x: x.count())
                .max()
            ),
        }

    min_elo_score = float("inf")
    max_elo_score = float("-inf")
    upper_models_per_month = 0
    for _, value in filter_ranges.items():
        min_elo_score = min(min_elo_score, value["min_elo_score"])
        max_elo_score = max(max_elo_score, value["max_elo_score"])
        upper_models_per_month = max(
            upper_models_per_month, value["upper_models_per_month"]
        )

    return min_elo_score, max_elo_score, upper_models_per_month
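
# Example usage (sketch, with hypothetical per-category frames): each DataFrame in
# `dfs` is expected to already carry "rating", "Month-Year", and "License" columns,
# e.g. after being run through format_data below:
#
#   dfs = {"Overall": format_data(overall_df), "Coding": format_data(coding_df)}
#   min_elo, max_elo, max_models_per_month = get_constants(dfs)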


def update_release_date_mapping(
    new_model_keys_to_add: List[str],
    leaderboard_df: pd.DataFrame,
    release_date_mapping: pd.DataFrame,
) -> pd.DataFrame:
    """
    Update the release date mapping with new model keys.

    Args:
        new_model_keys_to_add (List[str]): A list of new model keys to add to the release date mapping.
        leaderboard_df (pd.DataFrame): The leaderboard DataFrame containing the model information.
        release_date_mapping (pd.DataFrame): The current release date mapping DataFrame.

    Returns:
        pd.DataFrame: The updated release date mapping DataFrame.
    """
    # if any, add those to the release date mapping
    if new_model_keys_to_add:
        for key in new_model_keys_to_add:
            new_entry = {
                "key": key,
                "Model": leaderboard_df[leaderboard_df["key"] == key]["Model"].values[
                    0
                ],
                "Release Date": datetime.today().strftime("%Y-%m-%d"),
            }

            with open("release_date_mapping.json", "r") as file:
                data = json.load(file)

            data.append(new_entry)

            with open("release_date_mapping.json", "w") as file:
                json.dump(data, file, indent=4)

            print(f"Added {key} to release_date_mapping.json")

        # reload the release date mapping
        release_date_mapping = pd.read_json(
            "release_date_mapping.json", orient="records"
        )

    return release_date_mapping
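
# release_date_mapping.json is assumed to be a flat list of records with the same
# shape as `new_entry` above, e.g. (values illustrative):
#
#   [
#       {"key": "<model-key>", "Model": "<display name>", "Release Date": "YYYY-MM-DD"},
#       ...
#   ]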


def format_data(df):
    """
    Formats the given DataFrame by performing the following operations:
    - Converts the 'License' column values to 'Proprietary LLM' if they are in PROPRIETARY_LICENSES, otherwise 'Open LLM'.
    - Converts the 'Release Date' column to datetime format.
    - Adds a new 'Month-Year' column by extracting the month and year from the 'Release Date' column.
    - Rounds the 'rating' column to the nearest integer.
    - Resets the index of the DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to be formatted.

    Returns:
        pandas.DataFrame: The formatted DataFrame.
    """
    df["License"] = df["License"].apply(
        lambda x: "Proprietary LLM" if x in PROPRIETARY_LICENSES else "Open LLM"
    )
    df["Release Date"] = pd.to_datetime(df["Release Date"])
    df["Month-Year"] = df["Release Date"].dt.to_period("M")
    df["rating"] = df["rating"].round()
    return df.reset_index(drop=True)
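
# Example (sketch): a row with License "Proprietary" and Release Date "2024-03-12"
# comes out with License "Proprietary LLM", Month-Year Period("2024-03", "M"), and
# a rating rounded to the nearest integer; any license not in PROPRIETARY_LICENSES
# becomes "Open LLM".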


def get_trendlines(fig):
    """Return the fitted parameters of each trendline in `fig` ([intercept, slope] for an OLS trendline)."""
    trend_lines = px.get_trendline_results(fig)
    return [
        trend_lines.iloc[i]["px_fit_results"].params.tolist()
        for i in range(len(trend_lines))
    ]


def find_crossover_point(b1, m1, b2, m2):
    """
    Determine the X value at which two trendlines will cross over.

    Parameters:
        b1 (float): Intercept of the first trendline.
        m1 (float): Slope of the first trendline.
        b2 (float): Intercept of the second trendline.
        m2 (float): Slope of the second trendline.

    Returns:
        float: The X value where the two trendlines cross.

    Raises:
        ValueError: If the trendlines are parallel and never cross.
    """
    if m1 == m2:
        raise ValueError("The trendlines are parallel and do not cross.")
    x_crossover = (b2 - b1) / (m1 - m2)
    return x_crossover
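
# Worked example (illustrative numbers only): for trendlines y = 950 + 0.5 * x and
# y = 900 + 1.0 * x, the crossover is at x = (900 - 950) / (0.5 - 1.0) = 100, in
# whatever x-units the figure uses (e.g. days since the earliest release date).
#
#   (b1, m1), (b2, m2) = get_trendlines(fig)  # OLS params arrive as [intercept, slope]
#   crossover_x = find_crossover_point(b1, m1, b2, m2)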