"""Collect per-model output-length statistics and length-controlled win rates.

Scans every model directory under ``TOP_LEVEL_DIRECTORY``, measures the word
and token length of each model's outputs, computes the length-controlled win
rate from the model's annotations, and writes the combined table to
``data/model_win_rates.jsonl`` (one JSON record per model).
"""
import json
import os

import pandas as pd
import tiktoken
from alpaca_eval import utils, metrics, annotators, constants, analyze, plotting, main
from alpaca_eval.metrics.glm_winrate import get_length_controlled_winrate

# Directory that holds one sub-directory per evaluated model.
TOP_LEVEL_DIRECTORY = "submodules/alpaca_eval/results"

# Maps model name -> DataFrame loaded from that model's "model_outputs.json".
model_dataframes_outputs = {}

for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    model_output_file = os.path.join(model_dir, "model_outputs.json")
    if not os.path.exists(model_output_file):
        continue
    df = pd.read_json(model_output_file)
    df["model_name"] = model_name
    model_dataframes_outputs[model_name] = df


def get_num_words(text):
    """Return the number of whitespace-separated words in ``text``.

    ``text`` is coerced with ``str`` first, matching ``get_num_tokens``, so a
    non-string cell does not raise ``AttributeError``.
    """
    return len(str(text).split())


ENCODING = tiktoken.get_encoding("cl100k_base")


def get_num_tokens(text):
    """Uses tiktoken to get the number of tokens in the text.

    ``disallowed_special=()`` makes text that happens to spell a special token
    (for example ``<|endoftext|>``) get encoded as ordinary text instead of
    raising ``ValueError``, so no exception handler or debugger hook is needed.
    """
    return len(ENCODING.encode(str(text), disallowed_special=()))


def _length_stats(lengths):
    """Return the truncated-to-int mean and std of a numeric Series."""
    return {
        "mean": int(lengths.mean()),
        "std": int(lengths.std()),
    }


# Per-model {"mean", "std"} of output length, in words and in tokens.
model_name_to_num_words = {}
model_name_to_num_tokens = {}

for model_name, model_dataframe in model_dataframes_outputs.items():
    print(f"model_name_to_num_words for {model_name}")
    model_dataframe["output_num_words"] = model_dataframe["output"].apply(
        get_num_words
    )
    model_dataframe["output_num_tokens"] = model_dataframe["output"].apply(
        get_num_tokens
    )
    model_name_to_num_words[model_name] = _length_stats(
        model_dataframe["output_num_words"]
    )
    model_name_to_num_tokens[model_name] = _length_stats(
        model_dataframe["output_num_tokens"]
    )

# Transpose so that each row is a model and the columns are "mean" / "std".
num_words_df = pd.DataFrame(model_name_to_num_words).T
num_tokens_df = pd.DataFrame(model_name_to_num_tokens).T

# Maps model name -> result of get_length_controlled_winrate on its annotations.
model_name_to_win_rate = {}

for model_name in os.listdir(TOP_LEVEL_DIRECTORY):
    print(f"model_name_to_win_rate for {model_name}")
    model_dir = os.path.join(TOP_LEVEL_DIRECTORY, model_name)
    if not os.path.isdir(model_dir):
        continue
    annotations_file = os.path.join(
        model_dir, "weighted_alpaca_eval_gpt4_turbo", "annotations.json"
    )
    if not os.path.exists(annotations_file):
        continue
    model_dataframe = pd.read_json(annotations_file)
    model_name_to_win_rate[model_name] = get_length_controlled_winrate(
        model_dataframe
    )

win_rate_df = pd.DataFrame(model_name_to_win_rate).T

# Both length tables use the column names "mean" and "std", so the word
# columns are renamed before the token table is joined to avoid a collision.
df = num_words_df.join(win_rate_df, how="inner")
df = df.rename(
    columns={
        "mean": "num_words_mean",
        "std": "num_words_std",
    }
)
df = df.join(num_tokens_df, how="inner")
df = df.rename(
    columns={
        "mean": "num_tokens_mean",
        "std": "num_tokens_std",
    }
)
df["model_name"] = df.index

# Keep only models whose length-controlled win rate exceeds 25
# (presumably a percentage -- confirm against alpaca_eval's output scale).
df = df[df["length_controlled_winrate"] > 25]

# to_json does not create missing parent directories.
os.makedirs("data", exist_ok=True)
df.to_json("data/model_win_rates.jsonl", orient="records", lines=True)