"""Build per-turn sample tables from evaluation detail datasets on the Hub.

Running this file as a script loads the details for every entry in ``MODELS``
and pretty-prints the resulting dataframe.
"""

import json
import os
from pprint import pprint

import pandas as pd
from datasets import load_dataset

pd.options.plotting.backend = "plotly"

# Model identifiers, used to build the dataset repository name
# (``SaylorTwift/details_{model}_private``).
MODELS = [
    "mistralai__Mistral-7B-Instruct-v0.2",
    # "HuggingFaceH4__zephyr-7b-beta",
    # "meta-llama__Llama-2-7b-chat-hf",
    # "01-ai__Yi-34B-Chat",
]

# Access token forwarded to ``load_dataset``; ``None`` when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Maps a zero-based turn index to the key of its score in a sample's "metrics".
score_turn = {
    1: "multi_turn",
    0: "single_turn",
}


def get_dataframe_lighteval() -> pd.DataFrame:
    """Load the detail datasets of every model in ``MODELS`` as one dataframe.

    Each predicted turn of each dataset sample becomes one row with the
    columns ``model``, ``turn``, ``prompt``, ``response``,
    ``judgement_prompt``, ``judgment``, ``score`` and ``question_id``.

    Returns:
        A dataframe with one row per (model, sample, turn).

    Raises:
        KeyError: If a sample has a turn index missing from ``score_turn``
            (only turns 0 and 1 are mapped).
    """
    samples = []

    for model in MODELS:
        details_lighteval = load_dataset(
            f"SaylorTwift/details_{model}_private",
            "extended_mt_bench_0",
            split="latest",
            token=HF_TOKEN,
        )

        for d in details_lighteval:
            # "judement_prompt" (sic) is the key name used by the dataset.
            judgement_prompts = d["judement_prompt"]
            judgements = d["judgement"]
            predictions = d["predictions"][0]
            prompts = d["full_prompt"]

            for turn, prediction in enumerate(predictions):
                prompt = prompts[turn]
                if turn == 1:
                    # The second-turn prompt is a template that embeds the
                    # model's answer to the first turn.
                    prompt = prompt.format(model_response=predictions[turn - 1])

                samples.append(
                    {
                        "model": model,
                        "turn": turn,
                        "prompt": prompt,
                        "response": prediction,
                        "judgement_prompt": judgement_prompts[turn],
                        "judgment": judgements[turn],
                        "score": d["metrics"][score_turn[turn]],
                        "question_id": d["specifics"]["id"],
                    }
                )

    return pd.DataFrame(samples)


def construct_dataframe() -> pd.DataFrame:
    """Return the loaded samples indexed by ``question_id``.

    The ``model`` column is reduced to the part after the ``"__"`` separator
    (e.g. ``"mistralai__Mistral-7B-Instruct-v0.2"`` becomes
    ``"Mistral-7B-Instruct-v0.2"``) and rows containing missing values are
    dropped.
    """
    lighteval = get_dataframe_lighteval()
    lighteval["model"] = lighteval["model"].apply(lambda x: x.split("__")[1])

    # The set_index/reset_index round trip moves the three key columns to the
    # front, so after indexing by question_id the frame starts with
    # ``turn`` and ``model``.
    lighteval = lighteval.set_index(["question_id", "turn", "model"])
    all_samples = lighteval.reset_index()
    all_samples = all_samples.set_index("question_id")

    return all_samples.dropna()


def create_plot(model: str, dataframe: pd.DataFrame):
    """Return a grouped bar chart comparing two score columns for one model.

    Only complete rows of ``model`` with ``turn == 1`` are plotted, one group
    of bars per index value.

    NOTE(review): this reads the columns ``score_lighteval`` and
    ``score_mt_bench``; ``construct_dataframe`` as written in this file only
    produces a single ``score`` column, so its output cannot be passed here
    unchanged -- confirm where the expected frame comes from.

    Args:
        model: Value of the ``model`` column to keep.
        dataframe: Frame with ``model``, ``turn``, ``score_lighteval`` and
            ``score_mt_bench`` columns.

    Returns:
        The figure produced by the configured pandas plotting backend.
    """
    selected = dataframe[dataframe["model"] == model].dropna()
    selected = selected[selected["turn"] == 1]

    # Select and cast in one step so nothing is assigned into a filtered
    # slice of the caller's frame (which can raise SettingWithCopyWarning).
    scores = selected[["score_lighteval", "score_mt_bench"]].astype(int)
    scores.index = scores.index.astype(str)

    fig = scores.plot.bar(
        title="Scores",
        labels={"index": "Index", "value": "Score"},
        barmode="group",
    )
    return fig


def get_scores(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return the mean score per model, weighting every turn equally.

    Scores are cast to ``int``, averaged per (model, turn), and those per-turn
    means are then averaged per model.

    Args:
        dataframe: Frame with ``score``, ``turn`` and ``model`` columns. Rows
            containing missing values are ignored.

    Returns:
        A dataframe indexed by ``model`` with a single ``score`` column.
    """
    cleaned = dataframe.dropna()
    # ``assign`` builds a new frame instead of writing into ``cleaned``.
    scores = cleaned[["score", "turn", "model"]].assign(
        score=cleaned["score"].astype(int)
    )

    per_turn = scores.groupby(["model", "turn"]).mean()
    return per_turn.groupby(["model"]).mean()


if __name__ == "__main__":
    df = construct_dataframe()
    pprint(df)
    # print(df.iloc[130])
    # model = "zephyr-7b-beta"
    # fig = create_plot(model, df)
    # fig.show()