leaderboard / cli /analysis-cli.py
pminervini's picture
update
5a94e04
raw history blame
No virus
4.77 kB
#!/usr/bin/env python3
import os
import sys
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage
from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
from src.envs import QUEUE_REPO, RESULTS_REPO, API
from src.utils import my_snapshot_download
def find_json_files(json_path):
res = []
for root, dirs, files in os.walk(json_path):
for file in files:
if file.endswith(".json"):
res.append(os.path.join(root, file))
return res
my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
model_name_to_model_map = {}
for path in request_path_lst:
with open(path, 'r') as f:
data = json.load(f)
model_name_to_model_map[data["model"]] = data
model_dataset_metric_to_result_map = {}
data_map = {}
for path in result_path_lst:
with open(path, 'r') as f:
data = json.load(f)
model_name = data["config"]["model_name"]
for dataset_name, results_dict in data["results"].items():
for metric_name, value in results_dict.items():
# print(model_name, dataset_name, metric_name, value)
if ',' in metric_name and '_stderr' not in metric_name \
and 'f1' not in metric_name \
and model_name_to_model_map[model_name]["likes"] > 256:
to_add = True
if 'selfcheck' in dataset_name:
if 'max' not in metric_name:
to_add = False
if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
to_add = False
# pass
# breakpoint()
if 'bertscore' in metric_name:
if 'precision' not in metric_name:
to_add = False
if 'correctness,' in metric_name or 'em,' in metric_name:
to_add = False
if 'rouge' in metric_name:
if 'rougeL' not in metric_name:
to_add = False
if 'ifeval' in dataset_name:
if 'prompt_level_strict_acc' not in metric_name:
to_add = False
if 'squad' in dataset_name:
to_add = False
if 'fever' in dataset_name:
to_add = False
if 'rouge' in metric_name:
value /= 100.0
if to_add:
sanitised_metric_name = metric_name.split(',')[0]
model_dataset_metric_to_result_map[(model_name, dataset_name, sanitised_metric_name)] = value
# if (model_name, dataset_name) not in data_map:
# data_map[(model_name, dataset_name)] = {}
# data_map[(model_name, dataset_name)][metric_name] = value
if model_name not in data_map:
data_map[model_name] = {}
data_map[model_name][(dataset_name, sanitised_metric_name)] = value
print('model_name', model_name, 'dataset_name', dataset_name, 'metric_name', metric_name, 'value', value)
model_name_lst = [m for m in data_map.keys()]
for m in model_name_lst:
if len(data_map[m]) < 8:
del data_map[m]
df = pd.DataFrame.from_dict(data_map, orient='index')
o_df = df.copy(deep=True)
print(df)
# Check for NaN or infinite values and replace them
df.replace([np.inf, -np.inf], np.nan, inplace=True) # Replace infinities with NaN
df.fillna(0, inplace=True) # Replace NaN with 0 (or use another imputation strategy)
from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# df = pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)
sns.set_context("notebook", font_scale=1.0)
# fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
fig = sns.clustermap(df, method='ward', metric='euclidean', cmap='coolwarm', figsize=(16, 12), annot=True, mask=o_df.isnull())
# Adjust the size of the cells (less wide)
plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
# Save the clustermap to file
fig.savefig('plots/clustermap.pdf')
fig.savefig('plots/clustermap.png')