# NOTE: page-header residue from the web capture ("Spaces: Sleeping"); not part of the module.
import os | |
import re | |
import math | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mtick | |
import seaborn as sns | |
import nltk | |
import evaluate | |
# Metric object loaded once at import time and bound to the module-level name `meteor`.
meteor = evaluate.load("meteor")

# Print the path of the file this module was loaded from.
print(f"loading: {__file__}")
# final version
# Matches any run of five or more consecutive whitespace characters.
pattern_excessive_whitespaces = re.compile(r"\s{5,}")
# Matches a chunk of at least five characters (group 1) that is followed by one
# or more further copies of itself, with optional whitespace between copies.
# DOTALL lets `.` match newlines, so a repeated chunk may span several lines.
pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
def del_excessive_whitespaces(text, debug=False):
    """Remove every run of five or more whitespace characters from ``text``.

    Args:
        text: Value to clean. Anything that is not a ``str`` is returned as-is.
        debug: When true, print progress messages.

    Returns:
        A ``(text, count)`` tuple: the cleaned text and the number of
        characters that were removed (``0`` for non-string input).
    """
    if not isinstance(text, str):
        return text, 0

    if debug:
        print("----detect excessive whitespaces----")

    cleaned = pattern_excessive_whitespaces.sub("", text)
    # The substitution replaces with "", so the length difference is exactly
    # the number of whitespace characters dropped.
    count = len(text) - len(cleaned)

    if debug and count:
        print(f"removed excessive whitespaces: {count}")
    return cleaned, count
# final version for repetition detection
def detect_text_repetitions(text, debug=False):
    """Count the characters of ``text`` covered by repeated-text matches.

    Args:
        text: Value to scan. Anything that is not a ``str`` yields ``0``.
        debug: When true, print each match and its capture groups.

    Returns:
        The summed length of all ``pattern_text_repetitions`` matches.
    """
    if not isinstance(text, str):
        return 0

    if debug:
        print("----detect text repetitions----")

    count = 0
    for match in pattern_text_repetitions.finditer(text):
        if debug:
            print(match)
            # Capture groups are numbered from 1; group 0 is the whole match.
            for group_num in range(1, len(match.groups()) + 1):
                print(
                    "Group {groupNum} found at {start}-{end}: `{group}`".format(
                        groupNum=group_num,
                        start=match.start(group_num),
                        end=match.end(group_num),
                        group=match.group(group_num),
                    )
                )

        # The whole match (first copy plus its repeats) counts as repetition.
        start, end = match.span()
        count += end - start

    return count
def detect_repetitions(text, debug=False):
    """Measure whitespace runs and repeated text in ``text``.

    Excessive whitespace is stripped first; repeated text is then counted on
    the stripped result.

    Args:
        text: Value to analyse (non-strings produce all-zero counts).
        debug: Forwarded to both detectors; also prints the final tuple.

    Returns:
        ``(count_excessive_whitespaces, count_text_repetitions, total)`` where
        ``total`` is the sum of the first two.
    """
    text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
    count_text_repetitions = detect_text_repetitions(text, debug=debug)

    result = (
        count_excessive_whitespaces,
        count_text_repetitions,
        count_excessive_whitespaces + count_text_repetitions,
    )

    if debug:
        print(result)
    return result
def detect_scores(text, debug=False):
    """Return the repetition counts for ``text`` as a three-element Series.

    The Series holds, in order, the excessive-whitespace count, the
    repeated-text count and their total, as returned by
    ``detect_repetitions``. Returning a Series lets ``Series.apply`` expand
    the result into three columns.
    """
    newline_score, repetition_score, total_repetitions = detect_repetitions(
        text, debug=debug
    )
    return pd.Series([newline_score, repetition_score, total_repetitions])
def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
    """Load a result CSV, adding the repetition score columns when needed.

    Args:
        result_file: Path of the CSV file to read (and rewrite when scores
            are computed).
        force_recalculate: Recompute the score columns even if they exist.

    Returns:
        The DataFrame, containing ``newline_score``, ``repetition_score`` and
        ``total_repetitions`` columns.
    """
    print(f"loading result file: {result_file}")
    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")

    score_columns = ["newline_score", "repetition_score", "total_repetitions"]
    scores_missing = any(column not in df.columns for column in score_columns)

    if force_recalculate or scores_missing:
        # Scores are derived from the "answer" column, one row at a time.
        df[score_columns] = df["answer"].apply(detect_scores)
        # Persist the scores so later loads can skip the computation.
        df.to_csv(result_file, index=False)

    return df
def replace_last(source_string, old_string, new_string):
    """Replace the last occurrence of ``old_string`` in ``source_string``.

    Args:
        source_string: Text to search.
        old_string: Substring whose last occurrence is replaced.
        new_string: Replacement text.

    Returns:
        The text with the last occurrence replaced, or ``source_string``
        unchanged when ``old_string`` does not occur in it.
    """
    head, sep, tail = source_string.rpartition(old_string)
    if not sep:
        # rpartition() returns ("", "", source_string) on a miss; joining the
        # parts would wrongly prepend new_string to the untouched input.
        return source_string
    return head + new_string + tail
def load_for_repetition_penalty(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load the result CSV belonging to one repetition-penalty run.

    The per-run file name is derived from ``csv_result_file`` by replacing its
    last ``.csv`` with ``_RP_<penalty>.csv`` (penalty formatted with three
    decimals).

    Args:
        csv_result_file: Base result file path.
        repetition_penalty: Repetition penalty value identifying the run.
        force_recalculate: Forwarded to the loader to recompute the scores.

    Returns:
        The DataFrame produced by ``load_with_newline_and_repetition_scores``.
    """
    suffix = f"_RP_{repetition_penalty:.3f}.csv"
    result_file = replace_last(csv_result_file, ".csv", suffix)
    return load_with_newline_and_repetition_scores(
        result_file, force_recalculate=force_recalculate
    )
def calc_adjusted_performance(f, r):
    """Scale score ``f`` down according to repetition count ``r``.

    The divisor is ``log10(10 + r)``: it equals 1 when ``r`` is 0 (score
    unchanged) and grows as ``r`` grows.

    Args:
        f: Raw score.
        r: Repetition count; must be greater than -10 for the logarithm to be
            defined.

    Returns:
        ``f / log10(10 + r)``.
    """
    return f / math.log10(10 + r)
def calculate_adjusted_performance(row):
    """Compute repetition-adjusted precision and recall for one row.

    Args:
        row: Mapping-like row providing ``total_repetitions``, ``precision``
            and ``recall``.

    Returns:
        A two-element Series: adjusted precision, then adjusted recall.
    """
    repetitions = row["total_repetitions"]
    return pd.Series(
        [
            calc_adjusted_performance(row["precision"], repetitions),
            calc_adjusted_performance(row["recall"], repetitions),
        ]
    )
def load_performance_df(csv_result_file, repetition_penalty):
    """Load the evaluated-performance JSON file for one repetition-penalty run.

    The JSON path is derived from ``csv_result_file``: its last ``.csv`` is
    replaced with ``_RP_<penalty>-t2_evaluated.json`` and every ``/results/``
    path segment is replaced with ``/eval/``.

    Args:
        csv_result_file: Base result CSV path.
        repetition_penalty: Repetition penalty value identifying the run.

    Returns:
        The DataFrame read from the JSON file.
    """
    json_suffix = f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
    result_file = replace_last(csv_result_file, ".csv", json_suffix)
    result_file = result_file.replace("/results/", "/eval/")

    print(f"loading json file: {result_file}")
    return pd.read_json(result_file)
def calculate_performance_score_v1(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Merge evaluated precision/recall/F1 into an existing per-run result CSV.

    The scores are (re)computed when ``force_recalculate`` is set, when the
    file still has an ``f2`` column, or when it has no ``f1`` column. The
    per-run CSV must already contain ``id`` and ``total_repetitions`` columns.

    Args:
        csv_result_file: Base result CSV path.
        repetition_penalty: Repetition penalty value identifying the run.
        force_recalculate: Recompute the scores even if they are present.

    Returns:
        The per-run DataFrame (rewritten to disk when scores were computed).
    """
    result_file = replace_last(
        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
    )
    print(f"loading result file: {result_file}")
    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")

    needs_update = force_recalculate or "f2" in df.columns or "f1" not in df.columns
    if needs_update:
        # Start from a clean slate: drop any stale score columns.
        df.drop(
            columns=[
                "precision",
                "recall",
                "f1",
                "f2",
                "entities_in_answer",
                "entities_in_question",
            ],
            errors="ignore",
            inplace=True,
        )

        perf_df = load_performance_df(csv_result_file, repetition_penalty)
        # Keep only evaluated rows whose id appears in the result file, then
        # renumber from 0 so the column assignments below line up by index.
        # NOTE(review): this assumes both frames list ids in the same order.
        perf_df = perf_df[perf_df["id"].isin(df["id"])].reset_index(drop=True)
        print(f"perf_df len: {len(perf_df)}")

        df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
        # Each "score" entry is indexed as [precision, recall, f1].
        df["precision"] = perf_df["score"].apply(lambda x: x[0])
        df["recall"] = perf_df["score"].apply(lambda x: x[1])
        df["f1"] = perf_df["score"].apply(lambda x: x[2])

        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
            calculate_adjusted_performance, axis=1
        )

        df.to_csv(result_file, index=False)
        print(f"performance scores saved to result file: {result_file}")

    print(f"df len: {len(df)}")
    return df
# Reference result set, read at import time; calculate_performance_score()
# uses its "id" column to filter and sanity-check the evaluated rows.
ref_df = pd.read_csv(
    "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
)
def calculate_performance_score(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Build or refresh the per-run result CSV from the evaluated JSON data.

    If the per-run CSV does not exist, an empty DataFrame is used and
    recalculation is forced. Scores are also recomputed when the file still
    has an ``f2`` column or has no ``f1`` column. Only evaluated rows whose
    ``id`` appears in the module-level ``ref_df`` are used.

    Args:
        csv_result_file: Base result CSV path.
        repetition_penalty: Repetition penalty value identifying the run.
        force_recalculate: Recompute everything even if scores are present.

    Returns:
        The per-run DataFrame (written to disk when it was recomputed).
    """
    result_file = replace_last(
        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
    )

    if os.path.exists(result_file):
        print(f"loading result file: {result_file}")
        df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
    else:
        print(f"re-creating result file: {result_file}")
        df = pd.DataFrame()
        force_recalculate = True

    if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
        # Drop stale derived columns before rebuilding them.
        df.drop(
            columns=[
                "precision",
                "recall",
                "f1",
                "f2",
                "entities_in_answer",
                "entities_in_question",
                "word_count",
            ],
            errors="ignore",
            inplace=True,
        )

        perf_df = load_performance_df(csv_result_file, repetition_penalty)
        # Restrict to reference ids and renumber from 0 so the column
        # assignments below line up by index.
        perf_df = perf_df[perf_df["id"].isin(ref_df["id"])].reset_index(drop=True)
        print(f"perf_df len: {len(perf_df)}")

        if len(perf_df) != len(ref_df):
            print(f"error: len(perf_df) != {len(ref_df)}")
            # Build the lookup set once instead of re-scanning per reference id.
            evaluated_ids = set(perf_df["id"].unique())
            missing_ids = [
                ref_id
                for ref_id in ref_df["id"].unique()
                if ref_id not in evaluated_ids
            ]
            print(f"missing_ids: {missing_ids}")

        df["id"] = perf_df["id"]
        df["question"] = perf_df["question"]
        df["answer"] = perf_df["pred_answer"]
        # Non-string answers (e.g. missing values) count as zero words.
        df["word_count"] = df["answer"].apply(
            lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
        )
        df["ground_truth"] = perf_df["ground_truth"]

        df[["newline_score", "repetition_score", "total_repetitions"]] = df[
            "answer"
        ].apply(detect_scores)

        df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
        # Each "score" entry is indexed as [precision, recall, f1].
        df["precision"] = perf_df["score"].apply(lambda x: x[0])
        df["recall"] = perf_df["score"].apply(lambda x: x[1])
        df["f1"] = perf_df["score"].apply(lambda x: x[2])

        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
            calculate_adjusted_performance, axis=1
        )

        df.to_csv(result_file, index=False)
        print(f"performance scores saved to result file: {result_file}")

    print(f"df len: {len(df)}")
    return df
def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
    """Scale per-run precision and recall down by each run's mean repetitions.

    For every run the divisor is ``log10(10 + n + r)`` where ``n`` and ``r``
    are the means of that run's ``newline_score`` and ``repetition_score``
    columns.

    Args:
        result: Mapping with a ``df_list_repetition_penalty`` entry: a
            sequence of DataFrames, one per run.
        precision: Per-run precision values, in the same order as the runs.
        recall: Per-run recall values, in the same order as the runs.

    Returns:
        ``(precision, recall)`` as two new lists of adjusted values. Each list
        is as long as the shorter of its input and the run list.
    """
    # Compute each run's divisor once and reuse it for both metrics.
    divisors = [
        math.log10(10 + df["newline_score"].mean() + df["repetition_score"].mean())
        for df in result["df_list_repetition_penalty"]
    ]
    adjusted_precision = [p / d for p, d in zip(precision, divisors)]
    adjusted_recall = [r / d for r, d in zip(recall, divisors)]
    return adjusted_precision, adjusted_recall
def plot_performance_scores(
    result,
    models=None,
    title="Performance",
):
    """Plot F1 and repetition-adjusted F1 against repetition penalty per model.

    One figure is shown per model. The penalties with the best F1 and the best
    adjusted F1 ("RAP - F1") are highlighted with shaded vertical bands.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with a ``repetition_penalty`` column) and
            ``df_list_repetition_penalty`` (per-run DataFrames with
            ``precision``, ``recall``, ``newline_score`` and
            ``repetition_score`` columns).
        models: Model names to plot; defaults to every key of ``result``.
        title: Text appended to the model name in each figure title.
    """
    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        runs = result[model]["df_list_repetition_penalty"]
        repetition_penalties = list(result[model]["df_overall"]["repetition_penalty"])

        # Mean precision / recall per run, combined into a harmonic mean.
        precision = [run["precision"].mean() for run in runs]
        recall = [run["recall"].mean() for run in runs]
        # Guard p + r == 0 so an all-zero run scores 0 instead of failing.
        f1 = [
            2 * (p * r) / (p + r) if p + r > 0 else 0
            for p, r in zip(precision, recall)
        ]
        best_f1_index = f1.index(max(f1))

        precision, recall = adjust_perf_scores_with_repetition_penalty(
            result[model], precision, recall
        )
        afrp = [
            2 * (p * r) / (p + r) if p + r > 0 else 0
            for p, r in zip(precision, recall)
        ]
        best_afrp_index = afrp.index(max(afrp))

        plt.figure(figsize=(10, 6))

        # Shade a narrow band around the best penalty of each curve.
        for best_index, band_color in (
            (best_f1_index, "blue"),
            (best_afrp_index, "orange"),
        ):
            plt.axvspan(
                repetition_penalties[best_index] - 0.01,
                repetition_penalties[best_index] + 0.01,
                alpha=0.5,
                edgecolor="none",
                facecolor=band_color,
            )

        plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
        plt.plot(
            repetition_penalties,
            afrp,
            label="RAP - F1",
            marker="o",
            color="orange",
        )

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        # Scores are fractions of 1; show them as percentages.
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

        plt.show()
def plot_best_afrp(
    result,
    models=None,
    title="Models with Best RAP - F1",
    ref_result=None,
):
    """Bar-plot each model's best repetition-adjusted F1 next to its plain F1.

    For every model the run with the highest adjusted F1
    (``F1 / log10(10 + mean newline_score + mean repetition_score)``) is
    selected; the F1 shown is the F1 of that same run.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with a ``repetition_penalty`` column) and
            ``df_list_repetition_penalty`` (per-run DataFrames with
            ``precision``, ``recall``, ``newline_score`` and
            ``repetition_score`` columns).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping from reference model name to a CSV path
            with ``precision`` and ``recall`` columns. Reference models get
            the same value for both bars and a repetition total of 0.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the summed mean newline and
        repetition scores of the selected run.
    """
    model_names = []
    best_f1 = []
    best_afrp = []
    best_mtr = []

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        overall = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        precision = [run["precision"].mean() for run in runs]
        recall = [run["recall"].mean() for run in runs]
        # Guard p + r == 0 so an all-zero run scores 0 instead of failing.
        f1 = [
            2 * (p * r) / (p + r) if p + r > 0 else 0
            for p, r in zip(precision, recall)
        ]

        newline_score = [run["newline_score"].mean() for run in runs]
        repetition_score = [run["repetition_score"].mean() for run in runs]

        afrp = [
            f / math.log10(10 + n + r)
            for f, n, r in zip(f1, newline_score, repetition_score)
        ]

        best_index = afrp.index(max(afrp))
        best_afrp.append(afrp[best_index])
        best_f1.append(f1[best_index])
        best_mtr.append(newline_score[best_index] + repetition_score[best_index])

        # best_index is a list position, so select the penalty by position too.
        best_penalty = overall["repetition_penalty"].iloc[best_index]
        model_names.append(f"{model} (RP={best_penalty})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            df = pd.read_csv(ref_result[model])

            p = df["precision"].mean()
            r = df["recall"].mean()
            f1 = 2 * p * r / (p + r) if p + r > 0 else 0

            best_f1.append(f1)
            best_afrp.append(f1)
            best_mtr.append(0)

    print("model_names:", model_names)

    data = pd.DataFrame(
        {
            "Model": model_names,
            "RAP - F1": best_afrp,
            "F1": best_f1,
        }
    )

    # Long format, then back to wide with one row per metric and one column
    # per model, so each metric becomes a group of bars.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # Keep columns in model order and rows in the order RAP - F1, F1.
    data_pivoted = data_pivoted[model_names]
    data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])

    plt.figure(figsize=(15, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

    plt.xticks(rotation=0)
    # Scores are fractions of 1; show them as percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest bar for the value labels.
    max_value = max(best_afrp + best_f1) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (in percent) vertically above the bar.
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
def plot_best_performance(
    result,
    models=None,
    title="Models with Best F1 Score",
    adjusted_f1=False,
    ref_result=None,
):
    """Bar-plot precision, recall and F1 of each model's best-F1 run.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with a ``repetition_penalty`` column) and
            ``df_list_repetition_penalty`` (per-run DataFrames with
            ``precision``, ``recall``, ``newline_score`` and
            ``repetition_score`` columns).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        adjusted_f1: When true, precision and recall are first scaled with
            ``adjust_perf_scores_with_repetition_penalty`` and the metric
            labels change to the "Adjusted ... with RP" variants.
        ref_result: Optional mapping from reference model name to a CSV path
            with ``precision`` and ``recall`` columns; reference models are
            plotted unadjusted with a repetition total of 0.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the summed mean newline and
        repetition scores of the selected run.
    """
    model_names = []
    best_precision = []
    best_recall = []
    best_f1 = []
    best_repetition_penalty = []
    best_mtr = []

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        overall = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        precision = [run["precision"].mean() for run in runs]
        recall = [run["recall"].mean() for run in runs]
        newline_score = [run["newline_score"].mean() for run in runs]
        repetition_score = [run["repetition_score"].mean() for run in runs]

        if adjusted_f1:
            precision, recall = adjust_perf_scores_with_repetition_penalty(
                result[model], precision, recall
            )

        # Guard p + r == 0 so an all-zero run scores 0 instead of failing.
        f1 = [
            2 * (p * r) / (p + r) if p + r > 0 else 0
            for p, r in zip(precision, recall)
        ]

        best_f1_index = f1.index(max(f1))
        best_f1.append(f1[best_f1_index])
        # best_f1_index is a list position, so select the penalty by position.
        best_repetition_penalty.append(
            overall["repetition_penalty"].iloc[best_f1_index]
        )
        best_precision.append(precision[best_f1_index])
        best_recall.append(recall[best_f1_index])
        best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])

        print(
            f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
        )

        df = runs[best_f1_index]
        model_names.append(f"{model} (RP={best_repetition_penalty[-1]})")

        # Report the raw repetition totals of the selected run.
        print(
            f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
        )

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            df = pd.read_csv(ref_result[model])

            p = df["precision"].mean()
            r = df["recall"].mean()
            best_precision.append(p)
            best_recall.append(r)
            best_f1.append(2 * (p * r) / (p + r) if p + r > 0 else 0)
            best_mtr.append(0)

    if adjusted_f1:
        metric_labels = [
            "Adjusted Precision with RP",
            "Adjusted Recall with RP",
            "Adjusted F1 with RP",
        ]
    else:
        metric_labels = ["Precision", "Recall", "F1"]

    data = pd.DataFrame(
        {
            "Model": model_names,
            metric_labels[0]: best_precision,
            metric_labels[1]: best_recall,
            metric_labels[2]: best_f1,
        }
    )

    # Long format, then back to wide with one row per metric and one column
    # per model, so each metric becomes a group of bars.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # Keep columns in model order and rows in precision, recall, F1 order.
    data_pivoted = data_pivoted[model_names]
    data_pivoted = data_pivoted.reindex(metric_labels)

    plt.figure(figsize=(10, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

    plt.xticks(rotation=0)
    # Scores are fractions of 1; show them as percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest bar for the value labels.
    max_value = max(best_precision + best_recall + best_f1) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (in percent) vertically above the bar.
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
def plot_best_performance_ms_macro(
    result,
    models=None,
    title="Models with Best RAP - Performance",
    ref_result=None,
    skip_generic_prompt=False,
    include_adjusted_performance=True,
):
    """Bar-plot each model's best run using BLEU-1 / ROUGE-L based scores.

    The overall score of a run is the harmonic mean of its ``bleu1`` and
    ``rougeL`` values from ``df_overall``. The adjusted score divides it by
    ``log10(10 + mean newline_score + mean repetition_score)`` of the run.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with ``bleu1``, ``rougeL`` and ``repetition_penalty``
            columns) and ``df_list_repetition_penalty`` (per-run DataFrames
            with ``newline_score`` and ``repetition_score`` columns).
        models: Model names to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping from reference model name to a CSV path
            whose first row provides ``bleu1`` and ``rougeL``.
        skip_generic_prompt: Skip models whose name contains
            ``"generic prompt"``.
        include_adjusted_performance: When true, pick the best run by the
            adjusted score and plot adjusted vs. overall score; otherwise pick
            by the overall score and plot BLEU-1, ROUGE-L and overall score.

    Returns:
        ``(data_pivoted, best_mtr)``: the plotted table (metrics as rows,
        models as columns) and, per model, the summed mean newline and
        repetition scores of the selected run.
    """
    model_names = []
    best_f1 = []
    best_afrp = []
    best_bleu1 = []
    best_rougeL = []
    best_mtr = []

    if models is None:
        models = result.keys()

    for model in models:
        if skip_generic_prompt and "generic prompt" in model:
            continue
        print(f"model: {model}")
        overall = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        bleu1 = list(overall["bleu1"])
        rougeL = list(overall["rougeL"])
        # Guard p + r == 0 so an all-zero run scores 0 instead of failing.
        f1 = [
            2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)
        ]

        newline_score = [run["newline_score"].mean() for run in runs]
        repetition_score = [run["repetition_score"].mean() for run in runs]

        afrp = [
            f / math.log10(10 + n + r)
            for f, n, r in zip(f1, newline_score, repetition_score)
        ]

        # The ranking list decides which run counts as "best".
        ranking = afrp if include_adjusted_performance else f1
        best_index = ranking.index(max(ranking))

        best_afrp.append(ranking[best_index])
        best_f1.append(f1[best_index])
        best_bleu1.append(bleu1[best_index])
        best_rougeL.append(rougeL[best_index])
        best_mtr.append(newline_score[best_index] + repetition_score[best_index])

        # best_index is a list position, so select the penalty by position too.
        best_penalty = overall["repetition_penalty"].iloc[best_index]
        model_names.append(f"{model} (RP={best_penalty})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for model in ref_result.keys():
            model_names.append(model)
            df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")

            p = df["bleu1"][0]
            r = df["rougeL"][0]
            f1 = 2 * p * r / (p + r) if p + r > 0 else 0

            best_bleu1.append(p)
            best_rougeL.append(r)
            best_f1.append(f1)
            best_afrp.append(f1)
            best_mtr.append(0)

    if include_adjusted_performance:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "RAP - Perf Score": best_afrp,
                "Overall Perf Score": best_f1,
            }
        )
    else:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "Bleu-1": best_bleu1,
                "Rouge-L": best_rougeL,
                "Overall Perf Score": best_f1,
            }
        )

    # Long format, then back to wide with one row per metric and one column
    # per model, so each metric becomes a group of bars.
    data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")

    # Keep columns in model order and rows in the DataFrame's metric order.
    data_pivoted = data_pivoted[model_names]
    data_pivoted = data_pivoted.reindex(list(data.columns)[1:])

    plt.figure(figsize=(10, 6))
    ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

    plt.xticks(rotation=0)
    # Scores are fractions of 1; show them as percentages.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest plotted bar for the value labels.
    a1 = max(best_afrp)
    a2 = max(best_f1)
    a3 = max(best_bleu1)
    a4 = max(best_rougeL)
    plotted_maxima = [a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]
    max_value = max(plotted_maxima) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (in percent) vertically above the bar.
    for p in ax.patches:
        ax.annotate(
            f"{p.get_height() * 100:.1f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
    return data_pivoted, best_mtr
# Names of the open-source models referenced by the result files below.
all_open_source_models = [
    "gemma-1.1-2b-it",
    "Phi-3-mini-128k-instruct",
    "gemma-1.1-7b-it",
    "Llama-2-7b-chat-hf",
    "Mistral-7B-Instruct-v0.2",
    "Meta-Llama-3-8B-Instruct",
    "Llama-2-13b-chat-hf",
    "Llama-2-70b-chat-hf",
    "Meta-Llama-3-70B-Instruct",
]

# Result CSV files of the non-RAG runs; the trailing comment names the model.
non_rag_csv_result_files = [
    "./data/results/gemma-1.1-2b-it_wd_non_rag.csv",  # gemma-1.1-2b-it
    "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv",  # Phi-3-mini-128k-instruct(batch size:16)
    "./data/results/gemma-1.1-7b-it_wd_non_rag.csv",  # gemma-1.1-7b-it
    "./data/results/Tune_2024-04-09_09-19-22.csv",  # Llama-2-7b-chat-hf
    # NOTE(review): doubled ".csv.csv" extension - confirm it matches the file on disk.
    "./data/results/Tune_2024-04-16_12-24-27.csv.csv",  # Mistral-7B-Instruct-v0.2
    "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv",  # Meta-Llama-3-8B-Instruct
    "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv",  # Meta-Llama-3-8B-Instruct
    "./data/results/Tune_2024-04-10_16-53-38.csv",  # Llama-2-13b-chat-hf
    "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv",  # Llama-2-70b-chat-hf
    "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv",  # Meta-Llama-3-70B-Instruct
]

# Result CSV files of the RAG runs; files of the same model are adjacent.
rag_csv_result_files = [
    "./data/results/gemma-1.1-2b-it_wd.csv",  # gemma-1.1-2b-it
    "./data/results/gemma-1.1-2b-it_wd_true.csv",  # gemma-1.1-2b-it(true)
    "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv",  # Phi-3-mini-128k-instruct (file name says batch 4 - TODO confirm batch size)
    "./data/results/Phi-3-mini-128k-instruct_wd_true.csv",  # Phi-3-mini-128k-instruct(true)
    "./data/results/gemma-1.1-7b-it_wd.csv",  # gemma-1.1-7b-it
    "./data/results/gemma-1.1-7b-it_wd_true.csv",  # gemma-1.1-7b-it(true)
    "./data/results/Tune_2024-03-20_15-35-37.csv",  # Llama-2-7b-chat-hf
    "./data/results/Llama-2-7b-chat-hf_wd_true.csv",  # Llama-2-7b-chat-hf(true)
    "./data/results/Tune_2024-03-29_11-28-20.csv",  # Mistral-7B-Instruct-v0.2
    "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv",  # Mistral-7B-Instruct-v0.2(true)
    "./data/results/Meta-Llama-3-8B-Instruct_wd.csv",  # Meta-Llama-3-8b-instruct
    "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv",  # Meta-Llama-3-8b-instruct(true)
    "./data/results/Tune_2024-03-25_23-32-57.csv",  # Llama-2-13b-chat-hf
    "./data/results/Llama-2-13b-chat-hf_wd_true.csv",  # Llama-2-13b-chat-hf(true)
    "./data/results/Llama-2-70b-chat-hf_wd.csv",  # Llama-2-70b-chat-hf
    "./data/results/Llama-2-70b-chat-hf_wd_true.csv",  # Llama-2-70b-chat-hf(true)
    "./data/results/Meta-Llama-3-70B-Instruct_wd.csv",  # Meta-Llama-3-70B-Instruct
    "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv",  # Meta-Llama-3-70B-Instruct(true)
]
# Dataset read at import time; load_for_repetition_penalty_ms_macro() uses its
# "id" and "wellFormedAnswers" columns.
df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
def load_for_repetition_penalty_ms_macro(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load one repetition-penalty run and reconcile it with ``df_ms_macro``.

    After loading (and scoring) the per-run CSV, a row-count mismatch against
    the module-level ``df_ms_macro`` is reported together with the missing
    ids. If the first ``ground_truth`` value differs from the string form of
    the dataset's first ``wellFormedAnswers`` value, the whole
    ``ground_truth`` column is replaced from the dataset and the CSV is
    rewritten.

    Args:
        csv_result_file: Base result CSV path.
        repetition_penalty: Repetition penalty value identifying the run.
        force_recalculate: Forwarded to the loader to recompute the scores.

    Returns:
        The per-run DataFrame.
    """
    result_file = replace_last(
        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
    )
    df = load_with_newline_and_repetition_scores(
        result_file, force_recalculate=force_recalculate
    )

    if len(df) != len(df_ms_macro):
        print(f"error: len(df) != {len(df_ms_macro)}")
        # Build the lookup set once instead of re-scanning per dataset id.
        loaded_ids = set(df["id"].unique())
        missing_ids = [
            dataset_id
            for dataset_id in df_ms_macro["id"].unique()
            if dataset_id not in loaded_ids
        ]
        print(f"missing_ids: {missing_ids}")

    # Only the first row is compared to decide whether the column is stale.
    if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
        df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
        print("ground_truth updated for:", result_file)
        df.to_csv(result_file, index=False)

    return df
# MS MARCO
def plot_performance_scores_ms_macro(
    result,
    models=None,
    title="Performance",
):
    """Plot overall and repetition-adjusted scores against repetition penalty.

    The overall score of a run is the harmonic mean of its ``bleu1`` and
    ``rougeL`` values; the adjusted score applies
    ``adjust_perf_scores_with_repetition_penalty`` to both inputs first. One
    figure is shown per model, with the best penalty of each curve shaded.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with ``bleu1``, ``rougeL`` and ``repetition_penalty``
            columns) and ``df_list_repetition_penalty`` (per-run DataFrames
            with ``newline_score`` and ``repetition_score`` columns).
        models: Model names to plot; defaults to every key of ``result``.
        title: Text appended to the model name in each figure title.
    """
    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        overall = result[model]["df_overall"]
        repetition_penalties = list(overall["repetition_penalty"])

        bleu1 = list(overall["bleu1"])
        rougeL = list(overall["rougeL"])
        # Guard p + r == 0 so an all-zero run scores 0 instead of failing.
        f1 = [
            2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)
        ]
        best_f1_index = f1.index(max(f1))

        bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
            result[model], bleu1, rougeL
        )
        afrp = [
            2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)
        ]
        best_afrp_index = afrp.index(max(afrp))

        plt.figure(figsize=(10, 6))

        # Shade a narrow band around the best penalty of each curve.
        for best_index, band_color in (
            (best_f1_index, "blue"),
            (best_afrp_index, "orange"),
        ):
            plt.axvspan(
                repetition_penalties[best_index] - 0.01,
                repetition_penalties[best_index] + 0.01,
                alpha=0.5,
                edgecolor="none",
                facecolor=band_color,
            )

        plt.plot(
            repetition_penalties,
            f1,
            label="Overall Perf Score",
            marker="D",
            color="blue",
        )
        plt.plot(
            repetition_penalties,
            afrp,
            label="RAP - Perf Score",
            marker="o",
            color="orange",
        )

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        # Scores are fractions of 1; show them as percentages.
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

        plt.show()
def plot_repetition_factors(result, groups):
    """Plot mean total repetitions vs. repetition penalty, one figure per group.

    A model belongs to a group when the group string occurs in the
    lower-cased model name.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with a ``repetition_penalty`` column) and
            ``df_list_repetition_penalty`` (per-run DataFrames with a
            ``total_repetitions`` column).
        groups: Iterable of substrings to match against lower-cased model
            names.
    """
    for group in groups:
        plt.figure(figsize=(10, 6))
        max_value = 0

        for model in result.keys():
            if group not in model.lower():
                continue
            print(f"model: {model}")

            repetition_penalties = list(
                result[model]["df_overall"]["repetition_penalty"]
            )
            mean_score = [
                run["total_repetitions"].mean()
                for run in result[model]["df_list_repetition_penalty"]
            ]

            sns.lineplot(x=repetition_penalties, y=mean_score, label=model)

            # Track the tallest curve so the y-axis can be scaled to fit.
            max_value = max(max_value, max(mean_score))

        # Leave 5% headroom above the highest point.
        plt.ylim(0, max_value * 1.05)
        plt.grid(True)

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Mean Total Repetitions")
        plt.title("Mean Total Repetitions vs Repetition Penalties")
        plt.legend()

        plt.show()
def plot_repetition_factors_by_group(result, group_filter=None):
    """Plot mean total repetitions vs. repetition penalty in a single figure.

    Args:
        result: Mapping from model name to a mapping with ``df_overall`` (a
            DataFrame with a ``repetition_penalty`` column) and
            ``df_list_repetition_penalty`` (per-run DataFrames with a
            ``total_repetitions`` column).
        group_filter: When given, only models whose name contains this
            substring (case-sensitive) are plotted.
    """
    markers = ["D", "o", "s", "x"]
    colors = ["blue", "orange", "green", "red"]

    plt.figure(figsize=(10, 6))
    index = 0
    max_value = 0

    for model in result.keys():
        if group_filter is not None and group_filter not in model:
            continue
        print(f"model: {model}")

        repetition_penalties = list(result[model]["df_overall"]["repetition_penalty"])
        mean_score = [
            run["total_repetitions"].mean()
            for run in result[model]["df_list_repetition_penalty"]
        ]

        # x and y must pair up one-to-one; skip models whose data is incomplete.
        if len(mean_score) != len(repetition_penalties):
            print(
                f"model: {model} has different length of repetition penalties and mean score"
            )
            print("repetition_panelties:", len(repetition_penalties))
            print("mean_score:", len(mean_score))
            continue

        # Track the tallest curve so the y-axis can be scaled to fit.
        max_value = max(max_value, max(mean_score))

        # Cycle through the style lists so more than four models can be drawn.
        sns.lineplot(
            x=repetition_penalties,
            y=mean_score,
            label=model,
            marker=markers[index % len(markers)],
            color=colors[index % len(colors)],
        )
        index += 1

    # Leave 5% headroom above the highest point.
    plt.ylim(0, max_value * 1.05)

    plt.xlabel("Repetition Penalties")
    plt.ylabel("Mean Total Repetitions")
    plt.title("Mean Total Repetitions vs Repetition Penalties")
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")

    plt.show()
# MS MARCO result CSV files, three consecutive entries per model.
ms_marco_csv_result_files = [
    "data/results/gemma-1.1-2b-it_mm_true_false.csv",
    "data/results/gemma-1.1-2b-it_mm_true.csv",
    "data/results/gemma-1.1-2b-it_mm_true_false_non_rag.csv",
    "data/results/Phi-3-mini-128k-instruct_mm_false.csv",
    "data/results/Phi-3-mini-128k-instruct_mm_true.csv",
    "data/results/Phi-3-mini-128k-instruct_mm_non_rag.csv",
    "data/results/gemma-1.1-7b-it_mm_false.csv",
    "data/results/gemma-1.1-7b-it_mm_true.csv",
    "data/results/gemma-1.1-7b-it_mm_non_rag.csv",
    "data/results/Llama-2-7b-chat-hf_mm_true_false.csv",
    "data/results/Llama-2-7b-chat-hf_mm_true.csv",
    "data/results/Llama-2-7b-chat-hf_mm_true_false_non_rag.csv",
    "data/results/Mistral-7B-Instruct-v0.2_mm_false.csv",
    "data/results/Mistral-7B-Instruct-v0.2_mm_true.csv",
    "data/results/Mistral-7B-Instruct-v0.2_mm_non_rag.csv",
    "data/results/Meta-Llama-3-8B-Instruct_mm_true_false.csv",
    "data/results/Meta-Llama-3-8B-Instruct_mm_true.csv",
    "data/results/Meta-Llama-3-8B-Instruct_mm_true_false_non_rag.csv",
    "data/results/Llama-2-13b-chat-hf_mm_false.csv",
    "data/results/Llama-2-13b-chat-hf_mm_true.csv",
    "data/results/Llama-2-13b-chat-hf_mm_non_rag.csv",
    "data/results/Llama-2-70b-chat-hf_mm_false.csv",
    "data/results/Llama-2-70b-chat-hf_mm_true.csv",
    "data/results/Llama-2-70b-chat-hf_mm_non_rag.csv",
    "data/results/Meta-Llama-3-70B-Instruct_mm_false.csv",
    "data/results/Meta-Llama-3-70B-Instruct_mm_true.csv",
    "data/results/Meta-Llama-3-70B-Instruct_mm_non_rag.csv",
]

# Accumulators filled by add_file(): the accepted result files and, per model
# name, how many files were recorded.
webqsp_csv_result_files = []
webqsp_model_result_counts = {}
def find_model_name(file_path):
    """Return the value of the ``model`` column in the first row of a CSV.

    Lines starting with ``#`` are treated as comments and malformed lines
    only produce a warning (both handled by ``pandas.read_csv``).
    """
    frame = pd.read_csv(file_path, comment="#", on_bad_lines="warn")
    model_column = frame["model"]
    # Label 0 is the first data row under the default index from read_csv.
    return model_column[0]
def add_file(file):
    """Record ``file`` as a WebQSP result unless it is a generic-prompt run.

    Files whose model name contains ``"(generic prompt)"`` are ignored.
    Accepted files are appended to ``webqsp_csv_result_files`` and counted
    per model name in ``webqsp_model_result_counts``.
    """
    name = find_model_name(file)
    if "(generic prompt)" in name:
        return

    webqsp_csv_result_files.append(file)
    webqsp_model_result_counts[name] = webqsp_model_result_counts.get(name, 0) + 1
# Merge the RAG and non-RAG file lists into the WebQSP list.
# Whenever the RAG model name changes, the pending non-RAG files whose model
# name starts with the previous RAG model name are added first, so each
# model's non-RAG results follow its RAG results.
last_model_name = None
non_rag_index = 0
for csv_result_file in rag_csv_result_files:
    try:
        model_name = find_model_name(csv_result_file)
        if last_model_name is not None and model_name != last_model_name:
            while non_rag_index < len(non_rag_csv_result_files):
                file = non_rag_csv_result_files[non_rag_index]
                non_model_name = find_model_name(file)
                if not non_model_name.startswith(last_model_name):
                    # First non-RAG file of another model: stop here.
                    break
                add_file(file)
                non_rag_index += 1
        add_file(csv_result_file)
        last_model_name = model_name
    except FileNotFoundError as e:
        # A missing file is reported and the loop moves to the next RAG file.
        print("\terror processing file: ", csv_result_file, e)

# Any non-RAG files not consumed above are appended at the end.
for file in non_rag_csv_result_files[non_rag_index:]:
    add_file(file)
def calc_rap_scores(result, precision="precision", recall="recall"):
    """Compute repetition scores, F1 and RAP for each repetition penalty.

    Args:
        result: dict with ``"df_list_repetition_penalty"`` (a list of
            per-penalty DataFrames, each with ``newline_score`` and
            ``repetition_score`` columns) and ``"df_overall"`` (a DataFrame
            with one row per penalty).
        precision: name of the column used as precision.
        recall: name of the column used as recall.

    The precision/recall columns are averaged from the per-penalty
    DataFrames when the first of them has the ``precision`` column;
    otherwise they are read directly from ``df_overall``.

    Returns:
        Tuple ``(newline_score, repetition_score, f1, rap)`` of lists, where
        ``rap = f1 / log10(10 + newline_score + repetition_score)``.
    """
    dfs = result["df_list_repetition_penalty"]
    newline_score = [df["newline_score"].mean() for df in dfs]
    repetition_score = [df["repetition_score"].mean() for df in dfs]

    # `dfs and ...` avoids an IndexError when there are no per-penalty frames.
    if dfs and precision in dfs[0].columns:
        precision_values = [df[precision].mean() for df in dfs]
        recall_values = [df[recall].mean() for df in dfs]
    else:
        precision_values = result["df_overall"][precision]
        recall_values = result["df_overall"][recall]

    # F1 is defined as 0 when precision + recall is 0. Without the guard this
    # raises ZeroDivisionError for Python floats and yields NaN for numpy
    # floats.
    f1 = [
        2 * (p * r) / (p + r) if (p + r) != 0 else 0.0
        for p, r in zip(precision_values, recall_values)
    ]
    rap = [
        f / math.log10(10 + n + r)
        for f, n, r in zip(f1, newline_score, repetition_score)
    ]
    return newline_score, repetition_score, f1, rap
def load_webqsp_result(csv_result_files, force_recalculate=False):
    """Load WebQSP result CSVs and attach repetition/performance metrics.

    For every file, the overall CSV is read, one per-penalty DataFrame is
    obtained from ``calculate_performance_score`` for each value in its
    ``repetition_penalty`` column, and the overall DataFrame gains the
    columns ``newline_score``, ``repetition_score``, ``total_repetitions``,
    ``answer_len``, ``perf`` and ``rap``.

    Args:
        csv_result_files: iterable of CSV paths.
        force_recalculate: forwarded to ``calculate_performance_score``.

    Returns:
        dict mapping ``"<model><variant label>"`` to a dict with keys
        ``"df_overall"``, ``"df_list_repetition_penalty"`` and ``"file"``.
        Loading is best-effort: a file that fails is reported on stdout and
        left out of the result.
    """
    model_name_exts = {
        "true": "(RAG - Chat Template)",
        "wd": "(RAG - Generic Prompt)",
        "rag": "(Non-RAG)",
    }
    result = {}
    for csv_result_file in csv_result_files:
        try:
            df = pd.read_csv(csv_result_file)

            # The variant is the token just before the file extension when
            # it is a known key; otherwise it depends on whether the file is
            # listed among the non-RAG results.
            parts = re.split(r"[_\.]", csv_result_file)
            if parts[-2] in model_name_exts:
                key = parts[-2]
            elif csv_result_file in non_rag_csv_result_files:
                key = "rag"
            else:
                key = "wd"
            model_name = f'{df["model"][0]}{model_name_exts[key]}'

            dfs = [
                calculate_performance_score(
                    csv_result_file,
                    repetition_penalty,
                    force_recalculate=force_recalculate,
                )
                for repetition_penalty in df["repetition_penalty"]
            ]

            answer_lens = []
            for df_rpp in dfs:
                # Non-string answers (e.g. NaN) count as length 0.
                df_rpp["answer_len"] = df_rpp["answer"].apply(
                    lambda x: len(x) if isinstance(x, str) else 0
                )
                answer_lens.append(df_rpp["answer_len"].mean())

            entry = {
                "df_overall": df,
                "df_list_repetition_penalty": dfs,
                "file": csv_result_file,
            }
            newline_score, repetition_score, perf, rap = calc_rap_scores(entry)
            df["newline_score"] = newline_score
            df["repetition_score"] = repetition_score
            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
            df["answer_len"] = answer_lens
            df["perf"] = perf
            df["rap"] = rap

            # Publish only fully populated entries, so a failure above never
            # leaves an entry without the metric columns in the result.
            result[model_name] = entry
        except Exception as e:
            print(f"Error: {e}")
    return result
def load_ms_marco_result(csv_result_files, force_recalculate=False):
    """Load MS MARCO result CSVs and attach repetition/performance metrics.

    For every file, the overall CSV is read, one per-penalty DataFrame is
    obtained from ``load_for_repetition_penalty_ms_macro`` for each value in
    its ``repetition_penalty`` column, and the overall DataFrame gains the
    columns ``newline_score``, ``repetition_score``, ``total_repetitions``,
    ``answer_len``, ``perf`` and ``rap``. ``calc_rap_scores`` is called with
    ``bleu1`` as precision and ``rougeL`` as recall.

    Args:
        csv_result_files: iterable of CSV paths. The token before the file
            extension (path split on ``_`` and ``.``) must be ``true``,
            ``false`` or ``rag``.
        force_recalculate: forwarded to the per-penalty loader.

    Returns:
        dict mapping ``"<model><variant label>"`` to a dict with keys
        ``"df_overall"``, ``"df_list_repetition_penalty"`` and ``"file"``.
        Loading is best-effort: a file that fails is reported on stdout and
        left out of the result.
    """
    model_name_exts = {
        "true": "(RAG - Chat Template)",
        "false": "(RAG - Generic Prompt)",
        "rag": "(Non-RAG)",
    }
    result = {}
    for csv_result_file in csv_result_files:
        try:
            df = pd.read_csv(csv_result_file)
            parts = re.split(r"[_\.]", csv_result_file)
            model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
            print(f"\tmodel_name: {model_name}")

            dfs = [
                load_for_repetition_penalty_ms_macro(
                    csv_result_file,
                    repetition_penalty,
                    force_recalculate=force_recalculate,
                )
                for repetition_penalty in df["repetition_penalty"]
            ]

            answer_lens = []
            for df_rpp in dfs:
                # Non-string answers (e.g. NaN) count as length 0.
                df_rpp["answer_len"] = df_rpp["answer"].apply(
                    lambda x: len(x) if isinstance(x, str) else 0
                )
                answer_lens.append(df_rpp["answer_len"].mean())

            entry = {
                "df_overall": df,
                "df_list_repetition_penalty": dfs,
                "file": csv_result_file,
            }
            newline_score, repetition_score, perf, rap = calc_rap_scores(
                entry,
                precision="bleu1",
                recall="rougeL",
            )
            df["newline_score"] = newline_score
            df["repetition_score"] = repetition_score
            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
            df["answer_len"] = answer_lens
            df["perf"] = perf
            df["rap"] = rap

            # Publish only fully populated entries, so a failure above never
            # leaves an entry without the metric columns in the result.
            result[model_name] = entry
        except Exception as e:
            print(f"Error: {e}")
    return result
def load_ms_marco_result_v2(csv_result_files, force_recalculate=False):
    """Load MS MARCO result CSVs and score them with METEOR.

    Same flow as ``load_ms_marco_result``, except that a METEOR score is
    computed per repetition penalty (answers vs. ``ground_truth``) and stored
    in the overall DataFrame as ``meteor_scores``; that column is then passed
    to ``calc_rap_scores`` as both precision and recall.

    Args:
        csv_result_files: iterable of CSV paths. The token before the file
            extension (path split on ``_`` and ``.``) must be ``true``,
            ``false`` or ``rag``.
        force_recalculate: forwarded to the per-penalty loader.

    Returns:
        dict mapping ``"<model><variant label>"`` to a dict with keys
        ``"df_overall"``, ``"df_list_repetition_penalty"`` and ``"file"``.
        Loading is best-effort: a file that fails is reported on stdout and
        left out of the result.
    """
    model_name_exts = {
        "true": "(RAG - Chat Template)",
        "false": "(RAG - Generic Prompt)",
        "rag": "(Non-RAG)",
    }
    result = {}
    for csv_result_file in csv_result_files:
        try:
            df = pd.read_csv(csv_result_file)
            parts = re.split(r"[_\.]", csv_result_file)
            model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
            print(f"\tmodel_name: {model_name}")

            dfs = [
                load_for_repetition_penalty_ms_macro(
                    csv_result_file,
                    repetition_penalty,
                    force_recalculate=force_recalculate,
                )
                for repetition_penalty in df["repetition_penalty"]
            ]

            answer_lens = []
            for df_rpp in dfs:
                # Non-string answers (e.g. NaN) count as length 0.
                df_rpp["answer_len"] = df_rpp["answer"].apply(
                    lambda x: len(x) if isinstance(x, str) else 0
                )
                answer_lens.append(df_rpp["answer_len"].mean())
            df["answer_len"] = answer_lens

            # One METEOR score per repetition penalty.
            # NOTE(review): non-string answers are passed through unchanged;
            # confirm the metric accepts them.
            meteor_scores = [
                meteor.compute(
                    predictions=df_rpp["answer"], references=df_rpp["ground_truth"]
                )["meteor"]
                for df_rpp in dfs
            ]
            df["meteor_scores"] = meteor_scores

            entry = {
                "df_overall": df,
                "df_list_repetition_penalty": dfs,
                "file": csv_result_file,
            }
            newline_score, repetition_score, perf, rap = calc_rap_scores(
                entry,
                precision="meteor_scores",
                recall="meteor_scores",
            )
            df["newline_score"] = newline_score
            df["repetition_score"] = repetition_score
            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
            df["perf"] = perf
            df["rap"] = rap

            # Publish only fully populated entries, so a failure above never
            # leaves an entry without the metric columns in the result.
            result[model_name] = entry
        except Exception as e:
            print(f"Error: {e}")
    return result