# Spaces: Sleeping  (appears to be page-header residue from the site this file was copied from; not part of the module)
import re | |
import math | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import matplotlib.ticker as mtick | |
import seaborn as sns | |
# final version | |
# Five or more consecutive line breaks count as one abnormal run of newlines.
pattern_abnormal_newlines = re.compile(r"\n{5,}")

# A word-bounded chunk (group 1) immediately followed by one or more copies of
# itself (group 2). DOTALL lets "." cross line breaks, so a repeated chunk may
# span several lines.
pattern_text_repetitions = re.compile(
    r"\b(\w.+?)\b(\1+)",
    flags=re.MULTILINE | re.DOTALL,
)

# A "word." token directly followed by the same token (for example "a.a.").
# detect_repetitions skips any match whose text starts this way.
exception_pattern = re.compile(r"(\w+\.)\1")
# final version for repetition detection | |
def detect_repetitions(
    text, debug=False, pattern_text_repetitions=pattern_text_repetitions
):
    """Count the characters of *text* covered by abnormal newlines and repeated text.

    Args:
        text: Value to inspect. Anything that is not a ``str`` yields all zeros.
        debug: When true, print every match, its groups, ignored matches and
            the final result.
        pattern_text_repetitions: Compiled pattern used for the text-repetition
            pass; defaults to the module-level pattern of the same name.

    Returns:
        A tuple ``(newline_chars, repetition_chars, total_chars)`` where each
        count is the summed length of the accepted matches and the total is
        the sum of the first two.
    """
    newline_label = "abnormal newlines"
    repetition_label = "text repetitions"
    covered = {newline_label: 0, repetition_label: 0}

    if isinstance(text, str):
        detectors = (
            (newline_label, pattern_abnormal_newlines),
            (repetition_label, pattern_text_repetitions),
        )
        for label, detector in detectors:
            if debug:
                print(f"----detect {label}----")

            for hit in detector.finditer(text):
                if debug:
                    print(hit)
                    for group_no in range(1, len(hit.groups()) + 1):
                        print(
                            "Group {groupNum} found at {start}-{end}: `{group}`".format(
                                groupNum=group_no,
                                start=hit.start(group_no),
                                end=hit.end(group_no),
                                group=hit.group(group_no),
                            )
                        )

                matched_text = hit[0]
                # Matches that begin with a doubled "word." token are not counted.
                if exception_pattern.match(matched_text):
                    if debug:
                        print("ignored: ", matched_text)
                    continue

                covered[label] += hit.end() - hit.start()

    newline_total = covered[newline_label]
    repetition_total = covered[repetition_label]
    result = (newline_total, repetition_total, newline_total + repetition_total)

    if debug:
        print(result)
    return result
def detect_abnormal_newlines(text, debug=False):
    """Return only the abnormal-newline count computed by ``detect_repetitions``."""
    newline_total, _repetition_total, _combined = detect_repetitions(
        text, debug=debug
    )
    return newline_total
def detect_text_repetitions(text, debug=False):
    """Return only the text-repetition count computed by ``detect_repetitions``."""
    _newline_total, repetition_total, _combined = detect_repetitions(
        text, debug=debug
    )
    return repetition_total
def detect_scores(text, debug=False):
    """Return the three ``detect_repetitions`` counts as a ``pandas.Series``.

    The series holds, in order, the newline score, the repetition score and
    their total, which makes the function usable with ``Series.apply`` to fill
    three DataFrame columns at once.
    """
    scores = detect_repetitions(text, debug=debug)
    return pd.Series(list(scores))
def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
    """Load a result CSV and make sure it carries the three repetition-score columns.

    Args:
        result_file: Path of the CSV to read (lines starting with ``#`` are
            treated as comments, malformed lines only produce a warning).
        force_recalculate: Recompute the scores even when all three columns
            already exist.

    Returns:
        The loaded DataFrame. When scores were (re)computed from the
        ``answer`` column, the DataFrame is also written back to
        ``result_file``.
    """
    print(f"loading result file: {result_file}")
    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")

    score_columns = ["newline_score", "repetition_score", "total_repetitions"]
    already_scored = all(column in df.columns for column in score_columns)
    if already_scored and not force_recalculate:
        return df

    df[score_columns] = df["answer"].apply(detect_scores)
    df.to_csv(result_file, index=False)
    return df
def replace_last(source_string, old_string, new_string):
    """Replace the last occurrence of *old_string* in *source_string*.

    Args:
        source_string: Text to search.
        old_string: Substring to replace; must be non-empty
            (``str.rpartition`` raises ``ValueError`` for an empty separator).
        new_string: Replacement text.

    Returns:
        ``source_string`` with its final ``old_string`` swapped for
        ``new_string``. If ``old_string`` does not occur, ``source_string`` is
        returned unchanged.
    """
    head, separator, tail = source_string.rpartition(old_string)
    if not separator:
        # rpartition found nothing and returned ("", "", source_string);
        # concatenating here would wrongly prepend new_string to the input.
        return source_string
    return head + new_string + tail
def load_for_repetition_penalty(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load the scored result CSV that belongs to one repetition penalty.

    The per-penalty file name is derived from ``csv_result_file`` by replacing
    its last ``.csv`` with ``_RP_<penalty to 3 decimals>.csv``; loading and
    scoring are delegated to ``load_with_newline_and_repetition_scores``.
    """
    penalty_suffix = f"_RP_{repetition_penalty:.3f}.csv"
    penalty_file = replace_last(csv_result_file, ".csv", penalty_suffix)
    return load_with_newline_and_repetition_scores(
        penalty_file, force_recalculate=force_recalculate
    )
def calc_adjusted_performance(f, r):
    """Scale score *f* down by ``log10(10 + r)``.

    With ``r == 0`` the divisor is 1, so the score is returned unchanged;
    larger ``r`` values shrink the score logarithmically.
    """
    penalty_factor = math.log10(10 + r)
    return f / penalty_factor
def calculate_adjusted_performance(row):
    """Return the repetition-adjusted precision and recall of one row.

    Args:
        row: Mapping-like row providing ``total_repetitions``, ``precision``
            and ``recall``.

    Returns:
        A ``pandas.Series`` with the adjusted precision followed by the
        adjusted recall.
    """
    repetitions = row["total_repetitions"]
    adjusted = [
        calc_adjusted_performance(row[metric], repetitions)
        for metric in ("precision", "recall")
    ]
    return pd.Series(adjusted)
def load_performance_df(csv_result_file, repetition_penalty):
    """Load the evaluated-results JSON that matches a result CSV and penalty.

    The JSON path is built from ``csv_result_file`` by replacing its last
    ``.csv`` with ``_RP_<penalty to 3 decimals>-t2_evaluated.json`` and then
    swapping every ``/results/`` path segment for ``/eval/``.
    """
    json_suffix = f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
    eval_file = replace_last(csv_result_file, ".csv", json_suffix)
    eval_file = eval_file.replace("/results/", "/eval/")
    print(f"loading json file: {eval_file}")
    return pd.read_json(eval_file)
def calculate_performance_score(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Attach precision/recall (raw and repetition-adjusted) to a per-penalty CSV.

    Args:
        csv_result_file: Base result CSV path; the per-penalty file name is
            derived from it.
        repetition_penalty: Penalty value used in the derived file names.
        force_recalculate: Refresh the scores regardless of existing columns.

    Returns:
        The per-penalty DataFrame. When a refresh happens the DataFrame is
        also written back to the per-penalty CSV.
    """
    result_file = replace_last(
        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
    )
    print(f"loading result file: {result_file}")
    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")

    # NOTE(review): nothing below writes an "f1" column back, so the
    # `"f1" not in df.columns` test looks like it triggers a refresh on every
    # call - confirm whether that is intended.
    needs_refresh = (
        force_recalculate or "f2" in df.columns or "f1" not in df.columns
    )
    if needs_refresh:
        stale_columns = [
            "precision",
            "recall",
            "f1",
            "f2",
            "entities_in_answer",
            "entities_in_question",
        ]
        df.drop(columns=stale_columns, errors="ignore", inplace=True)

        # Keep only evaluated rows whose id appears in the result file, then
        # renumber them so the column assignments below align by position.
        evaluated = load_performance_df(csv_result_file, repetition_penalty)
        perf_df = evaluated[evaluated["id"].isin(df["id"])].reset_index(drop=True)
        print(f"perf_df len: {len(perf_df)}")

        df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
        score_pairs = perf_df["score"]
        df["precision"] = score_pairs.apply(lambda pair: pair[0])
        df["recall"] = score_pairs.apply(lambda pair: pair[1])

        # Reads "total_repetitions" from each row, so that column must already
        # be present in the CSV.
        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
            calculate_adjusted_performance, axis=1
        )

        df.to_csv(result_file, index=False)
        print(f"performance scores saved to result file: {result_file}")

    print(f"df len: {len(df)}")
    return df
def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
    """Divide per-penalty precision/recall by a log-scaled repetition factor.

    Args:
        result: Mapping whose ``"df_list_repetition_penalty"`` entry is a
            sequence of DataFrames, each with ``newline_score`` and
            ``repetition_score`` columns.
        precision: Precision values, one per DataFrame.
        recall: Recall values, one per DataFrame.

    Returns:
        ``(precision, recall)`` as new lists in which every value has been
        divided by ``log10(10 + mean newline score + mean repetition score)``
        of the DataFrame at the same position.
    """
    runs = result["df_list_repetition_penalty"]

    newline_score = [run["newline_score"].mean() for run in runs]
    print(f"newline_score: {newline_score}")

    repetition_score = [run["repetition_score"].mean() for run in runs]
    print(f"repetition_score: {repetition_score}")

    def penalized(scores):
        # Apply the same per-run factor to whichever metric is passed in.
        return [
            score / math.log10(10 + newlines + repeats)
            for score, newlines, repeats in zip(
                scores, newline_score, repetition_score
            )
        ]

    return penalized(precision), penalized(recall)
def plot_performance_scores(
    result,
    models=None,
    title="Performance",
):
    """Plot F1 and two repetition-adjusted F1 curves per model.

    For every model one figure is shown with the repetition penalty on the x
    axis and three curves: plain F1, F1 built from the per-row adjusted
    precision/recall columns, and F1 built from precision/recall adjusted by
    ``adjust_perf_scores_with_repetition_penalty``. The penalty with the best
    value of each curve is highlighted with a shaded band.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
            ``df_overall`` supplies the ``repetition_penalty`` column; each
            per-penalty DataFrame supplies ``precision``, ``recall``,
            ``adjusted_precision`` and ``adjusted_recall``.
        models: Models to plot; defaults to every key of ``result``.
        title: Text appended to the model name in the figure title.
    """

    def mean_of(frames, column):
        # One mean per repetition-penalty run.
        return [frame[column].mean() for frame in frames]

    def harmonic(precisions, recalls):
        # F1-style harmonic mean of paired precision/recall values.
        return [2 * (p * r) / (p + r) for p, r in zip(precisions, recalls)]

    def shade(center, color):
        # Highlight a +/-0.01 wide band around the given penalty value.
        plt.axvspan(
            center - 0.01,
            center + 0.01,
            alpha=0.5,
            edgecolor="none",
            facecolor=color,
        )

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        model_result = result[model]
        overall_df = model_result["df_overall"]
        runs = model_result["df_list_repetition_penalty"]

        # Plain F1 from the per-run mean precision and recall.
        raw_precision = mean_of(runs, "precision")
        raw_recall = mean_of(runs, "recall")
        f1 = harmonic(raw_precision, raw_recall)
        best_f1_index = f1.index(max(f1))

        # "Overall" adjustment: penalize the run-level means.
        penalized_precision, penalized_recall = (
            adjust_perf_scores_with_repetition_penalty(
                model_result, raw_precision, raw_recall
            )
        )
        afrp = harmonic(penalized_precision, penalized_recall)
        best_afrp_index = afrp.index(max(afrp))

        # "Per-question" adjustment: means of the precomputed adjusted columns.
        afrp2 = harmonic(
            mean_of(runs, "adjusted_precision"),
            mean_of(runs, "adjusted_recall"),
        )
        best_afrp2_index = afrp2.index(max(afrp2))

        repetition_penalties = list(overall_df["repetition_penalty"])

        plt.figure(figsize=(10, 6))
        shade(repetition_penalties[best_f1_index], "blue")
        shade(repetition_penalties[best_afrp2_index], "green")
        shade(repetition_penalties[best_afrp_index], "orange")

        plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
        plt.plot(
            repetition_penalties,
            afrp2,
            label="Per-question RF Adjusted F1",
            marker="s",
            color="green",
        )
        plt.plot(
            repetition_penalties,
            afrp,
            label="Overall RF Adjusted F1",
            marker="o",
            color="orange",
        )

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        plt.xlim(0.99, 1.31)
        # Scores are fractions of 1; show them as percentages.
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
        plt.show()
def plot_best_afrp(
    result,
    models=None,
    title="Models with Best Repetition Factor Adjusted F1",
    ref_result=None,
):
    """Bar-chart each model's best repetition-adjusted F1 next to its plain F1.

    For every model the repetition penalty with the highest adjusted F1
    (F1 divided by ``log10(10 + mean newline score + mean repetition score)``)
    is selected, and the adjusted and plain F1 at that penalty are plotted.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
        models: Models to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping ``name -> CSV path`` of reference results.
            Their F1 is computed from the mean ``precision``/``recall``
            columns and used for both bars.
    """
    model_names = []
    best_f1 = []
    best_afrp = []

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        precision = [run["precision"].mean() for run in runs]
        recall = [run["recall"].mean() for run in runs]
        f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]

        newline_score = [run["newline_score"].mean() for run in runs]
        print(f"newline_score: {newline_score}")

        repetition_score = [run["repetition_score"].mean() for run in runs]
        print(f"repetition_score: {repetition_score}")

        afrp = [
            f / math.log10(10 + n + r)
            for f, n, r in zip(f1, newline_score, repetition_score)
        ]

        top_afrp = max(afrp)
        top_index = afrp.index(top_afrp)
        top_penalty = overall_df["repetition_penalty"][top_index]
        top_f1 = f1[top_index]

        best_afrp.append(top_afrp)
        best_f1.append(top_f1)
        print(
            f"best repetition penalty: {top_penalty}, best afrp: {top_afrp}, f1: {top_f1}"
        )
        model_names.append(f"{model} (RP={top_penalty})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for ref_name in ref_result.keys():
            model_names.append(ref_name)
            ref_df = pd.read_csv(ref_result[ref_name])
            p = ref_df["precision"].mean()
            r = ref_df["recall"].mean()
            ref_f1 = 2 * p * r / (p + r) if p + r > 0 else 0
            # Reference results get no repetition adjustment: same value twice.
            best_f1.append(ref_f1)
            best_afrp.append(ref_f1)

    print("model_names:", model_names)
    print("best_f1:", best_f1)
    print("best_afrp:", best_afrp)

    metric_order = ["Repetition Factor Adjusted F1", "F1"]
    data = pd.DataFrame(
        {
            "Model": model_names,
            metric_order[0]: best_afrp,
            metric_order[1]: best_f1,
        }
    )

    # Long format, then one row per metric and one column per model.
    long_form = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    wide_form = long_form.pivot(index="Metric", columns="Model", values="Score")
    # Keep models in input order and metrics as adjusted F1, then F1.
    wide_form = wide_form[model_names].reindex(metric_order)

    plt.figure(figsize=(10, 6))
    ax = wide_form.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest bar for the value labels.
    max_value = max([max(best_afrp), max(best_f1)]) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (as a percentage) above it.
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f"{height * 100:.1f}",
            (bar.get_x() + bar.get_width() / 2.0, height),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
def plot_best_performance(
    result,
    models=None,
    title="Models with Best F1 Score",
    adjusted_f1=False,
    ref_result=None,
):
    """Bar-chart precision, recall and F1 of each model at its best-F1 penalty.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
            ``df_overall`` supplies ``repetition_penalty``; each per-penalty
            DataFrame supplies ``precision``, ``recall``, ``newline_score``
            and ``repetition_score``.
        models: Models to include; defaults to every key of ``result``.
        title: Figure title.
        adjusted_f1: When true, precision and recall are first adjusted with
            ``adjust_perf_scores_with_repetition_penalty`` and the chart uses
            the "Adjusted ... with RP" metric labels.
        ref_result: Optional mapping ``name -> CSV path`` of reference results
            whose mean ``precision``/``recall`` are added unadjusted.
    """
    model_names = []
    best_precision = []
    best_recall = []
    best_f1 = []

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        precision = [run["precision"].mean() for run in runs]
        recall = [run["recall"].mean() for run in runs]
        if adjusted_f1:
            precision, recall = adjust_perf_scores_with_repetition_penalty(
                result[model], precision, recall
            )

        f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]

        top_f1 = max(f1)
        top_index = f1.index(top_f1)
        top_penalty = overall_df["repetition_penalty"][top_index]

        best_f1.append(top_f1)
        best_precision.append(precision[top_index])
        best_recall.append(recall[top_index])
        print(
            f"best repetition penalty: {top_penalty}, best f1: {top_f1}, precision: {precision[top_index]}, recall: {recall[top_index]}"
        )
        model_names.append(f"{model} (RP={top_penalty})")

        # Report how much repetition the winning run contains.
        best_run = runs[top_index]
        print(
            f"newline_score: {best_run['newline_score'].sum()}, repetition_score: {best_run['repetition_score'].sum()}"
        )

    if ref_result is not None:
        print("ref_result:", ref_result)
        for ref_name in ref_result.keys():
            model_names.append(ref_name)
            ref_df = pd.read_csv(ref_result[ref_name])
            p = ref_df["precision"].mean()
            r = ref_df["recall"].mean()
            best_precision.append(p)
            best_recall.append(r)
            # Same zero guard as plot_best_afrp / plot_best_performance_ms_macro:
            # report 0 instead of dividing by zero when both means are 0.
            best_f1.append(2 * (p * r) / (p + r) if p + r > 0 else 0)

    if adjusted_f1:
        metric_labels = [
            "Adjusted Precision with RP",
            "Adjusted Recall with RP",
            "Adjusted F1 with RP",
        ]
    else:
        metric_labels = ["Precision", "Recall", "F1"]

    data = pd.DataFrame(
        {
            "Model": model_names,
            metric_labels[0]: best_precision,
            metric_labels[1]: best_recall,
            metric_labels[2]: best_f1,
        }
    )

    # Long format, then one row per metric and one column per model.
    long_form = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    wide_form = long_form.pivot(index="Metric", columns="Model", values="Score")
    # Keep models in input order and metrics as precision, recall, F1.
    wide_form = wide_form[model_names].reindex(metric_labels)

    plt.figure(figsize=(10, 6))
    ax = wide_form.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest bar for the value labels.
    max_value = max([max(best_precision), max(best_recall), max(best_f1)]) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (as a percentage) above it.
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f"{height * 100:.1f}",
            (bar.get_x() + bar.get_width() / 2.0, height),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
def plot_best_performance_ms_macro(
    result,
    models=None,
    title="Models with Best Repetition Factor Adjusted Performance",
    ref_result=None,
    skip_generic_prompt=False,
    include_adjusted_performance=True,
):
    """Bar-chart each model's best BLEU-1/ROUGE-L based performance score.

    The performance score is the harmonic mean of the ``bleu1`` and ``rougeL``
    values in ``df_overall``; the adjusted score divides it by
    ``log10(10 + mean newline score + mean repetition score)`` of the matching
    per-penalty DataFrame.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
        models: Models to include; defaults to every key of ``result``.
        title: Figure title.
        ref_result: Optional mapping ``name -> CSV path``; the first row's
            ``bleu1``/``rougeL`` of each CSV is added without adjustment.
        skip_generic_prompt: Skip models whose name contains "generic prompt".
        include_adjusted_performance: When true, pick the best penalty by the
            adjusted score and plot adjusted vs. overall score; otherwise pick
            by the plain score and plot BLEU-1, ROUGE-L and the overall score.
    """
    model_names = []
    best_f1 = []
    best_afrp = []
    best_bleu1 = []
    best_rougeL = []

    if models is None:
        models = result.keys()

    for model in models:
        if skip_generic_prompt and "generic prompt" in model:
            continue

        print(f"model: {model}")
        overall_df = result[model]["df_overall"]
        runs = result[model]["df_list_repetition_penalty"]

        bleu1 = list(overall_df["bleu1"])
        rougeL = list(overall_df["rougeL"])
        f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]

        newline_score = [run["newline_score"].mean() for run in runs]
        print(f"newline_score: {newline_score}")

        repetition_score = [run["repetition_score"].mean() for run in runs]
        print(f"repetition_score: {repetition_score}")

        afrp = [
            f / math.log10(10 + n + r)
            for f, n, r in zip(f1, newline_score, repetition_score)
        ]

        # Rank penalties by the adjusted score only when it is being reported.
        ranking = afrp if include_adjusted_performance else f1
        top_score = max(ranking)
        top_index = ranking.index(top_score)
        top_penalty = overall_df["repetition_penalty"][top_index]

        best_afrp.append(top_score)
        best_f1.append(f1[top_index])
        best_bleu1.append(bleu1[top_index])
        best_rougeL.append(rougeL[top_index])
        print(
            f"best repetition penalty: {top_penalty}, best afrp: {top_score}, f1: {f1[top_index]}"
        )
        model_names.append(f"{model} (RP={top_penalty})")

    if ref_result is not None:
        print("ref_result:", ref_result)
        for ref_name in ref_result.keys():
            model_names.append(ref_name)
            ref_df = pd.read_csv(
                ref_result[ref_name], comment="#", on_bad_lines="warn"
            )
            p = ref_df["bleu1"][0]
            best_bleu1.append(p)
            r = ref_df["rougeL"][0]
            best_rougeL.append(r)
            ref_f1 = 2 * p * r / (p + r) if p + r > 0 else 0
            # Reference results get no repetition adjustment: same value twice.
            best_f1.append(ref_f1)
            best_afrp.append(ref_f1)

    print("model_names:", model_names)
    print("best_f1:", best_f1)
    print("best_afrp:", best_afrp)

    if include_adjusted_performance:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "Repetition Factor Adjusted Perf Score": best_afrp,
                "Overall Perf Score": best_f1,
            }
        )
    else:
        data = pd.DataFrame(
            {
                "Model": model_names,
                "Bleu-1": best_bleu1,
                "Rouge-L": best_rougeL,
                "Overall Perf Score": best_f1,
            }
        )

    # Long format, then one row per metric and one column per model.
    long_form = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
    wide_form = long_form.pivot(index="Metric", columns="Model", values="Score")
    # Keep models in input order and metrics in DataFrame column order.
    metric_order = list(data.columns)[1:]
    wide_form = wide_form[model_names].reindex(metric_order)

    plt.figure(figsize=(10, 6))
    ax = wide_form.plot(kind="bar", ax=plt.gca(), width=0.9)
    plt.title(title)
    plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
    plt.xticks(rotation=0)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))

    # Leave 12% headroom above the tallest plotted bar for the value labels.
    peak_afrp = max(best_afrp)
    peak_f1 = max(best_f1)
    peak_bleu1 = max(best_bleu1)
    peak_rougeL = max(best_rougeL)
    if include_adjusted_performance:
        peaks = [peak_afrp, peak_f1]
    else:
        peaks = [peak_afrp, peak_f1, peak_bleu1, peak_rougeL]
    max_value = max(peaks) * 1.12
    print("max_value:", max_value)
    ax.set_ylim(0, max_value)

    # Write each bar's value (as a percentage) above it.
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f"{height * 100:.1f}",
            (bar.get_x() + bar.get_width() / 2.0, height),
            ha="center",
            va="bottom",
            xytext=(0, 10),
            textcoords="offset points",
            rotation=90,
        )

    plt.show()
# Base result CSVs for the non-RAG runs; each trailing comment names the model.
# Commented-out paths are kept as-is for reference.
non_rag_csv_result_files = [
    # Phi-3-mini-128k-instruct(batch size:16)
    "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv",
    # "./data/results/Tune_2024-04-12_17-14-28.csv",  # Orca-2-7b
    # Llama-2-7b-chat-hf
    "./data/results/Tune_2024-04-09_09-19-22.csv",
    # "./data/results/Tune_2024-04-15_12-43-48.csv",  # Llama-2-7b-chat-hf(cwq)
    # Meta-Llama-3-8B-Instruct
    "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv",
    # Meta-Llama-3-8B-Instruct
    "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv",
    # Mistral-7B-Instruct-v0.2
    # NOTE(review): doubled ".csv.csv" extension - confirm the file name on disk.
    "./data/results/Tune_2024-04-16_12-24-27.csv.csv",
    # gemma-1.1-2b-it
    "./data/results/gemma-1.1-2b-it_wd_non_rag.csv",
    # "./data/results/Tune_2024-04-17_04-23-15.csv",  # gemma-1.1-2b-it(cwq)
    # gemma-1.1-7b-it
    "./data/results/gemma-1.1-7b-it_wd_non_rag.csv",
    # "./data/results/Tune_2024-04-18_21-56-52.csv",  # gemma-1.1-7b-it
    # "./data/results/Tune_2024-04-19_08-14-49.csv",  # gemma-1.1-7b-it(cwq)
    # "./data/results/Tune_2024-04-17_23-52-04.csv",  # Orca-2-13b
    # Llama-2-13b-chat-hf
    "./data/results/Tune_2024-04-10_16-53-38.csv",
    # Llama-2-70b-chat-hf
    "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv",
    # Meta-Llama-3-70B-Instruct
    "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv",
    # "./data/results/llama-3-70b-instruct-awq_wd_non_rag.csv",  # Llama-3-70b-instruct-awq
]
# Base result CSVs for the RAG runs; each trailing comment names the model.
# Commented-out paths are kept as-is for reference.
rag_csv_result_files = [
    # Phi-3-mini-128k-instruct (original note said batch size 16, file name
    # says batch_4 - TODO confirm)
    "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv",
    # Phi-3-mini-128k-instruct(true) (original note: batch size 16 - TODO confirm)
    "./data/results/Phi-3-mini-128k-instruct_wd_true.csv",
    # "./data/results/Tune_2024-03-19_19-13-36.csv",  # Orca-2-7b
    # Llama-2-7b-chat-hf
    "./data/results/Tune_2024-03-20_15-35-37.csv",
    # Llama-2-7b-chat-hf(true)
    "./data/results/Llama-2-7b-chat-hf_wd_true.csv",
    # "./data/results/Tune_2024-04-15_14-52-31.csv",  # Llama-2-7b-chat-hf(cwq)
    # Meta-Llama-3-8b-instruct
    "./data/results/Meta-Llama-3-8B-Instruct_wd.csv",
    # Meta-Llama-3-8b-instruct(true)
    "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv",
    # Mistral-7B-Instruct-v0.2
    "./data/results/Tune_2024-03-29_11-28-20.csv",
    # Mistral-7B-Instruct-v0.2(true)
    "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv",
    # gemma-1.1-2b-it
    "./data/results/gemma-1.1-2b-it_wd.csv",
    # gemma-1.1-2b-it(true)
    "./data/results/gemma-1.1-2b-it_wd_true.csv",
    # "./data/results/Tune_2024-04-20_13-12-43.csv",  # gemma-1.1-2b-it
    # "./data/results/Tune_2024-04-16_06-48-32.csv",  # gemma-1.1-2b-it(cwq)
    # gemma-1.1-7b-it
    "./data/results/gemma-1.1-7b-it_wd.csv",
    # gemma-1.1-7b-it(true)
    "./data/results/gemma-1.1-7b-it_wd_true.csv",
    # "./data/results/Tune_2024-04-18_13-18-38.csv",  # gemma-1.1-7b-it
    # "./data/results/Tune_2024-04-19_04-26-33.csv",  # gemma-1.1-7b-it(cwq)
    # "./data/results/Orca-2-13b_wd.csv",  # Orca-2-13b
    # "./data/results/Tune_2024-03-22_09-28-56.csv",  # Orca-2-13b
    # Llama-2-13b-chat-hf
    "./data/results/Tune_2024-03-25_23-32-57.csv",
    # Llama-2-13b-chat-hf(true)
    "./data/results/Llama-2-13b-chat-hf_wd_true.csv",
    # Llama-2-70b-chat-hf
    "./data/results/Llama-2-70b-chat-hf_wd.csv",
    # Llama-2-70b-chat-hf(true)
    "./data/results/Llama-2-70b-chat-hf_wd_true.csv",
    # Meta-Llama-3-70B-Instruct
    "./data/results/Meta-Llama-3-70B-Instruct_wd.csv",
    # Meta-Llama-3-70B-Instruct(true)
    "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv",
]
# Dataset loaded once at import time; its "wellFormedAnswers" column is used
# by load_for_repetition_penalty_ms_macro as the ground truth.
df_ms_macro = pd.read_json(path_or_buf="./data/datasets/ms_macro.json")
def load_for_repetition_penalty_ms_macro(
    csv_result_file, repetition_penalty, force_recalculate=False
):
    """Load a scored per-penalty CSV and sync its ground truth with ``df_ms_macro``.

    The per-penalty file is located and scored like in
    ``load_for_repetition_penalty``. If the first ``ground_truth`` value
    differs from the first ``wellFormedAnswers`` value of ``df_ms_macro``, the
    whole ``ground_truth`` column is replaced and the CSV is rewritten.

    Returns:
        The (possibly updated) DataFrame.
    """
    penalty_file = replace_last(
        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
    )
    df = load_with_newline_and_repetition_scores(
        penalty_file, force_recalculate=force_recalculate
    )

    reference_answers = df_ms_macro["wellFormedAnswers"]
    # Only the first row is compared to decide whether the column is stale.
    ground_truth_is_current = df["ground_truth"][0] == reference_answers[0]
    if not ground_truth_is_current:
        df["ground_truth"] = reference_answers
        print("ground_truth updated for:", penalty_file)
        df.to_csv(penalty_file, index=False)

    return df
# MS MACRO | |
def plot_performance_scores_ms_macro(
    result,
    models=None,
    title="Performance",
):
    """Plot the overall and repetition-adjusted performance score per model.

    The overall score is the harmonic mean of the ``bleu1`` and ``rougeL``
    columns of ``df_overall``; the adjusted score is the same harmonic mean
    after both inputs went through
    ``adjust_perf_scores_with_repetition_penalty``. The penalty with the best
    value of each curve is highlighted with a shaded band.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
        models: Models to plot; defaults to every key of ``result``.
        title: Text appended to the model name in the figure title.
    """

    def harmonic(first, second):
        # Harmonic mean of paired values.
        return [2 * (p * r) / (p + r) for p, r in zip(first, second)]

    def shade(center, color):
        # Highlight a +/-0.01 wide band around the given penalty value.
        plt.axvspan(
            center - 0.01,
            center + 0.01,
            alpha=0.5,
            edgecolor="none",
            facecolor=color,
        )

    if models is None:
        models = result.keys()

    for model in models:
        print(f"model: {model}")
        model_result = result[model]
        overall_df = model_result["df_overall"]

        raw_bleu1 = list(overall_df["bleu1"])
        raw_rougeL = list(overall_df["rougeL"])
        f1 = harmonic(raw_bleu1, raw_rougeL)
        best_f1_index = f1.index(max(f1))

        penalized_bleu1, penalized_rougeL = (
            adjust_perf_scores_with_repetition_penalty(
                model_result, raw_bleu1, raw_rougeL
            )
        )
        afrp = harmonic(penalized_bleu1, penalized_rougeL)
        best_afrp_index = afrp.index(max(afrp))

        repetition_penalties = list(overall_df["repetition_penalty"])

        plt.figure(figsize=(10, 6))
        shade(repetition_penalties[best_f1_index], "blue")
        shade(repetition_penalties[best_afrp_index], "orange")

        plt.plot(
            repetition_penalties,
            f1,
            label="Overall Perf Score",
            marker="D",
            color="blue",
        )
        plt.plot(
            repetition_penalties,
            afrp,
            label="RF Adjusted Perf Score",
            marker="o",
            color="orange",
        )

        plt.xlabel("Repetition Penalties")
        plt.ylabel("Score")
        plt.xlim(0.99, 1.31)
        # Scores are fractions of 1; show them as percentages.
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        plt.title(f"{model} {title}")
        plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
        plt.show()
def plot_repetition_factors(result, groups):
    """Plot repetition factor against repetition penalty, one figure per group.

    A model belongs to a group when the group string occurs in the lowercased
    model name. The repetition factor of a run is
    ``log10(10 + mean total_repetitions)``.

    Args:
        result: Mapping ``model -> {"df_overall", "df_list_repetition_penalty"}``.
        groups: Iterable of substrings used to select models per figure.
    """
    for group in groups:
        plt.figure(figsize=(10, 6))
        max_value = 0

        for model in result.keys():
            if group not in model.lower():
                continue

            print(f"model: {model}")
            overall_df = result[model]["df_overall"]
            repetition_penalties = list(overall_df["repetition_penalty"])
            mean_score = [
                math.log10(10 + run["total_repetitions"].mean())
                for run in result[model]["df_list_repetition_penalty"]
            ]

            sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
            max_value = max(max_value, max(mean_score))

        # 5% headroom above the highest curve, but never below 1.5.
        max_value = max(max_value * 1.05, 1.5)

        # The factor is log10(10 + non-negative mean), so the axis starts at 1.
        plt.ylim(1, max_value)
        plt.grid(True)
        plt.xlabel("Repetition Penalties")
        plt.ylabel("Repetition Factors")
        plt.title("Repetition Factors vs Repetition Penalties")
        plt.legend()
        plt.show()