dh-mc committed on
Commit 04522ca
1 Parent(s): 8f1a330

added comparison of different repetition detection methods

eval_modules/calc_repetitions.py CHANGED
@@ -7,6 +7,9 @@ import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick
 import seaborn as sns
 import nltk
+import evaluate
+
+meteor = evaluate.load("meteor")
 
 print(f"loading: {__file__}")
 
@@ -1316,3 +1319,65 @@ def load_ms_marco_result(csv_result_files, force_recalculate=False):
             print(f"Error: {e}")
 
     return result
+
+
+def load_ms_marco_result_v2(csv_result_files, force_recalculate=False):
+    model_name_exts = {
+        "true": "(RAG - Chat Template)",
+        "false": "(RAG - Generic Prompt)",
+        "rag": "(Non-RAG)",
+    }
+
+    result = {}
+    for csv_result_file in csv_result_files:
+        try:
+            df = pd.read_csv(csv_result_file)
+
+            parts = re.split(r"[_\.]", csv_result_file)
+            model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
+
+            print(f"\tmodel_name: {model_name}")
+            dfs = [
+                load_for_repetition_penalty_ms_macro(
+                    csv_result_file,
+                    repetition_penalty,
+                    force_recalculate=force_recalculate,
+                )
+                for repetition_penalty in df["repetition_penalty"]
+            ]
+
+            answer_lens = []
+            for df_rpp in dfs:
+                df_rpp["answer_len"] = df_rpp["answer"].apply(
+                    lambda x: len(x) if isinstance(x, str) else 0
+                )
+                answer_lens.append(df_rpp["answer_len"].mean())
+            df["answer_len"] = answer_lens
+
+            meteor_scores = []
+            for df_rpp in dfs:
+                meteor_score = meteor.compute(
+                    predictions=df_rpp["answer"], references=df_rpp["ground_truth"]
+                )["meteor"]
+                meteor_scores.append(meteor_score)
+            df["meteor_scores"] = meteor_scores
+
+            result[model_name] = {
+                "df_overall": df,
+                "df_list_repetition_penalty": dfs,
+                "file": csv_result_file,
+            }
+            newline_score, repetition_score, perf, rap = calc_rap_scores(
+                result[model_name],
+                precision="meteor_scores",
+                recall="meteor_scores",
+            )
+            df["newline_score"] = newline_score
+            df["repetition_score"] = repetition_score
+            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
+            df["perf"] = perf
+            df["rap"] = rap
+        except Exception as e:
+            print(f"Error: {e}")
+
+    return result
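For orientation, a minimal sketch of how the new loader might be driven. The CSV paths below are hypothetical, the snippet assumes the module has been imported (or is run inside this file), and calc_rap_scores is assumed to be defined elsewhere in the module:

# Illustrative only: placeholder result files following the *_true.csv / *_false.csv naming
csv_result_files = [
    "./data/results/some-model_mm_true.csv",
    "./data/results/some-model_mm_false.csv",
]

result = load_ms_marco_result_v2(csv_result_files)

for model_name, entry in result.items():
    overall = entry["df_overall"]
    # one METEOR score and one repetition count per repetition_penalty value
    print(model_name)
    print(overall[["repetition_penalty", "meteor_scores", "total_repetitions", "rap"]])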
eval_modules/calc_repetitions_v1.py ADDED
@@ -0,0 +1,929 @@
+import re
+import math
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import seaborn as sns
+
+# final version
+pattern_abnormal_newlines = re.compile(r"\n{5,}")
+pattern_text_repetitions = re.compile(r"\b(\w.+?)\b(\1+)", re.M | re.DOTALL)
+exception_pattern = re.compile(r"(\w+\.)\1")
+
+
+# final version for repetition detection
+def detect_repetitions(
+    text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+):
+    subtotals = [0, 0]
+
+    if isinstance(text, str):
+        patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+        for i, pattern in enumerate(patterns):
+            if debug:
+                print(
+                    f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                )
+            matches = pattern.finditer(text)
+            for match in matches:
+                if debug:
+                    print(match)
+                    for groupNum in range(0, len(match.groups())):
+                        groupNum = groupNum + 1
+                        print(
+                            "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                groupNum=groupNum,
+                                start=match.start(groupNum),
+                                end=match.end(groupNum),
+                                group=match.group(groupNum),
+                            )
+                        )
+
+                if exception_pattern.match(match[0]):
+                    if debug:
+                        print("ignored: ", match[0])
+                    continue
+
+                start, end = match.span()
+                subtotals[i] += end - start
+
+    result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+    if debug:
+        print(result)
+    return result
+
+
+def detect_abnormal_newlines(text, debug=False):
+    return detect_repetitions(text, debug=debug)[0]
+
+
+def detect_text_repetitions(text, debug=False):
+    return detect_repetitions(text, debug=debug)[1]
+
+
+def detect_scores(text, debug=False):
+    newline_score, repetition_score, total_repetitions = detect_repetitions(
+        text, debug=debug
+    )
+    return pd.Series([newline_score, repetition_score, total_repetitions])
+
+
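A small sketch of how this first (v1) detector behaves on a hand-made string; the sample text is invented for illustration, and the second score is the total length of text covered by detected repetitions:

sample = "The answer is 42. The answer is 42. The answer is 42."
newline_chars, repeated_chars, total = detect_repetitions(sample)
# no run of five or more newlines, so newline_chars stays 0;
# repeated_chars counts the characters spanned by the repeated phrase
print(newline_chars, repeated_chars, total)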
+def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
+    print(f"loading result file: {result_file}")
+    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
+
+    if (
+        force_recalculate
+        or "newline_score" not in df.columns
+        or "repetition_score" not in df.columns
+        or "total_repetitions" not in df.columns
+    ):
+        df[["newline_score", "repetition_score", "total_repetitions"]] = df[
+            "answer"
+        ].apply(detect_scores)
+        df.to_csv(result_file, index=False)
+
+    return df
+
+
+def replace_last(source_string, old_string, new_string):
+    head, _sep, tail = source_string.rpartition(old_string)
+    return head + new_string + tail
+
+
+def load_for_repetition_penalty(
+    csv_result_file, repetition_penalty, force_recalculate=False
+):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
+    )
+    return load_with_newline_and_repetition_scores(
+        result_file, force_recalculate=force_recalculate
+    )
+
+
+def calc_adjusted_performance(f, r):
+    return f / math.log10(10 + r)
+
+
+def calculate_adjusted_performance(row):
+    r = row["total_repetitions"]
+    adjusted_precision = calc_adjusted_performance(row["precision"], r)
+    adjusted_recall = calc_adjusted_performance(row["recall"], r)
+    return pd.Series([adjusted_precision, adjusted_recall])
+
+
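A quick worked example of the damping used by calc_adjusted_performance, just to make the scale concrete: the divisor is log10(10 + r), so a clean answer keeps its score and roughly 90 repeated characters halve it.

# Illustrative values only
print(calc_adjusted_performance(0.8, 0))    # log10(10)   = 1.0 -> 0.8
print(calc_adjusted_performance(0.8, 90))   # log10(100)  = 2.0 -> 0.4
print(calc_adjusted_performance(0.8, 990))  # log10(1000) = 3.0 -> ~0.267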
+def load_performance_df(csv_result_file, repetition_penalty):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
+    )
+    result_file = result_file.replace("/results/", "/eval/")
+    print(f"loading json file: {result_file}")
+    df = pd.read_json(result_file)
+
+    return df
+
+
+def calculate_performance_score(
+    csv_result_file, repetition_penalty, force_recalculate=False
+):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
+    )
+    print(f"loading result file: {result_file}")
+    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
+
+    if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
+        df.drop(
+            columns=[
+                "precision",
+                "recall",
+                "f1",
+                "f2",
+                "entities_in_answer",
+                "entities_in_question",
+            ],
+            errors="ignore",
+            inplace=True,
+        )
+        perf_df = load_performance_df(csv_result_file, repetition_penalty)
+        filtered_df = perf_df[perf_df["id"].isin(df["id"])]
+        perf_df = filtered_df.reset_index(drop=True)
+        print(f"perf_df len: {len(perf_df)}")
+        # print(perf_df.head())
+
+        df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
+
+        df["precision"] = perf_df["score"].apply(lambda x: x[0])
+        df["recall"] = perf_df["score"].apply(lambda x: x[1])
+
+        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
+            calculate_adjusted_performance, axis=1
+        )
+
+        df.to_csv(result_file, index=False)
+        print(f"performance scores saved to result file: {result_file}")
+
+    print(f"df len: {len(df)}")
+
+    return df
+
+
+def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
+    newline_score = [
+        df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
+    ]
+    print(f"newline_score: {newline_score}")
+
+    repetition_score = [
+        df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
+    ]
+    print(f"repetition_score: {repetition_score}")
+
+    precision = [
+        f / math.log10(10 + n + r)
+        for f, n, r in zip(precision, newline_score, repetition_score)
+    ]
+    recall = [
+        f / math.log10(10 + n + r)
+        for f, n, r in zip(recall, newline_score, repetition_score)
+    ]
+
+    return precision, recall
+
196
+
197
+ def plot_performance_scores(
198
+ result,
199
+ models=None,
200
+ title="Performance",
201
+ ):
202
+
203
+ if models is None:
204
+ models = result.keys()
205
+ for model in models:
206
+ print(f"model: {model}")
207
+ df = result[model]["df_overall"]
208
+
209
+ # Calculate the statistics
210
+ precision = [
211
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
212
+ ]
213
+ recall = [
214
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
215
+ ]
216
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
217
+ best_f1 = max(f1)
218
+ best_f1_index = f1.index(best_f1)
219
+
220
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
221
+ result[model], precision, recall
222
+ )
223
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
224
+
225
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
226
+ best_afrp = max(afrp)
227
+ best_afrp_index = afrp.index(best_afrp)
228
+
229
+ adjusted_precision = [
230
+ df["adjusted_precision"].mean()
231
+ for df in result[model]["df_list_repetition_penalty"]
232
+ ]
233
+ adjusted_recall = [
234
+ df["adjusted_recall"].mean()
235
+ for df in result[model]["df_list_repetition_penalty"]
236
+ ]
237
+ afrp2 = [
238
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
239
+ ]
240
+ best_afrp2 = max(afrp2)
241
+ best_afrp2_index = afrp2.index(best_afrp2)
242
+
243
+ repetition_penalties = list(df["repetition_penalty"])
244
+
245
+ # line plot for F1 and the repetition-adjusted F1 variants
246
+ plt.figure(figsize=(10, 6))
247
+
248
+ plt.axvspan(
249
+ repetition_penalties[best_f1_index] - 0.01,
250
+ repetition_penalties[best_f1_index] + 0.01,
251
+ alpha=0.5,
252
+ edgecolor="none",
253
+ facecolor="blue",
254
+ )
255
+
256
+ plt.axvspan(
257
+ repetition_penalties[best_afrp2_index] - 0.01,
258
+ repetition_penalties[best_afrp2_index] + 0.01,
259
+ alpha=0.5,
260
+ edgecolor="none",
261
+ facecolor="green",
262
+ )
263
+
264
+ plt.axvspan(
265
+ repetition_penalties[best_afrp_index] - 0.01,
266
+ repetition_penalties[best_afrp_index] + 0.01,
267
+ alpha=0.5,
268
+ edgecolor="none",
269
+ facecolor="orange",
270
+ )
271
+
272
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
273
+ plt.plot(
274
+ repetition_penalties,
275
+ afrp2,
276
+ label="Per-question RF Adjusted F1",
277
+ marker="s",
278
+ color="green",
279
+ )
280
+ plt.plot(
281
+ repetition_penalties,
282
+ afrp,
283
+ label="Overall RF Adjusted F1",
284
+ marker="o",
285
+ color="orange",
286
+ )
287
+ plt.xlabel("Repetition Penalties")
288
+ plt.ylabel("Score")
289
+ plt.xlim(0.99, 1.31)
290
+ # y in percentage
291
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
292
+ plt.title(f"{model} {title}")
293
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
294
+
295
+ plt.show()
296
+
297
+
298
+ def plot_best_afrp(
299
+ result,
300
+ models=None,
301
+ title="Models with Best Repetition Factor Adjusted F1",
302
+ ref_result=None,
303
+ ):
304
+ # Initialize lists to store the statistics
305
+ model_names = []
306
+ best_f1 = []
307
+ best_afrp = []
308
+ best_repetition_penalty = []
309
+
310
+ if models is None:
311
+ models = result.keys()
312
+ for model in models:
313
+ print(f"model: {model}")
314
+ df = result[model]["df_overall"]
315
+
316
+ # Calculate the statistics
317
+ precision = [
318
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
319
+ ]
320
+ recall = [
321
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
322
+ ]
323
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
324
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
325
+
326
+ newline_score = [
327
+ df["newline_score"].mean()
328
+ for df in result[model]["df_list_repetition_penalty"]
329
+ ]
330
+ print(f"newline_score: {newline_score}")
331
+
332
+ repetition_score = [
333
+ df["repetition_score"].mean()
334
+ for df in result[model]["df_list_repetition_penalty"]
335
+ ]
336
+ print(f"repetition_score: {repetition_score}")
337
+
338
+ afrp = [
339
+ f / math.log10(10 + n + r)
340
+ for f, n, r in zip(f1, newline_score, repetition_score)
341
+ ]
342
+
343
+ best_afrp.append(max(afrp))
344
+ best_afrp_index = afrp.index(best_afrp[-1])
345
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
346
+
347
+ best_f1.append(f1[best_afrp_index])
348
+
349
+ print(
350
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
351
+ )
352
+
353
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
354
+
355
+ model_names.append(
356
+ f"{model} (RP={best_repetition_penalty[-1]})"
357
+ ) # Add the model name to the list
358
+
359
+ if ref_result is not None:
360
+ print("ref_result:", ref_result)
361
+ for model in ref_result.keys():
362
+ model_names.append(model)
363
+ df = pd.read_csv(ref_result[model])
364
+ # df = df[df["id"].isin(wikidata_df["id"])]
365
+
366
+ p = df["precision"].mean()
367
+ r = df["recall"].mean()
368
+
369
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
370
+ best_f1.append(f1)
371
+ best_afrp.append(f1)
372
+
373
+ print("model_names:", model_names)
374
+ print("best_f1:", best_f1)
375
+ print("best_afrp:", best_afrp)
376
+
377
+ # Create a DataFrame with the statistics
378
+ data = pd.DataFrame(
379
+ {
380
+ "Model": model_names,
381
+ "Repetition Factor Adjusted F1": best_afrp,
382
+ "F1": best_f1,
383
+ }
384
+ )
385
+
386
+ # Melt the DataFrame to a long format
387
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
388
+
389
+ # Pivot the DataFrame to a wide format
390
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
391
+
392
+ # make sure the columns are following the order of the models
393
+ data_pivoted = data_pivoted[model_names]
394
+
395
+ # keep the metric rows in order: adjusted F1 first, then F1
396
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
397
+
398
+ # Plot the statistics
399
+ plt.figure(figsize=(10, 6))
400
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
401
+ plt.title(title)
402
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
403
+
404
+ # Set the rotation of the x-axis labels to 0 degrees
405
+ plt.xticks(rotation=0)
406
+
407
+ # Format the y-axis to display as percentage
408
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
409
+
410
+ # get the max value of the y-axis
411
+ a1 = max(best_afrp)
412
+ a2 = max(best_f1)
413
+
414
+ max_value = max([a1, a2]) * 1.12
415
+ print("max_value:", max_value)
416
+
417
+ # Set the y-axis limit with some headroom above the tallest bar
418
+ ax.set_ylim(0, max_value)
419
+
420
+ # Add the values above each bar
421
+ for p in ax.patches:
422
+ ax.annotate(
423
+ f"{p.get_height() * 100:.1f}",
424
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
425
+ ha="center",
426
+ va="bottom",
427
+ xytext=(0, 10),
428
+ textcoords="offset points",
429
+ rotation=90,
430
+ )
431
+
432
+ plt.show()
433
+
434
+
435
+ def plot_best_performance(
436
+ result,
437
+ models=None,
438
+ title="Models with Best F1 Score",
439
+ adjusted_f1=False,
440
+ ref_result=None,
441
+ ):
442
+ # Initialize lists to store the statistics
443
+ model_names = []
444
+ best_precision = []
445
+ best_recall = []
446
+ best_f1 = []
447
+ best_repetition_penalty = []
448
+
449
+ if models is None:
450
+ models = result.keys()
451
+ for model in models:
452
+ print(f"model: {model}")
453
+ df = result[model]["df_overall"]
454
+
455
+ # Calculate the statistics
456
+ precision = [
457
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
458
+ ]
459
+ recall = [
460
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
461
+ ]
462
+
463
+ if adjusted_f1:
464
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
465
+ result[model], precision, recall
466
+ )
467
+
468
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
469
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
470
+
471
+ best_f1.append(max(f1))
472
+ best_f1_index = f1.index(best_f1[-1])
473
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
474
+
475
+ best_precision.append(precision[best_f1_index])
476
+ best_recall.append(recall[best_f1_index])
477
+
478
+ print(
479
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
480
+ )
481
+
482
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
483
+
484
+ model_names.append(
485
+ f"{model} (RP={best_repetition_penalty[-1]})"
486
+ ) # Add the model name to the list
487
+
488
+ # print sum for columns: newline_score, repetition_score
489
+ print(
490
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
491
+ )
492
+
493
+ if ref_result is not None:
494
+ print("ref_result:", ref_result)
495
+ for model in ref_result.keys():
496
+ model_names.append(model)
497
+ df = pd.read_csv(ref_result[model])
498
+ # df = df[df["id"].isin(wikidata_df["id"])]
499
+
500
+ best_precision.append(df["precision"].mean())
501
+ best_recall.append(df["recall"].mean())
502
+ f1 = (
503
+ 2
504
+ * (best_precision[-1] * best_recall[-1])
505
+ / (best_precision[-1] + best_recall[-1])
506
+ )
507
+ # best_f1.append(df["f1"].mean())
508
+ best_f1.append(f1)
509
+
510
+ # Create a DataFrame with the statistics
511
+ data = (
512
+ pd.DataFrame(
513
+ {
514
+ "Model": model_names,
515
+ "Adjusted Precision with RP": best_precision,
516
+ "Adjusted Recall with RP": best_recall,
517
+ "Adjusted F1 with RP": best_f1,
518
+ }
519
+ )
520
+ if adjusted_f1
521
+ else pd.DataFrame(
522
+ {
523
+ "Model": model_names,
524
+ "Precision": best_precision,
525
+ "Recall": best_recall,
526
+ "F1": best_f1,
527
+ }
528
+ )
529
+ )
530
+ columns = list(data.columns)
531
+
532
+ # Melt the DataFrame to a long format
533
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
534
+
535
+ # Pivot the DataFrame to a wide format
536
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
537
+
538
+ # make sure the columns are following the order of the models
539
+ data_pivoted = data_pivoted[model_names]
540
+
541
+ # make sure three groups in the order of precision, recall, f1
542
+ data_pivoted = data_pivoted.reindex(columns[1:])
543
+
544
+ # Plot the statistics
545
+ plt.figure(figsize=(10, 6))
546
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
547
+ plt.title(title)
548
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
549
+
550
+ # Set the rotation of the x-axis labels to 0 degrees
551
+ plt.xticks(rotation=0)
552
+
553
+ # Format the y-axis to display as percentage
554
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
555
+
556
+ # get the max value of the y-axis
557
+ a1 = max(best_precision)
558
+ a2 = max(best_recall)
559
+ a3 = max(best_f1)
560
+
561
+ max_value = max([a1, a2, a3]) * 1.12
562
+ print("max_value:", max_value)
563
+
564
+ # Set the y-axis limit with some headroom above the tallest bar
565
+ ax.set_ylim(0, max_value)
566
+
567
+ # Add the values above each bar
568
+ for p in ax.patches:
569
+ ax.annotate(
570
+ f"{p.get_height() * 100:.1f}",
571
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
572
+ ha="center",
573
+ va="bottom",
574
+ xytext=(0, 10),
575
+ textcoords="offset points",
576
+ rotation=90,
577
+ )
578
+
579
+ plt.show()
580
+
581
+
582
+ def plot_best_performance_ms_macro(
583
+ result,
584
+ models=None,
585
+ title="Models with Best Repetition Factor Adjusted Performance",
586
+ ref_result=None,
587
+ skip_generic_prompt=False,
588
+ include_adjusted_performance=True,
589
+ ):
590
+ # Initialize lists to store the statistics
591
+ model_names = []
592
+ best_f1 = []
593
+ best_afrp = []
594
+ best_repetition_penalty = []
595
+ best_bleu1 = []
596
+ best_rougeL = []
597
+
598
+ if models is None:
599
+ models = result.keys()
600
+ for model in models:
601
+ if skip_generic_prompt and "generic prompt" in model:
602
+ continue
603
+ print(f"model: {model}")
604
+ df = result[model]["df_overall"]
605
+
606
+ # Calculate the statistics
607
+ bleu1 = [x for x in df["bleu1"]]
608
+ rougeL = [x for x in df["rougeL"]]
609
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
610
+
611
+ newline_score = [
612
+ df["newline_score"].mean()
613
+ for df in result[model]["df_list_repetition_penalty"]
614
+ ]
615
+ print(f"newline_score: {newline_score}")
616
+
617
+ repetition_score = [
618
+ df["repetition_score"].mean()
619
+ for df in result[model]["df_list_repetition_penalty"]
620
+ ]
621
+ print(f"repetition_score: {repetition_score}")
622
+
623
+ afrp = [
624
+ f / math.log10(10 + n + r)
625
+ for f, n, r in zip(f1, newline_score, repetition_score)
626
+ ]
627
+
628
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
629
+ best_afrp_index = (
630
+ afrp.index(best_afrp[-1])
631
+ if include_adjusted_performance
632
+ else f1.index(best_afrp[-1])
633
+ )
634
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
635
+
636
+ best_f1.append(f1[best_afrp_index])
637
+ best_bleu1.append(bleu1[best_afrp_index])
638
+ best_rougeL.append(rougeL[best_afrp_index])
639
+
640
+ print(
641
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
642
+ )
643
+
644
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
645
+
646
+ model_names.append(
647
+ f"{model} (RP={best_repetition_penalty[-1]})"
648
+ ) # Add the model name to the list
649
+
650
+ if ref_result is not None:
651
+ print("ref_result:", ref_result)
652
+ for model in ref_result.keys():
653
+ model_names.append(model)
654
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
655
+ # df = df[df["id"].isin(wikidata_df["id"])]
656
+
657
+ p = df["bleu1"][0]
658
+ best_bleu1.append(p)
659
+
660
+ r = df["rougeL"][0]
661
+ best_rougeL.append(r)
662
+
663
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
664
+ best_f1.append(f1)
665
+ best_afrp.append(f1)
666
+
667
+ print("model_names:", model_names)
668
+ print("best_f1:", best_f1)
669
+ print("best_afrp:", best_afrp)
670
+
671
+ # Create a DataFrame with the statistics
672
+ data = (
673
+ pd.DataFrame(
674
+ {
675
+ "Model": model_names,
676
+ "Repetition Factor Adjusted Perf Score": best_afrp,
677
+ "Overall Perf Score": best_f1,
678
+ }
679
+ )
680
+ if include_adjusted_performance
681
+ else pd.DataFrame(
682
+ {
683
+ "Model": model_names,
684
+ "Bleu-1": best_bleu1,
685
+ "Rouge-L": best_rougeL,
686
+ "Overall Perf Score": best_f1,
687
+ }
688
+ )
689
+ )
690
+
691
+ # Melt the DataFrame to a long format
692
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
693
+
694
+ # Pivot the DataFrame to a wide format
695
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
696
+
697
+ # make sure the columns are following the order of the models
698
+ data_pivoted = data_pivoted[model_names]
699
+
700
+ columns = list(data.columns)
701
+ data_pivoted = data_pivoted.reindex(columns[1:])
702
+
703
+ # Plot the statistics
704
+ plt.figure(figsize=(10, 6))
705
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
706
+ plt.title(title)
707
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
708
+
709
+ # Set the rotation of the x-axis labels to 0 degrees
710
+ plt.xticks(rotation=0)
711
+
712
+ # Format the y-axis to display as percentage
713
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
714
+
715
+ # get the max value of the y-axis
716
+ a1 = max(best_afrp)
717
+ a2 = max(best_f1)
718
+ a3 = max(best_bleu1)
719
+ a4 = max(best_rougeL)
720
+
721
+ max_value = (
722
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
723
+ )
724
+ print("max_value:", max_value)
725
+
726
+ # Set the y-axis limit with some headroom above the tallest bar
727
+ ax.set_ylim(0, max_value)
728
+
729
+ # Add the values above each bar
730
+ for p in ax.patches:
731
+ ax.annotate(
732
+ f"{p.get_height() * 100:.1f}",
733
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
734
+ ha="center",
735
+ va="bottom",
736
+ xytext=(0, 10),
737
+ textcoords="offset points",
738
+ rotation=90,
739
+ )
740
+
741
+ plt.show()
742
+
743
+
744
+ non_rag_csv_result_files = [
745
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
746
+ # "./data/results/Tune_2024-04-12_17-14-28.csv", # Orca-2-7b
747
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
748
+ # "./data/results/Tune_2024-04-15_12-43-48.csv", # Llama-2-7b-chat-hf(cwq)
749
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
750
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
751
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
752
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
753
+ # "./data/results/Tune_2024-04-17_04-23-15.csv", # gemma-1.1-2b-it(cwq)
754
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-2b-it
755
+ # "./data/results/Tune_2024-04-18_21-56-52.csv", # gemma-1.1-7b-it
756
+ # "./data/results/Tune_2024-04-19_08-14-49.csv", # gemma-1.1-7b-it(cwq)
757
+ # "./data/results/Tune_2024-04-17_23-52-04.csv", # Orca-2-13b
758
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
759
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
760
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
761
+ # "./data/results/llama-3-70b-instruct-awq_wd_non_rag.csv", # Llama-3-70b-instruct-awq
762
+ ]
763
+
764
+ rag_csv_result_files = [
765
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
766
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
767
+ # "./data/results/Tune_2024-03-19_19-13-36.csv", # Orca-2-7b
768
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
769
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
770
+ # "./data/results/Tune_2024-04-15_14-52-31.csv", # Llama-2-7b-chat-hf(cwq)
771
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
772
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
773
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
774
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
775
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
776
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-7b-it(true)
777
+ # "./data/results/Tune_2024-04-20_13-12-43.csv", # gemma-1.1-2b-it
778
+ # "./data/results/Tune_2024-04-16_06-48-32.csv", # gemma-1.1-2b-it(cwq)
779
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
780
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
781
+ # "./data/results/Tune_2024-04-18_13-18-38.csv", # gemma-1.1-7b-it
782
+ # "./data/results/Tune_2024-04-19_04-26-33.csv", # gemma-1.1-7b-it(cwq)
783
+ # "./data/results/Orca-2-13b_wd.csv", # Orca-2-13b
784
+ # "./data/results/Tune_2024-03-22_09-28-56.csv", # Orca-2-13b
785
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
786
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
787
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
788
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
789
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
790
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
791
+ ]
792
+
793
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
794
+
795
+
796
+ def load_for_repetition_penalty_ms_macro(
797
+ csv_result_file, repetition_penalty, force_recalculate=False
798
+ ):
799
+ result_file = replace_last(
800
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
801
+ )
802
+ df = load_with_newline_and_repetition_scores(
803
+ result_file, force_recalculate=force_recalculate
804
+ )
805
+
806
+ if df["ground_truth"][0] != df_ms_macro["wellFormedAnswers"][0]:
807
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
808
+ print("ground_truth updated for:", result_file)
809
+ df.to_csv(result_file, index=False)
810
+ return df
811
+
812
+
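For reference, the per-penalty result files read by load_for_repetition_penalty_ms_macro follow the naming convention produced by replace_last above. A tiny sketch with a hypothetical base path:

base = "./data/results/some-model_mm_true.csv"  # placeholder path
print(replace_last(base, ".csv", f"_RP_{1.1:.3f}.csv"))
# -> ./data/results/some-model_mm_true_RP_1.100.csv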
813
+ # MS MARCO
814
+ def plot_performance_scores_ms_macro(
815
+ result,
816
+ models=None,
817
+ title="Performance",
818
+ ):
819
+
820
+ if models is None:
821
+ models = result.keys()
822
+ for model in models:
823
+ print(f"model: {model}")
824
+ df = result[model]["df_overall"]
825
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
826
+
827
+ # Calculate the statistics
828
+ bleu1 = list(df["bleu1"])
829
+ rougeL = list(df["rougeL"])
830
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
831
+ best_f1 = max(f1)
832
+ best_f1_index = f1.index(best_f1)
833
+
834
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
835
+ result[model], bleu1, rougeL
836
+ )
837
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
838
+
839
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
840
+ best_afrp = max(afrp)
841
+ best_afrp_index = afrp.index(best_afrp)
842
+
843
+ repetition_penalties = list(df["repetition_penalty"])
844
+
845
+ # line plot for the overall and repetition-adjusted performance scores
846
+ plt.figure(figsize=(10, 6))
847
+
848
+ plt.axvspan(
849
+ repetition_penalties[best_f1_index] - 0.01,
850
+ repetition_penalties[best_f1_index] + 0.01,
851
+ alpha=0.5,
852
+ edgecolor="none",
853
+ facecolor="blue",
854
+ )
855
+
856
+ plt.axvspan(
857
+ repetition_penalties[best_afrp_index] - 0.01,
858
+ repetition_penalties[best_afrp_index] + 0.01,
859
+ alpha=0.5,
860
+ edgecolor="none",
861
+ facecolor="orange",
862
+ )
863
+
864
+ plt.plot(
865
+ repetition_penalties,
866
+ f1,
867
+ label="Overall Perf Score",
868
+ marker="D",
869
+ color="blue",
870
+ )
871
+ plt.plot(
872
+ repetition_penalties,
873
+ afrp,
874
+ label="RF Adjusted Perf Score",
875
+ marker="o",
876
+ color="orange",
877
+ )
878
+
879
+ plt.xlabel("Repetition Penalties")
880
+ plt.ylabel("Score")
881
+ plt.xlim(0.99, 1.31)
882
+ # y in percentage
883
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
884
+ plt.title(f"{model} {title}")
885
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
886
+
887
+ plt.show()
888
+
889
+
890
+ def plot_repetition_factors(result, groups):
891
+ for group in groups:
892
+ # Plot the statistics
893
+ plt.figure(figsize=(10, 6))
894
+
895
+ max_value = 0
896
+ for model in result.keys():
897
+ if group not in model.lower():
898
+ continue
899
+ print(f"model: {model}")
900
+ df = result[model]["df_overall"]
901
+ repetition_penalties = [
902
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
903
+ ]
904
+
905
+ mean_score = [
906
+ math.log10(10 + df["total_repetitions"].mean())
907
+ for df in result[model]["df_list_repetition_penalty"]
908
+ ]
909
+
910
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
911
+
912
+ new_max = max(mean_score)
913
+ if new_max > max_value:
914
+ max_value = new_max
915
+
916
+ max_value = max_value * 1.05
917
+ if max_value < 1.5:
918
+ max_value = 1.5
919
+ # set ylimit
920
+ plt.ylim(1, max_value)
921
+
922
+ # show grid
923
+ plt.grid(True)
924
+ plt.xlabel("Repetition Penalties")
925
+ plt.ylabel("Repetition Factors")
926
+ plt.title("Repetition Factors vs Repetition Penalties")
927
+ plt.legend()
928
+
929
+ plt.show()
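The plotting helpers in this module all expect a dict keyed by model name, holding a "df_overall" summary frame plus a "df_list_repetition_penalty" list with one scored frame per repetition penalty. A minimal, hypothetical driver for the repetition-factor plot (placeholder model name and path; it assumes the base CSV carries a repetition_penalty column and that the data files this module reads at import time are present):

csv_result_file = "./data/results/some-model_wd.csv"  # placeholder path
df_overall = pd.read_csv(csv_result_file)
dfs = [
    load_for_repetition_penalty(csv_result_file, rp)
    for rp in df_overall["repetition_penalty"]
]
result = {
    "some-model (RAG)": {
        "df_overall": df_overall,
        "df_list_repetition_penalty": dfs,
        "file": csv_result_file,
    }
}
plot_repetition_factors(result, groups=["some-model"])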
eval_modules/calc_repetitions_v2.py ADDED
@@ -0,0 +1,1087 @@
+import os
+import re
+import math
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import seaborn as sns
+import nltk
+
+# final version
+pattern_abnormal_newlines = re.compile(r"\n{5,}")
+pattern_text_repetitions = re.compile(r"(.{5,}?)(\1+)", re.M | re.DOTALL)
+exception_patterns = [re.compile(r"(\w+\.?)\1")]
+
+
+# final version for repetition detection
+def detect_repetitions(
+    text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+):
+    subtotals = [0, 0]
+
+    if isinstance(text, str):
+        patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+        for i, pattern in enumerate(patterns):
+            if debug:
+                print(
+                    f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                )
+            matches = pattern.finditer(text)
+            for match in matches:
+                if i > 0:
+                    ignored = False
+                    for exception_pattern in exception_patterns:
+                        if exception_pattern.match(match[0]):
+                            if debug:
+                                print("ignored: ", match[0])
+                            ignored = True
+                            break
+                    if ignored:
+                        continue
+
+                if debug:
+                    print(match)
+                    for groupNum in range(0, len(match.groups())):
+                        groupNum = groupNum + 1
+                        print(
+                            "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                groupNum=groupNum,
+                                start=match.start(groupNum),
+                                end=match.end(groupNum),
+                                group=match.group(groupNum),
+                            )
+                        )
+
+                start, end = match.span()
+                subtotals[i] += end - start
+
+            if i == 0 and subtotals[i] > 0:
+                text = pattern.sub("", text)
+                if debug:
+                    print(f"removed abnormal newlines: {subtotals[i]}")
+
+    result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+    if debug:
+        print(result)
+    return result
+
+
+def detect_abnormal_newlines(text, debug=False):
+    return detect_repetitions(text, debug=debug)[0]
+
+
+def detect_text_repetitions(text, debug=False):
+    return detect_repetitions(text, debug=debug)[1]
+
+
+def detect_scores(text, debug=False):
+    newline_score, repetition_score, total_repetitions = detect_repetitions(
+        text, debug=debug
+    )
+    return pd.Series([newline_score, repetition_score, total_repetitions])
+
+
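This is the "comparison of different repetition detection methods" named in the commit message: v1 anchors the repeated unit at a word boundary (r"\b(\w.+?)\b(\1+)") and checks a single exception pattern, while v2 only requires the unit to be at least five characters (r"(.{5,}?)(\1+)"), checks a list of exception patterns for the text-repetition pass, and strips the abnormal-newline runs it has already counted before continuing. A small side-by-side sketch, assuming both modules are importable and that the ./data files they read at import time exist:

import eval_modules.calc_repetitions_v1 as v1
import eval_modules.calc_repetitions_v2 as v2

sample = "no no no no " * 3 + "\n" * 7 + "same sentence repeated. same sentence repeated."
print("v1:", v1.detect_repetitions(sample))  # (newline_score, repetition_score, total)
print("v2:", v2.detect_repetitions(sample))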
86
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
87
+ print(f"loading result file: {result_file}")
88
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
89
+
90
+ if (
91
+ force_recalculate
92
+ or "newline_score" not in df.columns
93
+ or "repetition_score" not in df.columns
94
+ or "total_repetitions" not in df.columns
95
+ ):
96
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
97
+ "answer"
98
+ ].apply(detect_scores)
99
+ df.to_csv(result_file, index=False)
100
+
101
+ return df
102
+
103
+
104
+ def replace_last(source_string, old_string, new_string):
105
+ head, _sep, tail = source_string.rpartition(old_string)
106
+ return head + new_string + tail
107
+
108
+
109
+ def load_for_repetition_penalty(
110
+ csv_result_file, repetition_penalty, force_recalculate=False
111
+ ):
112
+ result_file = replace_last(
113
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
114
+ )
115
+ return load_with_newline_and_repetition_scores(
116
+ result_file, force_recalculate=force_recalculate
117
+ )
118
+
119
+
120
+ def calc_adjusted_performance(f, r):
121
+ return f / math.log10(10 + r)
122
+
123
+
124
+ def calculate_adjusted_performance(row):
125
+ r = row["total_repetitions"]
126
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
127
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
128
+ return pd.Series([adjusted_precision, adjusted_recall])
129
+
130
+
131
+ def load_performance_df(csv_result_file, repetition_penalty):
132
+ result_file = replace_last(
133
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
134
+ )
135
+ result_file = result_file.replace("/results/", "/eval/")
136
+ print(f"loading json file: {result_file}")
137
+ df = pd.read_json(result_file)
138
+
139
+ return df
140
+
141
+
142
+ def calculate_performance_score_v1(
143
+ csv_result_file, repetition_penalty, force_recalculate=False
144
+ ):
145
+ result_file = replace_last(
146
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
147
+ )
148
+ print(f"loading result file: {result_file}")
149
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
150
+
151
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
152
+ df.drop(
153
+ columns=[
154
+ "precision",
155
+ "recall",
156
+ "f1",
157
+ "f2",
158
+ "entities_in_answer",
159
+ "entities_in_question",
160
+ ],
161
+ errors="ignore",
162
+ inplace=True,
163
+ )
164
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
165
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
166
+ perf_df = filtered_df.reset_index(drop=True)
167
+ print(f"perf_df len: {len(perf_df)}")
168
+ # print(perf_df.head())
169
+
170
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
171
+
172
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
173
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
174
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
175
+
176
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
177
+ calculate_adjusted_performance, axis=1
178
+ )
179
+
180
+ df.to_csv(result_file, index=False)
181
+ print(f"performance scores saved to result file: {result_file}")
182
+
183
+ print(f"df len: {len(df)}")
184
+
185
+ return df
186
+
187
+
188
+ ref_df = pd.read_csv(
189
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
190
+ )
191
+
192
+
193
+ def calculate_performance_score(
194
+ csv_result_file, repetition_penalty, force_recalculate=False
195
+ ):
196
+ result_file = replace_last(
197
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
198
+ )
199
+
200
+ re_creating = False
201
+ if os.path.exists(result_file):
202
+ print(f"loading result file: {result_file}")
203
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
204
+ else:
205
+ print(f"re-creating result file: {result_file}")
206
+ df = pd.DataFrame()
207
+ force_recalculate = True
208
+
209
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
210
+ df.drop(
211
+ columns=[
212
+ "precision",
213
+ "recall",
214
+ "f1",
215
+ "f2",
216
+ "entities_in_answer",
217
+ "entities_in_question",
218
+ "word_count",
219
+ ],
220
+ errors="ignore",
221
+ inplace=True,
222
+ )
223
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
224
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
225
+ perf_df = filtered_df.reset_index(drop=True)
226
+ print(f"perf_df len: {len(perf_df)}")
227
+
228
+ if len(perf_df) != len(ref_df):
229
+ print(f"error: len(perf_df) != {len(ref_df)}")
230
+ missing_ids = [
231
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
232
+ ]
233
+ print(f"missing_ids: {missing_ids}")
234
+
235
+ # print(perf_df.head())
236
+
237
+ df["id"] = perf_df["id"]
238
+ df["question"] = perf_df["question"]
239
+ df["answer"] = perf_df["pred_answer"]
240
+ df["word_count"] = df["answer"].apply(
241
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
242
+ )
243
+ df["ground_truth"] = perf_df["ground_truth"]
244
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
245
+ "answer"
246
+ ].apply(detect_scores)
247
+
248
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
249
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
250
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
251
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
252
+
253
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
254
+ calculate_adjusted_performance, axis=1
255
+ )
256
+
257
+ df.to_csv(result_file, index=False)
258
+ print(f"performance scores saved to result file: {result_file}")
259
+
260
+ print(f"df len: {len(df)}")
261
+
262
+ return df
263
+
264
+
265
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
266
+ newline_score = [
267
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
268
+ ]
269
+ print(f"newline_score: {newline_score}")
270
+
271
+ repetition_score = [
272
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
273
+ ]
274
+ print(f"repetition_score: {repetition_score}")
275
+
276
+ precision = [
277
+ f / math.log10(10 + n + r)
278
+ for f, n, r in zip(precision, newline_score, repetition_score)
279
+ ]
280
+ recall = [
281
+ f / math.log10(10 + n + r)
282
+ for f, n, r in zip(recall, newline_score, repetition_score)
283
+ ]
284
+
285
+ return precision, recall
286
+
287
+
288
+ def plot_performance_scores(
289
+ result,
290
+ models=None,
291
+ title="Performance",
292
+ ):
293
+
294
+ if models is None:
295
+ models = result.keys()
296
+ for model in models:
297
+ print(f"model: {model}")
298
+ df = result[model]["df_overall"]
299
+
300
+ # Calculate the statistics
301
+ precision = [
302
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
303
+ ]
304
+ recall = [
305
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
306
+ ]
307
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
308
+ best_f1 = max(f1)
309
+ best_f1_index = f1.index(best_f1)
310
+
311
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
312
+ result[model], precision, recall
313
+ )
314
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
315
+
316
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
317
+ best_afrp = max(afrp)
318
+ best_afrp_index = afrp.index(best_afrp)
319
+
320
+ adjusted_precision = [
321
+ df["adjusted_precision"].mean()
322
+ for df in result[model]["df_list_repetition_penalty"]
323
+ ]
324
+ adjusted_recall = [
325
+ df["adjusted_recall"].mean()
326
+ for df in result[model]["df_list_repetition_penalty"]
327
+ ]
328
+ afrp2 = [
329
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
330
+ ]
331
+ best_afrp2 = max(afrp2)
332
+ best_afrp2_index = afrp2.index(best_afrp2)
333
+
334
+ repetition_penalties = list(df["repetition_penalty"])
335
+
336
+ # line plot for F1 and the repetition-adjusted F1
337
+ plt.figure(figsize=(10, 6))
338
+
339
+ plt.axvspan(
340
+ repetition_penalties[best_f1_index] - 0.01,
341
+ repetition_penalties[best_f1_index] + 0.01,
342
+ alpha=0.5,
343
+ edgecolor="none",
344
+ facecolor="blue",
345
+ )
346
+
347
+ # plt.axvspan(
348
+ # repetition_penalties[best_afrp2_index] - 0.01,
349
+ # repetition_penalties[best_afrp2_index] + 0.01,
350
+ # alpha=0.5,
351
+ # edgecolor="none",
352
+ # facecolor="green",
353
+ # )
354
+
355
+ plt.axvspan(
356
+ repetition_penalties[best_afrp_index] - 0.01,
357
+ repetition_penalties[best_afrp_index] + 0.01,
358
+ alpha=0.5,
359
+ edgecolor="none",
360
+ facecolor="orange",
361
+ )
362
+
363
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
364
+ # plt.plot(
365
+ # repetition_penalties,
366
+ # afrp2,
367
+ # label="Per-question RF Adjusted F1",
368
+ # marker="s",
369
+ # color="green",
370
+ # )
371
+ plt.plot(
372
+ repetition_penalties,
373
+ afrp,
374
+ label="RF Adjusted F1",
375
+ marker="o",
376
+ color="orange",
377
+ )
378
+ plt.xlabel("Repetition Penalties")
379
+ plt.ylabel("Score")
380
+ plt.xlim(0.99, 1.31)
381
+ # y in percentage
382
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
383
+ plt.title(f"{model} {title}")
384
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
385
+
386
+ plt.show()
387
+
388
+
389
+ def plot_best_afrp(
390
+ result,
391
+ models=None,
392
+ title="Models with Best Repetition Factor Adjusted F1",
393
+ ref_result=None,
394
+ ):
395
+ # Initialize lists to store the statistics
396
+ model_names = []
397
+ best_f1 = []
398
+ best_afrp = []
399
+ best_repetition_penalty = []
400
+
401
+ if models is None:
402
+ models = result.keys()
403
+ for model in models:
404
+ print(f"model: {model}")
405
+ df = result[model]["df_overall"]
406
+
407
+ # Calculate the statistics
408
+ precision = [
409
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
410
+ ]
411
+ recall = [
412
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
413
+ ]
414
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
415
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
416
+
417
+ newline_score = [
418
+ df["newline_score"].mean()
419
+ for df in result[model]["df_list_repetition_penalty"]
420
+ ]
421
+ print(f"newline_score: {newline_score}")
422
+
423
+ repetition_score = [
424
+ df["repetition_score"].mean()
425
+ for df in result[model]["df_list_repetition_penalty"]
426
+ ]
427
+ print(f"repetition_score: {repetition_score}")
428
+
429
+ afrp = [
430
+ f / math.log10(10 + n + r)
431
+ for f, n, r in zip(f1, newline_score, repetition_score)
432
+ ]
433
+
434
+ best_afrp.append(max(afrp))
435
+ best_afrp_index = afrp.index(best_afrp[-1])
436
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
437
+
438
+ best_f1.append(f1[best_afrp_index])
439
+
440
+ print(
441
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
442
+ )
443
+
444
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
445
+
446
+ model_names.append(
447
+ f"{model} (RP={best_repetition_penalty[-1]})"
448
+ ) # Add the model name to the list
449
+
450
+ if ref_result is not None:
451
+ print("ref_result:", ref_result)
452
+ for model in ref_result.keys():
453
+ model_names.append(model)
454
+ df = pd.read_csv(ref_result[model])
455
+ # df = df[df["id"].isin(wikidata_df["id"])]
456
+
457
+ p = df["precision"].mean()
458
+ r = df["recall"].mean()
459
+
460
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
461
+ best_f1.append(f1)
462
+ best_afrp.append(f1)
463
+
464
+ print("model_names:", model_names)
465
+ print("best_f1:", best_f1)
466
+ print("best_afrp:", best_afrp)
467
+
468
+ # Create a DataFrame with the statistics
469
+ data = pd.DataFrame(
470
+ {
471
+ "Model": model_names,
472
+ "Repetition Factor Adjusted F1": best_afrp,
473
+ "F1": best_f1,
474
+ }
475
+ )
476
+
477
+ # Melt the DataFrame to a long format
478
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
479
+
480
+ # Pivot the DataFrame to a wide format
481
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
482
+
483
+ # make sure the columns are following the order of the models
484
+ data_pivoted = data_pivoted[model_names]
485
+
486
+ # keep the metric rows in order: adjusted F1 first, then F1
487
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
488
+
489
+ # Plot the statistics
490
+ plt.figure(figsize=(15, 6))
491
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
492
+ plt.title(title)
493
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
494
+
495
+ # Set the rotation of the x-axis labels to 0 degrees
496
+ plt.xticks(rotation=0)
497
+
498
+ # Format the y-axis to display as percentage
499
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
500
+
501
+ # get the max value of the y-axis
502
+ a1 = max(best_afrp)
503
+ a2 = max(best_f1)
504
+
505
+ max_value = max([a1, a2]) * 1.12
506
+ print("max_value:", max_value)
507
+
508
+ # Set the y-axis limit with some headroom above the tallest bar
509
+ ax.set_ylim(0, max_value)
510
+
511
+ # Add the values above each bar
512
+ for p in ax.patches:
513
+ ax.annotate(
514
+ f"{p.get_height() * 100:.1f}",
515
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
516
+ ha="center",
517
+ va="bottom",
518
+ xytext=(0, 10),
519
+ textcoords="offset points",
520
+ rotation=90,
521
+ )
522
+
523
+ plt.show()
524
+
525
+
526
+ def plot_best_performance(
527
+ result,
528
+ models=None,
529
+ title="Models with Best F1 Score",
530
+ adjusted_f1=False,
531
+ ref_result=None,
532
+ ):
533
+ # Initialize lists to store the statistics
534
+ model_names = []
535
+ best_precision = []
536
+ best_recall = []
537
+ best_f1 = []
538
+ best_repetition_penalty = []
539
+
540
+ if models is None:
541
+ models = result.keys()
542
+ for model in models:
543
+ print(f"model: {model}")
544
+ df = result[model]["df_overall"]
545
+
546
+ # Calculate the statistics
547
+ precision = [
548
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+ recall = [
551
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
552
+ ]
553
+
554
+ if adjusted_f1:
555
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
556
+ result[model], precision, recall
557
+ )
558
+
559
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
560
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
561
+
562
+ best_f1.append(max(f1))
563
+ best_f1_index = f1.index(best_f1[-1])
564
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
565
+
566
+ best_precision.append(precision[best_f1_index])
567
+ best_recall.append(recall[best_f1_index])
568
+
569
+ print(
570
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
571
+ )
572
+
573
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
574
+
575
+ model_names.append(
576
+ f"{model} (RP={best_repetition_penalty[-1]})"
577
+ ) # Add the model name to the list
578
+
579
+ # print sum for columns: newline_score, repetition_score
580
+ print(
581
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
582
+ )
583
+
584
+ if ref_result is not None:
585
+ print("ref_result:", ref_result)
586
+ for model in ref_result.keys():
587
+ model_names.append(model)
588
+ df = pd.read_csv(ref_result[model])
589
+ # df = df[df["id"].isin(wikidata_df["id"])]
590
+
591
+ best_precision.append(df["precision"].mean())
592
+ best_recall.append(df["recall"].mean())
593
+ f1 = (
594
+ 2
595
+ * (best_precision[-1] * best_recall[-1])
596
+ / (best_precision[-1] + best_recall[-1])
597
+ )
598
+ # best_f1.append(df["f1"].mean())
599
+ best_f1.append(f1)
600
+
601
+ # Create a DataFrame with the statistics
602
+ data = (
603
+ pd.DataFrame(
604
+ {
605
+ "Model": model_names,
606
+ "Adjusted Precision with RP": best_precision,
607
+ "Adjusted Recall with RP": best_recall,
608
+ "Adjusted F1 with RP": best_f1,
609
+ }
610
+ )
611
+ if adjusted_f1
612
+ else pd.DataFrame(
613
+ {
614
+ "Model": model_names,
615
+ "Precision": best_precision,
616
+ "Recall": best_recall,
617
+ "F1": best_f1,
618
+ }
619
+ )
620
+ )
621
+ columns = list(data.columns)
622
+
623
+ # Melt the DataFrame to a long format
624
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
625
+
626
+ # Pivot the DataFrame to a wide format
627
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
628
+
629
+ # make sure the columns are following the order of the models
630
+ data_pivoted = data_pivoted[model_names]
631
+
632
+ # make sure three groups in the order of precision, recall, f1
633
+ data_pivoted = data_pivoted.reindex(columns[1:])
634
+
635
+ # Plot the statistics
636
+ plt.figure(figsize=(10, 6))
637
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
638
+ plt.title(title)
639
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
640
+
641
+ # Set the rotation of the x-axis labels to 0 degrees
642
+ plt.xticks(rotation=0)
643
+
644
+ # Format the y-axis to display as percentage
645
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
646
+
647
+ # get the max value of the y-axis
648
+ a1 = max(best_precision)
649
+ a2 = max(best_recall)
650
+ a3 = max(best_f1)
651
+
652
+ max_value = max([a1, a2, a3]) * 1.12
653
+ print("max_value:", max_value)
654
+
655
+ # Set the y-axis upper limit slightly above the tallest bar
656
+ ax.set_ylim(0, max_value)
657
+
658
+ # Add the values above each bar
659
+ for p in ax.patches:
660
+ ax.annotate(
661
+ f"{p.get_height() * 100:.1f}",
662
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
663
+ ha="center",
664
+ va="bottom",
665
+ xytext=(0, 10),
666
+ textcoords="offset points",
667
+ rotation=90,
668
+ )
669
+
670
+ plt.show()
671
+
672
+
673
+ def plot_best_performance_ms_macro(
674
+ result,
675
+ models=None,
676
+ title="Models with Best Repetition Factor Adjusted Performance",
677
+ ref_result=None,
678
+ skip_generic_prompt=False,
679
+ include_adjusted_performance=True,
680
+ ):
681
+ # Initialize lists to store the statistics
682
+ model_names = []
683
+ best_f1 = []
684
+ best_afrp = []
685
+ best_repetition_penalty = []
686
+ best_bleu1 = []
687
+ best_rougeL = []
688
+
689
+ if models is None:
690
+ models = result.keys()
691
+ for model in models:
692
+ if skip_generic_prompt and "generic prompt" in model:
693
+ continue
694
+ print(f"model: {model}")
695
+ df = result[model]["df_overall"]
696
+
697
+ # Calculate the statistics
698
+ bleu1 = [x for x in df["bleu1"]]
699
+ rougeL = [x for x in df["rougeL"]]
700
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
701
+
702
+ newline_score = [
703
+ df["newline_score"].mean()
704
+ for df in result[model]["df_list_repetition_penalty"]
705
+ ]
706
+ print(f"newline_score: {newline_score}")
707
+
708
+ repetition_score = [
709
+ df["repetition_score"].mean()
710
+ for df in result[model]["df_list_repetition_penalty"]
711
+ ]
712
+ print(f"repetition_score: {repetition_score}")
713
+
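+ # Repetition-factor adjustment: divide the combined score by log10(10 + mean
+ # newline score + mean repetition score), so answers with no detected
+ # repetition keep their full score and repetitive output is damped.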
714
+ afrp = [
715
+ f / math.log10(10 + n + r)
716
+ for f, n, r in zip(f1, newline_score, repetition_score)
717
+ ]
718
+
719
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
720
+ best_afrp_index = (
721
+ afrp.index(best_afrp[-1])
722
+ if include_adjusted_performance
723
+ else f1.index(best_afrp[-1])
724
+ )
725
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
726
+
727
+ best_f1.append(f1[best_afrp_index])
728
+ best_bleu1.append(bleu1[best_afrp_index])
729
+ best_rougeL.append(rougeL[best_afrp_index])
730
+
731
+ print(
732
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
733
+ )
734
+
735
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
736
+
737
+ model_names.append(
738
+ f"{model} (RP={best_repetition_penalty[-1]})"
739
+ ) # Add the model name to the list
740
+
741
+ if ref_result is not None:
742
+ print("ref_result:", ref_result)
743
+ for model in ref_result.keys():
744
+ model_names.append(model)
745
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
746
+ # df = df[df["id"].isin(wikidata_df["id"])]
747
+
748
+ p = df["bleu1"][0]
749
+ best_bleu1.append(p)
750
+
751
+ r = df["rougeL"][0]
752
+ best_rougeL.append(r)
753
+
754
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
755
+ best_f1.append(f1)
756
+ best_afrp.append(f1)
757
+
758
+ print("model_names:", model_names)
759
+ print("best_f1:", best_f1)
760
+ print("best_afrp:", best_afrp)
761
+
762
+ # Create a DataFrame with the statistics
763
+ data = (
764
+ pd.DataFrame(
765
+ {
766
+ "Model": model_names,
767
+ "Repetition Factor Adjusted Perf Score": best_afrp,
768
+ "Overall Perf Score": best_f1,
769
+ }
770
+ )
771
+ if include_adjusted_performance
772
+ else pd.DataFrame(
773
+ {
774
+ "Model": model_names,
775
+ "Bleu-1": best_bleu1,
776
+ "Rouge-L": best_rougeL,
777
+ "Overall Perf Score": best_f1,
778
+ }
779
+ )
780
+ )
781
+
782
+ # Melt the DataFrame to a long format
783
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
784
+
785
+ # Pivot the DataFrame to a wide format
786
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
787
+
788
+ # make sure the columns are following the order of the models
789
+ data_pivoted = data_pivoted[model_names]
790
+
791
+ columns = list(data.columns)
792
+ data_pivoted = data_pivoted.reindex(columns[1:])
793
+
794
+ # Plot the statistics
795
+ plt.figure(figsize=(10, 6))
796
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
797
+ plt.title(title)
798
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
799
+
800
+ # Set the rotation of the x-axis labels to 0 degrees
801
+ plt.xticks(rotation=0)
802
+
803
+ # Format the y-axis to display as percentage
804
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
805
+
806
+ # get the max value of the y-axis
807
+ a1 = max(best_afrp)
808
+ a2 = max(best_f1)
809
+ a3 = max(best_bleu1)
810
+ a4 = max(best_rougeL)
811
+
812
+ max_value = (
813
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
814
+ )
815
+ print("max_value:", max_value)
816
+
817
+ # Set the y-axis upper limit slightly above the tallest bar
818
+ ax.set_ylim(0, max_value)
819
+
820
+ # Add the values above each bar
821
+ for p in ax.patches:
822
+ ax.annotate(
823
+ f"{p.get_height() * 100:.1f}",
824
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
825
+ ha="center",
826
+ va="bottom",
827
+ xytext=(0, 10),
828
+ textcoords="offset points",
829
+ rotation=90,
830
+ )
831
+
832
+ plt.show()
833
+
834
+
835
+ all_open_source_models = [
836
+ "gemma-1.1-2b-it",
837
+ "Phi-3-mini-128k-instruct",
838
+ "gemma-1.1-7b-it",
839
+ "Llama-2-7b-chat-hf",
840
+ "Mistral-7B-Instruct-v0.2",
841
+ "Meta-Llama-3-8B-Instruct",
842
+ "Llama-2-13b-chat-hf",
843
+ "Llama-2-70b-chat-hf",
844
+ "Meta-Llama-3-70B-Instruct",
845
+ ]
846
+
847
+
848
+ non_rag_csv_result_files = [
849
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
850
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
851
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
852
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
853
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
854
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
855
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
856
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
857
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
858
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
859
+ ]
860
+
861
+ rag_csv_result_files = [
862
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
863
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
864
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
865
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
866
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
867
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
868
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
869
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
870
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
871
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
872
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
873
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
874
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
875
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
876
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
877
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
878
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
879
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
880
+ ]
881
+
882
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
883
+
884
+
885
+ def load_for_repetition_penalty_ms_macro(
886
+ csv_result_file, repetition_penalty, force_recalculate=False
887
+ ):
888
+ result_file = replace_last(
889
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
890
+ )
891
+ df = load_with_newline_and_repetition_scores(
892
+ result_file, force_recalculate=force_recalculate
893
+ )
894
+
895
+ if len(df) != len(df_ms_macro):
896
+ print(f"error: len(df) != {len(df_ms_macro)}")
897
+ missing_ids = [
898
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
899
+ ]
900
+ print(f"missing_ids: {missing_ids}")
901
+
902
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
903
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
904
+ print("ground_truth updated for:", result_file)
905
+ df.to_csv(result_file, index=False)
906
+ return df
907
+
908
+
909
+ # MS MARCO
910
+ def plot_performance_scores_ms_macro(
911
+ result,
912
+ models=None,
913
+ title="Performance",
914
+ ):
915
+
916
+ if models is None:
917
+ models = result.keys()
918
+ for model in models:
919
+ print(f"model: {model}")
920
+ df = result[model]["df_overall"]
921
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
922
+
923
+ # Calculate the statistics
924
+ bleu1 = list(df["bleu1"])
925
+ rougeL = list(df["rougeL"])
926
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
927
+ best_f1 = max(f1)
928
+ best_f1_index = f1.index(best_f1)
929
+
930
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
931
+ result[model], bleu1, rougeL
932
+ )
933
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
934
+
935
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
936
+ best_afrp = max(afrp)
937
+ best_afrp_index = afrp.index(best_afrp)
938
+
939
+ repetition_penalties = list(df["repetition_penalty"])
940
+
941
+ # line plot for precision, recall, f1
942
+ plt.figure(figsize=(10, 6))
943
+
944
+ plt.axvspan(
945
+ repetition_penalties[best_f1_index] - 0.01,
946
+ repetition_penalties[best_f1_index] + 0.01,
947
+ alpha=0.5,
948
+ edgecolor="none",
949
+ facecolor="blue",
950
+ )
951
+
952
+ plt.axvspan(
953
+ repetition_penalties[best_afrp_index] - 0.01,
954
+ repetition_penalties[best_afrp_index] + 0.01,
955
+ alpha=0.5,
956
+ edgecolor="none",
957
+ facecolor="orange",
958
+ )
959
+
960
+ plt.plot(
961
+ repetition_penalties,
962
+ f1,
963
+ label="Overall Perf Score",
964
+ marker="D",
965
+ color="blue",
966
+ )
967
+ plt.plot(
968
+ repetition_penalties,
969
+ afrp,
970
+ label="RF Adjusted Perf Score",
971
+ marker="o",
972
+ color="orange",
973
+ )
974
+
975
+ plt.xlabel("Repetition Penalties")
976
+ plt.ylabel("Score")
977
+ plt.xlim(0.99, 1.31)
978
+ # y in percentage
979
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
980
+ plt.title(f"{model} {title}")
981
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
982
+
983
+ plt.show()
984
+
985
+
986
+ def plot_repetition_factors(result, groups):
987
+ for group in groups:
988
+ # Plot the statistics
989
+ plt.figure(figsize=(10, 6))
990
+
991
+ max_value = 0
992
+ for model in result.keys():
993
+ if not group in model.lower():
994
+ continue
995
+ print(f"model: {model}")
996
+ df = result[model]["df_overall"]
997
+ repetition_penalties = [
998
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
999
+ ]
1000
+
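+ # The y-axis "repetition factor" is log10(10 + mean total_repetitions), which
+ # equals 1.0 when a model produces no repetitions and grows slowly as they increase.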
1001
+ mean_score = [
1002
+ math.log10(10 + df["total_repetitions"].mean())
1003
+ for df in result[model]["df_list_repetition_penalty"]
1004
+ ]
1005
+
1006
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1007
+
1008
+ new_max = max(mean_score)
1009
+ if new_max > max_value:
1010
+ max_value = new_max
1011
+
1012
+ max_value = max_value * 1.05
1013
+ if max_value < 1.5:
1014
+ max_value = 1.5
1015
+ # set ylimit
1016
+ plt.ylim(1, max_value)
1017
+
1018
+ # show grid
1019
+ plt.grid(True)
1020
+ plt.xlabel("Repetition Penalties")
1021
+ plt.ylabel("Repetition Factors")
1022
+ plt.title("Repetition Factors vs Repetition Penalties")
1023
+ plt.legend()
1024
+
1025
+ plt.show()
1026
+
1027
+
1028
+ def plot_repetition_factors_by_group(result, group_filter=None):
1029
+ markers = ["D", "o", "s", "x"]
1030
+ colors = ["blue", "orange", "green", "red"]
1031
+
1032
+ # Plot the statistics
1033
+ plt.figure(figsize=(10, 6))
1034
+ index = 0
1035
+ max_value = 0
1036
+
1037
+ for model in result.keys():
1038
+ if group_filter is not None and group_filter not in model:
1039
+ continue
1040
+
1041
+ print(f"model: {model}")
1042
+
1043
+ df = result[model]["df_overall"]
1044
+ repetition_penalties = [
1045
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1046
+ ]
1047
+
1048
+ # Calculate the statistics
1049
+ mean_score = [
1050
+ math.log10(10 + df["total_repetitions"].mean())
1051
+ for df in result[model]["df_list_repetition_penalty"]
1052
+ ]
1053
+ if len(mean_score) != len(repetition_penalties):
1054
+ print(
1055
+ f"model: {model} has different length of repetition penalties and mean score"
1056
+ )
1057
+ print("repetition_panelties:", len(repetition_panelties))
1058
+ print("mean_score:", len(mean_score))
1059
+ continue
1060
+
1061
+ new_max = max(mean_score)
1062
+ if new_max > max_value:
1063
+ max_value = new_max
1064
+
1065
+ sns.lineplot(
1066
+ x=repetition_penalties,
1067
+ y=mean_score,
1068
+ label=model,
1069
+ marker=markers[index],
1070
+ color=colors[index],
1071
+ )
1072
+
1073
+ index += 1
1074
+
1075
+ max_value = max_value * 1.05
1076
+ if max_value < 1.5:
1077
+ max_value = 1.5
1078
+ # set ylimit
1079
+ plt.ylim(1, max_value)
1080
+ max_value = 0
1081
+
1082
+ plt.xlabel("Repetition Penalties")
1083
+ plt.ylabel("Repetition Factors")
1084
+ plt.title("Repetition Factors vs Repetition Penalties")
1085
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1086
+
1087
+ plt.show()
eval_modules/calc_repetitions_v3.py ADDED
@@ -0,0 +1,1095 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+
11
+ # final version
12
+ pattern_abnormal_newlines = re.compile(r"\n{5,}")
13
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
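+ # pattern_abnormal_newlines flags runs of five or more consecutive newlines;
+ # pattern_text_repetitions flags any chunk of at least five characters repeated
+ # back-to-back, optionally separated by whitespace. The exception patterns below
+ # whitelist benign repeats: a single word echoed at the end of the text,
+ # "wink wink / nudge nudge"-style asides, and trailing whitespace.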
14
+ exception_patterns = [
15
+ re.compile(r"(\w+\.?)(\1)+$"),
16
+ re.compile(r"\W*(wink|nudge|Virginia)\W*((\1)\W*)+$"),
17
+ re.compile(r"\s+$"),
18
+ ]
19
+
20
+
21
+ # final version for repetition detection
22
+ def detect_repetitions(
23
+ text, debug=False, pattern_text_repetitions=pattern_text_repetitions
24
+ ):
25
+ subtotals = [0, 0]
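+ # subtotals[0] accumulates the characters matched as abnormal newlines,
+ # subtotals[1] the characters matched as text repetitions; every match adds the
+ # full length of its span.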
26
+
27
+ if isinstance(text, str):
28
+ patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
29
+ for i, pattern in enumerate(patterns):
30
+ if debug:
31
+ print(
32
+ f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
33
+ )
34
+ matches = pattern.finditer(text)
35
+ for match in matches:
36
+ if i > 0:
37
+ ignored = False
38
+ for exception_pattern in exception_patterns:
39
+ match_ex = exception_pattern.match(match[0])
40
+ if match_ex:
41
+ if debug:
42
+ print("ignored: ", match[0])
43
+ print("exception: ", match_ex)
44
+ ignored = True
45
+ break
46
+ if ignored:
47
+ continue
48
+
49
+ if debug:
50
+ print(match)
51
+ for groupNum in range(0, len(match.groups())):
52
+ groupNum = groupNum + 1
53
+ print(
54
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
55
+ groupNum=groupNum,
56
+ start=match.start(groupNum),
57
+ end=match.end(groupNum),
58
+ group=match.group(groupNum),
59
+ )
60
+ )
61
+
62
+ start, end = match.span()
63
+ subtotals[i] += end - start
64
+
65
+ if i == 0:
66
+ text = text.strip()
67
+ if subtotals[i] > 0:
68
+ text = pattern.sub("", text)
69
+ if debug:
70
+ print(f"removed abnormal newlines: {subtotals[i]}")
71
+
72
+ result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
73
+
74
+ if debug:
75
+ print(result)
76
+ return result
77
+
78
+
79
+ def detect_abnormal_newlines(text, debug=False):
80
+ return detect_repetitions(text, debug=debug)[0]
81
+
82
+
83
+ def detect_text_repetitions(text, debug=False):
84
+ return detect_repetitions(text, debug=debug)[1]
85
+
86
+
87
+ def detect_scores(text, debug=False):
88
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
89
+ text, debug=debug
90
+ )
91
+ return pd.Series([newline_score, repetition_score, total_repetitions])
92
+
93
+
94
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
95
+ print(f"loading result file: {result_file}")
96
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
97
+
98
+ if (
99
+ force_recalculate
100
+ or "newline_score" not in df.columns
101
+ or "repetition_score" not in df.columns
102
+ or "total_repetitions" not in df.columns
103
+ ):
104
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
105
+ "answer"
106
+ ].apply(detect_scores)
107
+ df.to_csv(result_file, index=False)
108
+
109
+ return df
110
+
111
+
112
+ def replace_last(source_string, old_string, new_string):
113
+ head, _sep, tail = source_string.rpartition(old_string)
114
+ return head + new_string + tail
115
+
116
+
117
+ def load_for_repetition_penalty(
118
+ csv_result_file, repetition_penalty, force_recalculate=False
119
+ ):
120
+ result_file = replace_last(
121
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
122
+ )
123
+ return load_with_newline_and_repetition_scores(
124
+ result_file, force_recalculate=force_recalculate
125
+ )
126
+
127
+
128
+ def calc_adjusted_performance(f, r):
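+ # Adjusted performance divides the raw score f by log10(10 + r), where r is the
+ # total repetition length; with r == 0 the divisor is 1 and the score is unchanged.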
129
+ return f / math.log10(10 + r)
130
+
131
+
132
+ def calculate_adjusted_performance(row):
133
+ r = row["total_repetitions"]
134
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
135
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
136
+ return pd.Series([adjusted_precision, adjusted_recall])
137
+
138
+
139
+ def load_performance_df(csv_result_file, repetition_penalty):
140
+ result_file = replace_last(
141
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
142
+ )
143
+ result_file = result_file.replace("/results/", "/eval/")
144
+ print(f"loading json file: {result_file}")
145
+ df = pd.read_json(result_file)
146
+
147
+ return df
148
+
149
+
150
+ def calculate_performance_score_v1(
151
+ csv_result_file, repetition_penalty, force_recalculate=False
152
+ ):
153
+ result_file = replace_last(
154
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
155
+ )
156
+ print(f"loading result file: {result_file}")
157
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
158
+
159
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
160
+ df.drop(
161
+ columns=[
162
+ "precision",
163
+ "recall",
164
+ "f1",
165
+ "f2",
166
+ "entities_in_answer",
167
+ "entities_in_question",
168
+ ],
169
+ errors="ignore",
170
+ inplace=True,
171
+ )
172
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
173
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
174
+ perf_df = filtered_df.reset_index(drop=True)
175
+ print(f"perf_df len: {len(perf_df)}")
176
+ # print(perf_df.head())
177
+
178
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
179
+
180
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
181
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
182
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
183
+
184
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
185
+ calculate_adjusted_performance, axis=1
186
+ )
187
+
188
+ df.to_csv(result_file, index=False)
189
+ print(f"performance scores saved to result file: {result_file}")
190
+
191
+ print(f"df len: {len(df)}")
192
+
193
+ return df
194
+
195
+
196
+ ref_df = pd.read_csv(
197
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
198
+ )
199
+
200
+
201
+ def calculate_performance_score(
202
+ csv_result_file, repetition_penalty, force_recalculate=False
203
+ ):
204
+ result_file = replace_last(
205
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
206
+ )
207
+
208
+ re_creating = False
209
+ if os.path.exists(result_file):
210
+ print(f"loading result file: {result_file}")
211
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
212
+ else:
213
+ print(f"re-creating result file: {result_file}")
214
+ df = pd.DataFrame()
215
+ force_recalculate = True
216
+
217
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
218
+ df.drop(
219
+ columns=[
220
+ "precision",
221
+ "recall",
222
+ "f1",
223
+ "f2",
224
+ "entities_in_answer",
225
+ "entities_in_question",
226
+ "word_count",
227
+ ],
228
+ errors="ignore",
229
+ inplace=True,
230
+ )
231
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
232
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
233
+ perf_df = filtered_df.reset_index(drop=True)
234
+ print(f"perf_df len: {len(perf_df)}")
235
+
236
+ if len(perf_df) != len(ref_df):
237
+ print(f"error: len(perf_df) != {len(ref_df)}")
238
+ missing_ids = [
239
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
240
+ ]
241
+ print(f"missing_ids: {missing_ids}")
242
+
243
+ # print(perf_df.head())
244
+
245
+ df["id"] = perf_df["id"]
246
+ df["question"] = perf_df["question"]
247
+ df["answer"] = perf_df["pred_answer"]
248
+ df["word_count"] = df["answer"].apply(
249
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
250
+ )
251
+ df["ground_truth"] = perf_df["ground_truth"]
252
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
253
+ "answer"
254
+ ].apply(detect_scores)
255
+
256
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
257
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
258
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
259
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
260
+
261
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
262
+ calculate_adjusted_performance, axis=1
263
+ )
264
+
265
+ df.to_csv(result_file, index=False)
266
+ print(f"performance scores saved to result file: {result_file}")
267
+
268
+ print(f"df len: {len(df)}")
269
+
270
+ return df
271
+
272
+
273
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
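+ # Scale mean precision and recall at each repetition-penalty setting by
+ # 1 / log10(10 + mean newline score + mean repetition score).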
274
+ newline_score = [
275
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
276
+ ]
277
+ print(f"newline_score: {newline_score}")
278
+
279
+ repetition_score = [
280
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
281
+ ]
282
+ print(f"repetition_score: {repetition_score}")
283
+
284
+ precision = [
285
+ f / math.log10(10 + n + r)
286
+ for f, n, r in zip(precision, newline_score, repetition_score)
287
+ ]
288
+ recall = [
289
+ f / math.log10(10 + n + r)
290
+ for f, n, r in zip(recall, newline_score, repetition_score)
291
+ ]
292
+
293
+ return precision, recall
294
+
295
+
296
+ def plot_performance_scores(
297
+ result,
298
+ models=None,
299
+ title="Performance",
300
+ ):
301
+
302
+ if models is None:
303
+ models = result.keys()
304
+ for model in models:
305
+ print(f"model: {model}")
306
+ df = result[model]["df_overall"]
307
+
308
+ # Calculate the statistics
309
+ precision = [
310
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
311
+ ]
312
+ recall = [
313
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
314
+ ]
315
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
316
+ best_f1 = max(f1)
317
+ best_f1_index = f1.index(best_f1)
318
+
319
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
320
+ result[model], precision, recall
321
+ )
322
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
323
+
324
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
325
+ best_afrp = max(afrp)
326
+ best_afrp_index = afrp.index(best_afrp)
327
+
328
+ adjusted_precision = [
329
+ df["adjusted_precision"].mean()
330
+ for df in result[model]["df_list_repetition_penalty"]
331
+ ]
332
+ adjusted_recall = [
333
+ df["adjusted_recall"].mean()
334
+ for df in result[model]["df_list_repetition_penalty"]
335
+ ]
336
+ afrp2 = [
337
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
338
+ ]
339
+ best_afrp2 = max(afrp2)
340
+ best_afrp2_index = afrp2.index(best_afrp2)
341
+
342
+ repetition_penalties = list(df["repetition_penalty"])
343
+
344
+ # line plot for precision, recall, f1
345
+ plt.figure(figsize=(10, 6))
346
+
347
+ plt.axvspan(
348
+ repetition_penalties[best_f1_index] - 0.01,
349
+ repetition_penalties[best_f1_index] + 0.01,
350
+ alpha=0.5,
351
+ edgecolor="none",
352
+ facecolor="blue",
353
+ )
354
+
355
+ # plt.axvspan(
356
+ # repetition_penalties[best_afrp2_index] - 0.01,
357
+ # repetition_penalties[best_afrp2_index] + 0.01,
358
+ # alpha=0.5,
359
+ # edgecolor="none",
360
+ # facecolor="green",
361
+ # )
362
+
363
+ plt.axvspan(
364
+ repetition_penalties[best_afrp_index] - 0.01,
365
+ repetition_penalties[best_afrp_index] + 0.01,
366
+ alpha=0.5,
367
+ edgecolor="none",
368
+ facecolor="orange",
369
+ )
370
+
371
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
372
+ # plt.plot(
373
+ # repetition_penalties,
374
+ # afrp2,
375
+ # label="Per-question RF Adjusted F1",
376
+ # marker="s",
377
+ # color="green",
378
+ # )
379
+ plt.plot(
380
+ repetition_penalties,
381
+ afrp,
382
+ label="RF Adjusted F1",
383
+ marker="o",
384
+ color="orange",
385
+ )
386
+ plt.xlabel("Repetition Penalties")
387
+ plt.ylabel("Score")
388
+ plt.xlim(0.99, 1.31)
389
+ # y in percentage
390
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
391
+ plt.title(f"{model} {title}")
392
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
393
+
394
+ plt.show()
395
+
396
+
397
+ def plot_best_afrp(
398
+ result,
399
+ models=None,
400
+ title="Models with Best Repetition Factor Adjusted F1",
401
+ ref_result=None,
402
+ ):
403
+ # Initialize lists to store the statistics
404
+ model_names = []
405
+ best_f1 = []
406
+ best_afrp = []
407
+ best_repetition_penalty = []
408
+
409
+ if models is None:
410
+ models = result.keys()
411
+ for model in models:
412
+ print(f"model: {model}")
413
+ df = result[model]["df_overall"]
414
+
415
+ # Calculate the statistics
416
+ precision = [
417
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
418
+ ]
419
+ recall = [
420
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
421
+ ]
422
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
423
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
424
+
425
+ newline_score = [
426
+ df["newline_score"].mean()
427
+ for df in result[model]["df_list_repetition_penalty"]
428
+ ]
429
+ print(f"newline_score: {newline_score}")
430
+
431
+ repetition_score = [
432
+ df["repetition_score"].mean()
433
+ for df in result[model]["df_list_repetition_penalty"]
434
+ ]
435
+ print(f"repetition_score: {repetition_score}")
436
+
437
+ afrp = [
438
+ f / math.log10(10 + n + r)
439
+ for f, n, r in zip(f1, newline_score, repetition_score)
440
+ ]
441
+
442
+ best_afrp.append(max(afrp))
443
+ best_afrp_index = afrp.index(best_afrp[-1])
444
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
445
+
446
+ best_f1.append(f1[best_afrp_index])
447
+
448
+ print(
449
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
450
+ )
451
+
452
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
453
+
454
+ model_names.append(
455
+ f"{model} (RP={best_repetition_penalty[-1]})"
456
+ ) # Add the model name to the list
457
+
458
+ if ref_result is not None:
459
+ print("ref_result:", ref_result)
460
+ for model in ref_result.keys():
461
+ model_names.append(model)
462
+ df = pd.read_csv(ref_result[model])
463
+ # df = df[df["id"].isin(wikidata_df["id"])]
464
+
465
+ p = df["precision"].mean()
466
+ r = df["recall"].mean()
467
+
468
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
469
+ best_f1.append(f1)
470
+ best_afrp.append(f1)
471
+
472
+ print("model_names:", model_names)
473
+ print("best_f1:", best_f1)
474
+ print("best_afrp:", best_afrp)
475
+
476
+ # Create a DataFrame with the statistics
477
+ data = pd.DataFrame(
478
+ {
479
+ "Model": model_names,
480
+ "Repetition Factor Adjusted F1": best_afrp,
481
+ "F1": best_f1,
482
+ }
483
+ )
484
+
485
+ # Melt the DataFrame to a long format
486
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
487
+
488
+ # Pivot the DataFrame to a wide format
489
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
490
+
491
+ # make sure the columns are following the order of the models
492
+ data_pivoted = data_pivoted[model_names]
493
+
494
+ # keep the two metrics in a fixed order: adjusted F1 first, then F1
495
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
496
+
497
+ # Plot the statistics
498
+ plt.figure(figsize=(15, 6))
499
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
500
+ plt.title(title)
501
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
502
+
503
+ # Set the rotation of the x-axis labels to 0 degrees
504
+ plt.xticks(rotation=0)
505
+
506
+ # Format the y-axis to display as percentage
507
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
508
+
509
+ # get the max value of the y-axis
510
+ a1 = max(best_afrp)
511
+ a2 = max(best_f1)
512
+
513
+ max_value = max([a1, a2]) * 1.12
514
+ print("max_value:", max_value)
515
+
516
+ # Set the y-axis upper limit slightly above the tallest bar
517
+ ax.set_ylim(0, max_value)
518
+
519
+ # Add the values above each bar
520
+ for p in ax.patches:
521
+ ax.annotate(
522
+ f"{p.get_height() * 100:.1f}",
523
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
524
+ ha="center",
525
+ va="bottom",
526
+ xytext=(0, 10),
527
+ textcoords="offset points",
528
+ rotation=90,
529
+ )
530
+
531
+ plt.show()
532
+
533
+
534
+ def plot_best_performance(
535
+ result,
536
+ models=None,
537
+ title="Models with Best F1 Score",
538
+ adjusted_f1=False,
539
+ ref_result=None,
540
+ ):
541
+ # Initialize lists to store the statistics
542
+ model_names = []
543
+ best_precision = []
544
+ best_recall = []
545
+ best_f1 = []
546
+ best_repetition_penalty = []
547
+
548
+ if models is None:
549
+ models = result.keys()
550
+ for model in models:
551
+ print(f"model: {model}")
552
+ df = result[model]["df_overall"]
553
+
554
+ # Calculate the statistics
555
+ precision = [
556
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
557
+ ]
558
+ recall = [
559
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
560
+ ]
561
+
562
+ if adjusted_f1:
563
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
564
+ result[model], precision, recall
565
+ )
566
+
567
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
568
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
569
+
570
+ best_f1.append(max(f1))
571
+ best_f1_index = f1.index(best_f1[-1])
572
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
573
+
574
+ best_precision.append(precision[best_f1_index])
575
+ best_recall.append(recall[best_f1_index])
576
+
577
+ print(
578
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
579
+ )
580
+
581
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
582
+
583
+ model_names.append(
584
+ f"{model} (RP={best_repetition_penalty[-1]})"
585
+ ) # Add the model name to the list
586
+
587
+ # print sum for columns: newline_score, repetition_score
588
+ print(
589
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
590
+ )
591
+
592
+ if ref_result is not None:
593
+ print("ref_result:", ref_result)
594
+ for model in ref_result.keys():
595
+ model_names.append(model)
596
+ df = pd.read_csv(ref_result[model])
597
+ # df = df[df["id"].isin(wikidata_df["id"])]
598
+
599
+ best_precision.append(df["precision"].mean())
600
+ best_recall.append(df["recall"].mean())
601
+ f1 = (
602
+ 2
603
+ * (best_precision[-1] * best_recall[-1])
604
+ / (best_precision[-1] + best_recall[-1])
605
+ )
606
+ # best_f1.append(df["f1"].mean())
607
+ best_f1.append(f1)
608
+
609
+ # Create a DataFrame with the statistics
610
+ data = (
611
+ pd.DataFrame(
612
+ {
613
+ "Model": model_names,
614
+ "Adjusted Precision with RP": best_precision,
615
+ "Adjusted Recall with RP": best_recall,
616
+ "Adjusted F1 with RP": best_f1,
617
+ }
618
+ )
619
+ if adjusted_f1
620
+ else pd.DataFrame(
621
+ {
622
+ "Model": model_names,
623
+ "Precision": best_precision,
624
+ "Recall": best_recall,
625
+ "F1": best_f1,
626
+ }
627
+ )
628
+ )
629
+ columns = list(data.columns)
630
+
631
+ # Melt the DataFrame to a long format
632
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
633
+
634
+ # Pivot the DataFrame to a wide format
635
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
636
+
637
+ # make sure the columns are following the order of the models
638
+ data_pivoted = data_pivoted[model_names]
639
+
640
+ # make sure three groups in the order of precision, recall, f1
641
+ data_pivoted = data_pivoted.reindex(columns[1:])
642
+
643
+ # Plot the statistics
644
+ plt.figure(figsize=(10, 6))
645
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
646
+ plt.title(title)
647
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
648
+
649
+ # Set the rotation of the x-axis labels to 0 degrees
650
+ plt.xticks(rotation=0)
651
+
652
+ # Format the y-axis to display as percentage
653
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
654
+
655
+ # get the max value of the y-axis
656
+ a1 = max(best_precision)
657
+ a2 = max(best_recall)
658
+ a3 = max(best_f1)
659
+
660
+ max_value = max([a1, a2, a3]) * 1.12
661
+ print("max_value:", max_value)
662
+
663
+ # Set the y-axis upper limit slightly above the tallest bar
664
+ ax.set_ylim(0, max_value)
665
+
666
+ # Add the values above each bar
667
+ for p in ax.patches:
668
+ ax.annotate(
669
+ f"{p.get_height() * 100:.1f}",
670
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
671
+ ha="center",
672
+ va="bottom",
673
+ xytext=(0, 10),
674
+ textcoords="offset points",
675
+ rotation=90,
676
+ )
677
+
678
+ plt.show()
679
+
680
+
681
+ def plot_best_performance_ms_macro(
682
+ result,
683
+ models=None,
684
+ title="Models with Best Repetition Factor Adjusted Performance",
685
+ ref_result=None,
686
+ skip_generic_prompt=False,
687
+ include_adjusted_performance=True,
688
+ ):
689
+ # Initialize lists to store the statistics
690
+ model_names = []
691
+ best_f1 = []
692
+ best_afrp = []
693
+ best_repetition_penalty = []
694
+ best_bleu1 = []
695
+ best_rougeL = []
696
+
697
+ if models is None:
698
+ models = result.keys()
699
+ for model in models:
700
+ if skip_generic_prompt and "generic prompt" in model:
701
+ continue
702
+ print(f"model: {model}")
703
+ df = result[model]["df_overall"]
704
+
705
+ # Calculate the statistics
706
+ bleu1 = [x for x in df["bleu1"]]
707
+ rougeL = [x for x in df["rougeL"]]
708
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
709
+
710
+ newline_score = [
711
+ df["newline_score"].mean()
712
+ for df in result[model]["df_list_repetition_penalty"]
713
+ ]
714
+ print(f"newline_score: {newline_score}")
715
+
716
+ repetition_score = [
717
+ df["repetition_score"].mean()
718
+ for df in result[model]["df_list_repetition_penalty"]
719
+ ]
720
+ print(f"repetition_score: {repetition_score}")
721
+
722
+ afrp = [
723
+ f / math.log10(10 + n + r)
724
+ for f, n, r in zip(f1, newline_score, repetition_score)
725
+ ]
726
+
727
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
728
+ best_afrp_index = (
729
+ afrp.index(best_afrp[-1])
730
+ if include_adjusted_performance
731
+ else f1.index(best_afrp[-1])
732
+ )
733
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
734
+
735
+ best_f1.append(f1[best_afrp_index])
736
+ best_bleu1.append(bleu1[best_afrp_index])
737
+ best_rougeL.append(rougeL[best_afrp_index])
738
+
739
+ print(
740
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
741
+ )
742
+
743
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
744
+
745
+ model_names.append(
746
+ f"{model} (RP={best_repetition_penalty[-1]})"
747
+ ) # Add the model name to the list
748
+
749
+ if ref_result is not None:
750
+ print("ref_result:", ref_result)
751
+ for model in ref_result.keys():
752
+ model_names.append(model)
753
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
754
+ # df = df[df["id"].isin(wikidata_df["id"])]
755
+
756
+ p = df["bleu1"][0]
757
+ best_bleu1.append(p)
758
+
759
+ r = df["rougeL"][0]
760
+ best_rougeL.append(r)
761
+
762
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
763
+ best_f1.append(f1)
764
+ best_afrp.append(f1)
765
+
766
+ print("model_names:", model_names)
767
+ print("best_f1:", best_f1)
768
+ print("best_afrp:", best_afrp)
769
+
770
+ # Create a DataFrame with the statistics
771
+ data = (
772
+ pd.DataFrame(
773
+ {
774
+ "Model": model_names,
775
+ "Repetition Factor Adjusted Perf Score": best_afrp,
776
+ "Overall Perf Score": best_f1,
777
+ }
778
+ )
779
+ if include_adjusted_performance
780
+ else pd.DataFrame(
781
+ {
782
+ "Model": model_names,
783
+ "Bleu-1": best_bleu1,
784
+ "Rouge-L": best_rougeL,
785
+ "Overall Perf Score": best_f1,
786
+ }
787
+ )
788
+ )
789
+
790
+ # Melt the DataFrame to a long format
791
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
792
+
793
+ # Pivot the DataFrame to a wide format
794
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
795
+
796
+ # make sure the columns are following the order of the models
797
+ data_pivoted = data_pivoted[model_names]
798
+
799
+ columns = list(data.columns)
800
+ data_pivoted = data_pivoted.reindex(columns[1:])
801
+
802
+ # Plot the statistics
803
+ plt.figure(figsize=(10, 6))
804
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
805
+ plt.title(title)
806
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
807
+
808
+ # Set the rotation of the x-axis labels to 0 degrees
809
+ plt.xticks(rotation=0)
810
+
811
+ # Format the y-axis to display as percentage
812
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
813
+
814
+ # get the max value of the y-axis
815
+ a1 = max(best_afrp)
816
+ a2 = max(best_f1)
817
+ a3 = max(best_bleu1)
818
+ a4 = max(best_rougeL)
819
+
820
+ max_value = (
821
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
822
+ )
823
+ print("max_value:", max_value)
824
+
825
+ # Set the y-axis upper limit slightly above the tallest bar
826
+ ax.set_ylim(0, max_value)
827
+
828
+ # Add the values above each bar
829
+ for p in ax.patches:
830
+ ax.annotate(
831
+ f"{p.get_height() * 100:.1f}",
832
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
833
+ ha="center",
834
+ va="bottom",
835
+ xytext=(0, 10),
836
+ textcoords="offset points",
837
+ rotation=90,
838
+ )
839
+
840
+ plt.show()
841
+
842
+
843
+ all_open_source_models = [
844
+ "gemma-1.1-2b-it",
845
+ "Phi-3-mini-128k-instruct",
846
+ "gemma-1.1-7b-it",
847
+ "Llama-2-7b-chat-hf",
848
+ "Mistral-7B-Instruct-v0.2",
849
+ "Meta-Llama-3-8B-Instruct",
850
+ "Llama-2-13b-chat-hf",
851
+ "Llama-2-70b-chat-hf",
852
+ "Meta-Llama-3-70B-Instruct",
853
+ ]
854
+
855
+
856
+ non_rag_csv_result_files = [
857
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
858
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
859
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
860
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
861
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
862
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
863
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
864
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
865
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
866
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
867
+ ]
868
+
869
+ rag_csv_result_files = [
870
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
871
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
872
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
873
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
874
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
875
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
876
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
877
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
878
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
879
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
880
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
881
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
882
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
883
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
884
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
885
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
886
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
887
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
888
+ ]
889
+
890
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
891
+
892
+
893
+ def load_for_repetition_penalty_ms_macro(
894
+ csv_result_file, repetition_penalty, force_recalculate=False
895
+ ):
896
+ result_file = replace_last(
897
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
898
+ )
899
+ df = load_with_newline_and_repetition_scores(
900
+ result_file, force_recalculate=force_recalculate
901
+ )
902
+
903
+ if len(df) != len(df_ms_macro):
904
+ print(f"error: len(df) != {len(df_ms_macro)}")
905
+ missing_ids = [
906
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
907
+ ]
908
+ print(f"missing_ids: {missing_ids}")
909
+
910
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
911
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
912
+ print("ground_truth updated for:", result_file)
913
+ df.to_csv(result_file, index=False)
914
+ return df
915
+
916
+
917
+ # MS MARCO
918
+ def plot_performance_scores_ms_macro(
919
+ result,
920
+ models=None,
921
+ title="Performance",
922
+ ):
923
+
924
+ if models is None:
925
+ models = result.keys()
926
+ for model in models:
927
+ print(f"model: {model}")
928
+ df = result[model]["df_overall"]
929
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
930
+
931
+ # Calculate the statistics
932
+ bleu1 = list(df["bleu1"])
933
+ rougeL = list(df["rougeL"])
934
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
935
+ best_f1 = max(f1)
936
+ best_f1_index = f1.index(best_f1)
937
+
938
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
939
+ result[model], bleu1, rougeL
940
+ )
941
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
942
+
943
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
944
+ best_afrp = max(afrp)
945
+ best_afrp_index = afrp.index(best_afrp)
946
+
947
+ repetition_penalties = list(df["repetition_penalty"])
948
+
949
+ # line plot for precision, recall, f1
950
+ plt.figure(figsize=(10, 6))
951
+
952
+ plt.axvspan(
953
+ repetition_penalties[best_f1_index] - 0.01,
954
+ repetition_penalties[best_f1_index] + 0.01,
955
+ alpha=0.5,
956
+ edgecolor="none",
957
+ facecolor="blue",
958
+ )
959
+
960
+ plt.axvspan(
961
+ repetition_penalties[best_afrp_index] - 0.01,
962
+ repetition_penalties[best_afrp_index] + 0.01,
963
+ alpha=0.5,
964
+ edgecolor="none",
965
+ facecolor="orange",
966
+ )
967
+
968
+ plt.plot(
969
+ repetition_penalties,
970
+ f1,
971
+ label="Overall Perf Score",
972
+ marker="D",
973
+ color="blue",
974
+ )
975
+ plt.plot(
976
+ repetition_penalties,
977
+ afrp,
978
+ label="RF Adjusted Perf Score",
979
+ marker="o",
980
+ color="orange",
981
+ )
982
+
983
+ plt.xlabel("Repetition Penalties")
984
+ plt.ylabel("Score")
985
+ plt.xlim(0.99, 1.31)
986
+ # y in percentage
987
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
988
+ plt.title(f"{model} {title}")
989
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
990
+
991
+ plt.show()
992
+
993
+
994
+ def plot_repetition_factors(result, groups):
995
+ for group in groups:
996
+ # Plot the statistics
997
+ plt.figure(figsize=(10, 6))
998
+
999
+ max_value = 0
1000
+ for model in result.keys():
1001
+ if not group in model.lower():
1002
+ continue
1003
+ print(f"model: {model}")
1004
+ df = result[model]["df_overall"]
1005
+ repetition_penalties = [
1006
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1007
+ ]
1008
+
1009
+ mean_score = [
1010
+ math.log10(10 + df["total_repetitions"].mean())
1011
+ for df in result[model]["df_list_repetition_penalty"]
1012
+ ]
1013
+
1014
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1015
+
1016
+ new_max = max(mean_score)
1017
+ if new_max > max_value:
1018
+ max_value = new_max
1019
+
1020
+ max_value = max_value * 1.05
1021
+ if max_value < 1.5:
1022
+ max_value = 1.5
1023
+ # set ylimit
1024
+ plt.ylim(1, max_value)
1025
+
1026
+ # show grid
1027
+ plt.grid(True)
1028
+ plt.xlabel("Repetition Penalties")
1029
+ plt.ylabel("Repetition Factors")
1030
+ plt.title("Repetition Factors vs Repetition Penalties")
1031
+ plt.legend()
1032
+
1033
+ plt.show()
1034
+
1035
+
1036
+ def plot_repetition_factors_by_group(result, group_filter=None):
1037
+ markers = ["D", "o", "s", "x"]
1038
+ colors = ["blue", "orange", "green", "red"]
1039
+
1040
+ # Plot the statistics
1041
+ plt.figure(figsize=(10, 6))
1042
+ index = 0
1043
+ max_value = 0
1044
+
1045
+ for model in result.keys():
1046
+ if group_filter is not None and group_filter not in model:
1047
+ continue
1048
+
1049
+ print(f"model: {model}")
1050
+
1051
+ df = result[model]["df_overall"]
1052
+ repetition_penalties = [
1053
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1054
+ ]
1055
+
1056
+ # Calculate the statistics
1057
+ mean_score = [
1058
+ math.log10(10 + df["total_repetitions"].mean())
1059
+ for df in result[model]["df_list_repetition_penalty"]
1060
+ ]
1061
+ if len(mean_score) != len(repetition_penalties):
1062
+ print(
1063
+ f"model: {model} has different length of repetition penalties and mean score"
1064
+ )
1065
+ print("repetition_panelties:", len(repetition_panelties))
1066
+ print("mean_score:", len(mean_score))
1067
+ continue
1068
+
1069
+ new_max = max(mean_score)
1070
+ if new_max > max_value:
1071
+ max_value = new_max
1072
+
1073
+ sns.lineplot(
1074
+ x=repetition_penalties,
1075
+ y=mean_score,
1076
+ label=model,
1077
+ marker=markers[index],
1078
+ color=colors[index],
1079
+ )
1080
+
1081
+ index += 1
1082
+
1083
+ max_value = max_value * 1.05
1084
+ if max_value < 1.5:
1085
+ max_value = 1.5
1086
+ # set ylimit
1087
+ plt.ylim(1, max_value)
1088
+ max_value = 0
1089
+
1090
+ plt.xlabel("Repetition Penalties")
1091
+ plt.ylabel("Repetition Factors")
1092
+ plt.title("Repetition Factors vs Repetition Penalties")
1093
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1094
+
1095
+ plt.show()
eval_modules/calc_repetitions_v4.py ADDED
@@ -0,0 +1,1296 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+
11
+ print(f"loading: {__file__}")
12
+
13
+ # final version
14
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
15
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
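+ # Here the first pattern flags runs of five or more consecutive whitespace
+ # characters (not only newlines), and unlike calc_repetitions_v3.py there is no
+ # exception whitelist, so every back-to-back repeat is counted.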
16
+
17
+
18
+ # final version for repetition detection
19
+ def detect_repetitions(text, debug=False):
20
+ subtotals = [0, 0]
21
+
22
+ if isinstance(text, str):
23
+ patterns = [pattern_excessive_whitespaces, pattern_text_repetitions]
24
+ for i, pattern in enumerate(patterns):
25
+ if debug:
26
+ print(
27
+ f"----detect {'excessive whitespaces' if i == 0 else 'text repetitions'}----"
28
+ )
29
+ matches = pattern.finditer(text)
30
+ for match in matches:
31
+ if debug:
32
+ print(match)
33
+ for groupNum in range(0, len(match.groups())):
34
+ groupNum = groupNum + 1
35
+ print(
36
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
37
+ groupNum=groupNum,
38
+ start=match.start(groupNum),
39
+ end=match.end(groupNum),
40
+ group=match.group(groupNum),
41
+ )
42
+ )
43
+
44
+ start, end = match.span()
45
+ subtotals[i] += end - start
46
+
47
+ if i == 0 and subtotals[i] > 0:
48
+ text = pattern.sub("", text)
49
+ if debug:
50
+ print(f"removed excessive whitespaces: {subtotals[i]}")
51
+
52
+ result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
53
+
54
+ if debug:
55
+ print(result)
56
+ return result
57
+
58
+
59
+ def detect_excessive_whitespaces(text, debug=False):
60
+ return detect_repetitions(text, debug=debug)[0]
61
+
62
+
63
+ def detect_text_repetitions(text, debug=False):
64
+ return detect_repetitions(text, debug=debug)[1]
65
+
66
+
67
+ def detect_scores(text, debug=False):
68
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
69
+ text, debug=debug
70
+ )
71
+ return pd.Series([newline_score, repetition_score, total_repetitions])
72
+
73
+
74
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
75
+ print(f"loading result file: {result_file}")
76
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
77
+
78
+ if (
79
+ force_recalculate
80
+ or "newline_score" not in df.columns
81
+ or "repetition_score" not in df.columns
82
+ or "total_repetitions" not in df.columns
83
+ ):
84
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
85
+ "answer"
86
+ ].apply(detect_scores)
87
+ df.to_csv(result_file, index=False)
88
+
89
+ return df
90
+
91
+
92
+ def replace_last(source_string, old_string, new_string):
93
+ head, _sep, tail = source_string.rpartition(old_string)
94
+ return head + new_string + tail
95
+
96
+
97
+ def load_for_repetition_penalty(
98
+ csv_result_file, repetition_penalty, force_recalculate=False
99
+ ):
100
+ result_file = replace_last(
101
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
102
+ )
103
+ return load_with_newline_and_repetition_scores(
104
+ result_file, force_recalculate=force_recalculate
105
+ )
106
+
107
+
108
+ def calc_adjusted_performance(f, r):
109
+ return f / math.log10(10 + r)
110
+
111
+
112
+ def calculate_adjusted_performance(row):
113
+ r = row["total_repetitions"]
114
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
115
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
116
+ return pd.Series([adjusted_precision, adjusted_recall])
117
+
118
+
119
+ def load_performance_df(csv_result_file, repetition_penalty):
120
+ result_file = replace_last(
121
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
122
+ )
123
+ result_file = result_file.replace("/results/", "/eval/")
124
+ print(f"loading json file: {result_file}")
125
+ df = pd.read_json(result_file)
126
+
127
+ return df
128
+
129
+
130
+ def calculate_performance_score_v1(
131
+ csv_result_file, repetition_penalty, force_recalculate=False
132
+ ):
133
+ result_file = replace_last(
134
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
135
+ )
136
+ print(f"loading result file: {result_file}")
137
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
138
+
139
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
140
+ df.drop(
141
+ columns=[
142
+ "precision",
143
+ "recall",
144
+ "f1",
145
+ "f2",
146
+ "entities_in_answer",
147
+ "entities_in_question",
148
+ ],
149
+ errors="ignore",
150
+ inplace=True,
151
+ )
152
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
153
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
154
+ perf_df = filtered_df.reset_index(drop=True)
155
+ print(f"perf_df len: {len(perf_df)}")
156
+ # print(perf_df.head())
157
+
158
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
159
+
160
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
161
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
162
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
163
+
164
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
165
+ calculate_adjusted_performance, axis=1
166
+ )
167
+
168
+ df.to_csv(result_file, index=False)
169
+ print(f"performance scores saved to result file: {result_file}")
170
+
171
+ print(f"df len: {len(df)}")
172
+
173
+ return df
174
+
175
+
176
+ ref_df = pd.read_csv(
177
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
178
+ )
179
+
180
+
181
+ def calculate_performance_score(
182
+ csv_result_file, repetition_penalty, force_recalculate=False
183
+ ):
184
+ result_file = replace_last(
185
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
186
+ )
187
+
188
+ re_creating = False
189
+ if os.path.exists(result_file):
190
+ print(f"loading result file: {result_file}")
191
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
192
+ else:
193
+ print(f"re-creating result file: {result_file}")
194
+ df = pd.DataFrame()
195
+ force_recalculate = True
196
+
197
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
198
+ df.drop(
199
+ columns=[
200
+ "precision",
201
+ "recall",
202
+ "f1",
203
+ "f2",
204
+ "entities_in_answer",
205
+ "entities_in_question",
206
+ "word_count",
207
+ ],
208
+ errors="ignore",
209
+ inplace=True,
210
+ )
211
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
212
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
213
+ perf_df = filtered_df.reset_index(drop=True)
214
+ print(f"perf_df len: {len(perf_df)}")
215
+
216
+ if len(perf_df) != len(ref_df):
217
+ print(f"error: len(perf_df) != {len(ref_df)}")
218
+ missing_ids = [
219
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
220
+ ]
221
+ print(f"missing_ids: {missing_ids}")
222
+
223
+ # print(perf_df.head())
224
+
225
+ df["id"] = perf_df["id"]
226
+ df["question"] = perf_df["question"]
227
+ df["answer"] = perf_df["pred_answer"]
228
+ df["word_count"] = df["answer"].apply(
229
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
230
+ )
231
+ df["ground_truth"] = perf_df["ground_truth"]
232
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
233
+ "answer"
234
+ ].apply(detect_scores)
235
+
236
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
237
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
238
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
239
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
240
+
241
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
242
+ calculate_adjusted_performance, axis=1
243
+ )
244
+
245
+ df.to_csv(result_file, index=False)
246
+ print(f"performance scores saved to result file: {result_file}")
247
+
248
+ print(f"df len: {len(df)}")
249
+
250
+ return df
251
+
252
+
253
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
254
+ newline_score = [
255
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
256
+ ]
257
+
258
+ repetition_score = [
259
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
260
+ ]
261
+
262
+ precision = [
263
+ f / math.log10(10 + n + r)
264
+ for f, n, r in zip(precision, newline_score, repetition_score)
265
+ ]
266
+ recall = [
267
+ f / math.log10(10 + n + r)
268
+ for f, n, r in zip(recall, newline_score, repetition_score)
269
+ ]
270
+
271
+ return precision, recall
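+ # Precision and recall are dampened by the same log10(10 + mean repetitions) factor
+ # before the callers below recombine them into an F1-style score.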
272
+
273
+
274
+ def plot_performance_scores(
275
+ result,
276
+ models=None,
277
+ title="Performance",
278
+ ):
279
+
280
+ if models is None:
281
+ models = result.keys()
282
+ for model in models:
283
+ print(f"model: {model}")
284
+ df = result[model]["df_overall"]
285
+
286
+ # Calculate the statistics
287
+ precision = [
288
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
289
+ ]
290
+ recall = [
291
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
292
+ ]
293
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
294
+ best_f1 = max(f1)
295
+ best_f1_index = f1.index(best_f1)
296
+
297
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
298
+ result[model], precision, recall
299
+ )
300
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
301
+
302
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
303
+ best_afrp = max(afrp)
304
+ best_afrp_index = afrp.index(best_afrp)
305
+
306
+ adjusted_precision = [
307
+ df["adjusted_precision"].mean()
308
+ for df in result[model]["df_list_repetition_penalty"]
309
+ ]
310
+ adjusted_recall = [
311
+ df["adjusted_recall"].mean()
312
+ for df in result[model]["df_list_repetition_penalty"]
313
+ ]
314
+ afrp2 = [
315
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
316
+ ]
317
+ best_afrp2 = max(afrp2)
318
+ best_afrp2_index = afrp2.index(best_afrp2)
319
+
320
+ repetition_penalties = list(df["repetition_penalty"])
321
+
322
+ # line plot for precision, recall, f1
323
+ plt.figure(figsize=(10, 6))
324
+
325
+ plt.axvspan(
326
+ repetition_penalties[best_f1_index] - 0.01,
327
+ repetition_penalties[best_f1_index] + 0.01,
328
+ alpha=0.5,
329
+ edgecolor="none",
330
+ facecolor="blue",
331
+ )
332
+
333
+ # plt.axvspan(
334
+ # repetition_penalties[best_afrp2_index] - 0.01,
335
+ # repetition_penalties[best_afrp2_index] + 0.01,
336
+ # alpha=0.5,
337
+ # edgecolor="none",
338
+ # facecolor="green",
339
+ # )
340
+
341
+ plt.axvspan(
342
+ repetition_penalties[best_afrp_index] - 0.01,
343
+ repetition_penalties[best_afrp_index] + 0.01,
344
+ alpha=0.5,
345
+ edgecolor="none",
346
+ facecolor="orange",
347
+ )
348
+
349
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
350
+ # plt.plot(
351
+ # repetition_penalties,
352
+ # afrp2,
353
+ # label="Per-question RF Adjusted F1",
354
+ # marker="s",
355
+ # color="green",
356
+ # )
357
+ plt.plot(
358
+ repetition_penalties,
359
+ afrp,
360
+ label="RF Adjusted F1",
361
+ marker="o",
362
+ color="orange",
363
+ )
364
+ plt.xlabel("Repetition Penalties")
365
+ plt.ylabel("Score")
366
+ plt.xlim(0.99, 1.31)
367
+ # y in percentage
368
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
369
+ plt.title(f"{model} {title}")
370
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
371
+
372
+ plt.show()
373
+
374
+
375
+ def plot_best_afrp(
376
+ result,
377
+ models=None,
378
+ title="Models with Best Repetition Factor Adjusted F1",
379
+ ref_result=None,
380
+ ):
381
+ # Initialize lists to store the statistics
382
+ model_names = []
383
+ best_f1 = []
384
+ best_afrp = []
385
+ best_repetition_penalty = []
386
+ best_mtp = []
387
+
388
+ if models is None:
389
+ models = result.keys()
390
+ for model in models:
391
+ print(f"model: {model}")
392
+ df = result[model]["df_overall"]
393
+
394
+ # Calculate the statistics
395
+ precision = [
396
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
397
+ ]
398
+ recall = [
399
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
400
+ ]
401
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
402
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
403
+
404
+ newline_score = [
405
+ df["newline_score"].mean()
406
+ for df in result[model]["df_list_repetition_penalty"]
407
+ ]
408
+ # print(f"newline_score: {newline_score}")
409
+
410
+ repetition_score = [
411
+ df["repetition_score"].mean()
412
+ for df in result[model]["df_list_repetition_penalty"]
413
+ ]
414
+ # print(f"repetition_score: {repetition_score}")
415
+
416
+ afrp = [
417
+ f / math.log10(10 + n + r)
418
+ for f, n, r in zip(f1, newline_score, repetition_score)
419
+ ]
420
+
421
+ best_afrp.append(max(afrp))
422
+ best_afrp_index = afrp.index(best_afrp[-1])
423
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
424
+
425
+ best_f1.append(f1[best_afrp_index])
426
+ best_mtp.append(
427
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
428
+ )
429
+
430
+ # print(
431
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
432
+ # )
433
+
434
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
435
+
436
+ model_names.append(
437
+ f"{model} (RP={best_repetition_penalty[-1]})"
438
+ ) # Add the model name to the list
439
+
440
+ if ref_result is not None:
441
+ print("ref_result:", ref_result)
442
+ for model in ref_result.keys():
443
+ model_names.append(model)
444
+ df = pd.read_csv(ref_result[model])
445
+ # df = df[df["id"].isin(wikidata_df["id"])]
446
+
447
+ p = df["precision"].mean()
448
+ r = df["recall"].mean()
449
+
450
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
451
+ best_f1.append(f1)
452
+ best_afrp.append(f1)
453
+ best_mtp.append(0)
454
+
455
+ print("model_names:", model_names)
456
+ # print("best_f1:", best_f1)
457
+ # print("best_afrp:", best_afrp)
458
+
459
+ # Create a DataFrame with the statistics
460
+ data = pd.DataFrame(
461
+ {
462
+ "Model": model_names,
463
+ "Repetition Factor Adjusted F1": best_afrp,
464
+ "F1": best_f1,
465
+ }
466
+ )
467
+
468
+ # Melt the DataFrame to a long format
469
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
470
+
471
+ # Pivot the DataFrame to a wide format
472
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
473
+
474
+ # make sure the columns are following the order of the models
475
+ data_pivoted = data_pivoted[model_names]
476
+
477
+ # make sure three groups in the order of precision, recall, f1
478
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
479
+
480
+ # Plot the statistics
481
+ plt.figure(figsize=(15, 6))
482
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
483
+ plt.title(title)
484
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
485
+
486
+ # Set the rotation of the x-axis labels to 0 degrees
487
+ plt.xticks(rotation=0)
488
+
489
+ # Format the y-axis to display as percentage
490
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
491
+
492
+ # get the max value of the y-axis
493
+ a1 = max(best_afrp)
494
+ a2 = max(best_f1)
495
+
496
+ max_value = max([a1, a2]) * 1.12
497
+ print("max_value:", max_value)
498
+
499
+ # Set the y-axis limit to the max value plus ~12% headroom
500
+ ax.set_ylim(0, max_value)
501
+
502
+ # Add the values above each bar
503
+ for p in ax.patches:
504
+ ax.annotate(
505
+ f"{p.get_height() * 100:.1f}",
506
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
507
+ ha="center",
508
+ va="bottom",
509
+ xytext=(0, 10),
510
+ textcoords="offset points",
511
+ rotation=90,
512
+ )
513
+
514
+ plt.show()
515
+ return data_pivoted, best_mtp
516
+
517
+
518
+ def plot_best_performance(
519
+ result,
520
+ models=None,
521
+ title="Models with Best F1 Score",
522
+ adjusted_f1=False,
523
+ ref_result=None,
524
+ ):
525
+ # Initialize lists to store the statistics
526
+ model_names = []
527
+ best_precision = []
528
+ best_recall = []
529
+ best_f1 = []
530
+ best_repetition_penalty = []
531
+ best_mtp = []
532
+
533
+ if models is None:
534
+ models = result.keys()
535
+ for model in models:
536
+ print(f"model: {model}")
537
+ df = result[model]["df_overall"]
538
+
539
+ # Calculate the statistics
540
+ precision = [
541
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
542
+ ]
543
+ recall = [
544
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
545
+ ]
546
+ newline_score = [
547
+ df["newline_score"].mean()
548
+ for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+
551
+ repetition_score = [
552
+ df["repetition_score"].mean()
553
+ for df in result[model]["df_list_repetition_penalty"]
554
+ ]
555
+
556
+ if adjusted_f1:
557
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
558
+ result[model], precision, recall
559
+ )
560
+
561
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
562
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
563
+
564
+ best_f1.append(max(f1))
565
+ best_f1_index = f1.index(best_f1[-1])
566
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
567
+
568
+ best_precision.append(precision[best_f1_index])
569
+ best_recall.append(recall[best_f1_index])
570
+ best_mtp.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
571
+
572
+ print(
573
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
574
+ )
575
+
576
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
577
+
578
+ model_names.append(
579
+ f"{model} (RP={best_repetition_penalty[-1]})"
580
+ ) # Add the model name to the list
581
+
582
+ # print sum for columns: newline_score, repetition_score
583
+ print(
584
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
585
+ )
586
+
587
+ if ref_result is not None:
588
+ print("ref_result:", ref_result)
589
+ for model in ref_result.keys():
590
+ model_names.append(model)
591
+ df = pd.read_csv(ref_result[model])
592
+ # df = df[df["id"].isin(wikidata_df["id"])]
593
+
594
+ best_precision.append(df["precision"].mean())
595
+ best_recall.append(df["recall"].mean())
596
+ f1 = (
597
+ 2
598
+ * (best_precision[-1] * best_recall[-1])
599
+ / (best_precision[-1] + best_recall[-1])
600
+ )
601
+ # best_f1.append(df["f1"].mean())
602
+ best_f1.append(f1)
603
+ best_mtp.append(0)
604
+
605
+ # Create a DataFrame with the statistics
606
+ data = (
607
+ pd.DataFrame(
608
+ {
609
+ "Model": model_names,
610
+ "Adjusted Precision with RP": best_precision,
611
+ "Adjusted Recall with RP": best_recall,
612
+ "Adjusted F1 with RP": best_f1,
613
+ }
614
+ )
615
+ if adjusted_f1
616
+ else pd.DataFrame(
617
+ {
618
+ "Model": model_names,
619
+ "Precision": best_precision,
620
+ "Recall": best_recall,
621
+ "F1": best_f1,
622
+ }
623
+ )
624
+ )
625
+ columns = list(data.columns)
626
+
627
+ # Melt the DataFrame to a long format
628
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
629
+
630
+ # Pivot the DataFrame to a wide format
631
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
632
+
633
+ # make sure the columns are following the order of the models
634
+ data_pivoted = data_pivoted[model_names]
635
+
636
+ # make sure three groups in the order of precision, recall, f1
637
+ data_pivoted = data_pivoted.reindex(columns[1:])
638
+
639
+ # Plot the statistics
640
+ plt.figure(figsize=(10, 6))
641
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
642
+ plt.title(title)
643
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
644
+
645
+ # Set the rotation of the x-axis labels to 0 degrees
646
+ plt.xticks(rotation=0)
647
+
648
+ # Format the y-axis to display as percentage
649
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
650
+
651
+ # get the max value of the y-axis
652
+ a1 = max(best_precision)
653
+ a2 = max(best_recall)
654
+ a3 = max(best_f1)
655
+
656
+ max_value = max([a1, a2, a3]) * 1.12
657
+ print("max_value:", max_value)
658
+
659
+ # Set the y-axis limit to the max value plus ~12% headroom
660
+ ax.set_ylim(0, max_value)
661
+
662
+ # Add the values above each bar
663
+ for p in ax.patches:
664
+ ax.annotate(
665
+ f"{p.get_height() * 100:.1f}",
666
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
667
+ ha="center",
668
+ va="bottom",
669
+ xytext=(0, 10),
670
+ textcoords="offset points",
671
+ rotation=90,
672
+ )
673
+
674
+ plt.show()
675
+ return data_pivoted, best_mtp
676
+
677
+
678
+ def plot_best_performance_ms_macro(
679
+ result,
680
+ models=None,
681
+ title="Models with Best Repetition Factor Adjusted Performance",
682
+ ref_result=None,
683
+ skip_generic_prompt=False,
684
+ include_adjusted_performance=True,
685
+ ):
686
+ # Initialize lists to store the statistics
687
+ model_names = []
688
+ best_f1 = []
689
+ best_afrp = []
690
+ best_repetition_penalty = []
691
+ best_bleu1 = []
692
+ best_rougeL = []
693
+ best_mtp = []
694
+
695
+ if models is None:
696
+ models = result.keys()
697
+ for model in models:
698
+ if skip_generic_prompt and "generic prompt" in model:
699
+ continue
700
+ print(f"model: {model}")
701
+ df = result[model]["df_overall"]
702
+
703
+ # Calculate the statistics
704
+ bleu1 = [x for x in df["bleu1"]]
705
+ rougeL = [x for x in df["rougeL"]]
706
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
707
+
708
+ newline_score = [
709
+ df["newline_score"].mean()
710
+ for df in result[model]["df_list_repetition_penalty"]
711
+ ]
712
+ # print(f"newline_score: {newline_score}")
713
+
714
+ repetition_score = [
715
+ df["repetition_score"].mean()
716
+ for df in result[model]["df_list_repetition_penalty"]
717
+ ]
718
+ # print(f"repetition_score: {repetition_score}")
719
+
720
+ afrp = [
721
+ f / math.log10(10 + n + r)
722
+ for f, n, r in zip(f1, newline_score, repetition_score)
723
+ ]
724
+
725
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
726
+ best_afrp_index = (
727
+ afrp.index(best_afrp[-1])
728
+ if include_adjusted_performance
729
+ else f1.index(best_afrp[-1])
730
+ )
731
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
732
+
733
+ best_f1.append(f1[best_afrp_index])
734
+ best_bleu1.append(bleu1[best_afrp_index])
735
+ best_rougeL.append(rougeL[best_afrp_index])
736
+ best_mtp.append(
737
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
738
+ )
739
+
740
+ # print(
741
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
742
+ # )
743
+
744
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
745
+
746
+ model_names.append(
747
+ f"{model} (RP={best_repetition_penalty[-1]})"
748
+ ) # Add the model name to the list
749
+
750
+ if ref_result is not None:
751
+ print("ref_result:", ref_result)
752
+ for model in ref_result.keys():
753
+ model_names.append(model)
754
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
755
+ # df = df[df["id"].isin(wikidata_df["id"])]
756
+
757
+ p = df["bleu1"][0]
758
+ best_bleu1.append(p)
759
+
760
+ r = df["rougeL"][0]
761
+ best_rougeL.append(r)
762
+
763
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
764
+ best_f1.append(f1)
765
+ best_afrp.append(f1)
766
+ best_mtp.append(0)
767
+
768
+ # print("model_names:", model_names)
769
+ # print("best_f1:", best_f1)
770
+ # print("best_afrp:", best_afrp)
771
+
772
+ # Create a DataFrame with the statistics
773
+ data = (
774
+ pd.DataFrame(
775
+ {
776
+ "Model": model_names,
777
+ "Repetition Factor Adjusted Perf Score": best_afrp,
778
+ "Overall Perf Score": best_f1,
779
+ }
780
+ )
781
+ if include_adjusted_performance
782
+ else pd.DataFrame(
783
+ {
784
+ "Model": model_names,
785
+ "Bleu-1": best_bleu1,
786
+ "Rouge-L": best_rougeL,
787
+ "Overall Perf Score": best_f1,
788
+ }
789
+ )
790
+ )
791
+
792
+ # Melt the DataFrame to a long format
793
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
794
+
795
+ # Pivot the DataFrame to a wide format
796
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
797
+
798
+ # make sure the columns are following the order of the models
799
+ data_pivoted = data_pivoted[model_names]
800
+
801
+ columns = list(data.columns)
802
+ data_pivoted = data_pivoted.reindex(columns[1:])
803
+
804
+ # Plot the statistics
805
+ plt.figure(figsize=(10, 6))
806
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
807
+ plt.title(title)
808
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
809
+
810
+ # Set the rotation of the x-axis labels to 0 degrees
811
+ plt.xticks(rotation=0)
812
+
813
+ # Format the y-axis to display as percentage
814
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
815
+
816
+ # get the max value of the y-axis
817
+ a1 = max(best_afrp)
818
+ a2 = max(best_f1)
819
+ a3 = max(best_bleu1)
820
+ a4 = max(best_rougeL)
821
+
822
+ max_value = (
823
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
824
+ )
825
+ print("max_value:", max_value)
826
+
827
+ # Set the y-axis limit to the max value plus ~12% headroom
828
+ ax.set_ylim(0, max_value)
829
+
830
+ # Add the values above each bar
831
+ for p in ax.patches:
832
+ ax.annotate(
833
+ f"{p.get_height() * 100:.1f}",
834
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
835
+ ha="center",
836
+ va="bottom",
837
+ xytext=(0, 10),
838
+ textcoords="offset points",
839
+ rotation=90,
840
+ )
841
+
842
+ plt.show()
843
+ return data_pivoted, best_mtp
844
+
845
+
846
+ all_open_source_models = [
847
+ "gemma-1.1-2b-it",
848
+ "Phi-3-mini-128k-instruct",
849
+ "gemma-1.1-7b-it",
850
+ "Llama-2-7b-chat-hf",
851
+ "Mistral-7B-Instruct-v0.2",
852
+ "Meta-Llama-3-8B-Instruct",
853
+ "Llama-2-13b-chat-hf",
854
+ "Llama-2-70b-chat-hf",
855
+ "Meta-Llama-3-70B-Instruct",
856
+ ]
857
+
858
+
859
+ non_rag_csv_result_files = [
860
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
861
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
862
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
863
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
864
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
865
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
866
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
867
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
868
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
869
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
870
+ ]
871
+
872
+ rag_csv_result_files = [
873
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
874
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
875
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
876
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
877
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
878
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
879
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
880
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
881
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
882
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
883
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
884
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
885
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
886
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
887
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
888
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
889
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
890
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
891
+ ]
892
+
893
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
894
+
895
+
896
+ def load_for_repetition_penalty_ms_macro(
897
+ csv_result_file, repetition_penalty, force_recalculate=False
898
+ ):
899
+ result_file = replace_last(
900
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
901
+ )
902
+ df = load_with_newline_and_repetition_scores(
903
+ result_file, force_recalculate=force_recalculate
904
+ )
905
+
906
+ if len(df) != len(df_ms_macro):
907
+ print(f"error: len(df) != {len(df_ms_macro)}")
908
+ missing_ids = [
909
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
910
+ ]
911
+ print(f"missing_ids: {missing_ids}")
912
+
913
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
914
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
915
+ print("ground_truth updated for:", result_file)
916
+ df.to_csv(result_file, index=False)
917
+ return df
918
+
919
+
920
+ # MS MARCO
921
+ def plot_performance_scores_ms_macro(
922
+ result,
923
+ models=None,
924
+ title="Performance",
925
+ ):
926
+
927
+ if models is None:
928
+ models = result.keys()
929
+ for model in models:
930
+ print(f"model: {model}")
931
+ df = result[model]["df_overall"]
932
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
933
+
934
+ # Calculate the statistics
935
+ bleu1 = list(df["bleu1"])
936
+ rougeL = list(df["rougeL"])
937
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
938
+ best_f1 = max(f1)
939
+ best_f1_index = f1.index(best_f1)
940
+
941
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
942
+ result[model], bleu1, rougeL
943
+ )
944
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
945
+
946
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
947
+ best_afrp = max(afrp)
948
+ best_afrp_index = afrp.index(best_afrp)
949
+
950
+ repetition_penalties = list(df["repetition_penalty"])
951
+
952
+ # line plot for precision, recall, f1
953
+ plt.figure(figsize=(10, 6))
954
+
955
+ plt.axvspan(
956
+ repetition_penalties[best_f1_index] - 0.01,
957
+ repetition_penalties[best_f1_index] + 0.01,
958
+ alpha=0.5,
959
+ edgecolor="none",
960
+ facecolor="blue",
961
+ )
962
+
963
+ plt.axvspan(
964
+ repetition_penalties[best_afrp_index] - 0.01,
965
+ repetition_penalties[best_afrp_index] + 0.01,
966
+ alpha=0.5,
967
+ edgecolor="none",
968
+ facecolor="orange",
969
+ )
970
+
971
+ plt.plot(
972
+ repetition_penalties,
973
+ f1,
974
+ label="Overall Perf Score",
975
+ marker="D",
976
+ color="blue",
977
+ )
978
+ plt.plot(
979
+ repetition_penalties,
980
+ afrp,
981
+ label="RF Adjusted Perf Score",
982
+ marker="o",
983
+ color="orange",
984
+ )
985
+
986
+ plt.xlabel("Repetition Penalties")
987
+ plt.ylabel("Score")
988
+ plt.xlim(0.99, 1.31)
989
+ # y in percentage
990
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
991
+ plt.title(f"{model} {title}")
992
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
993
+
994
+ plt.show()
995
+
996
+
997
+ def plot_repetition_factors(result, groups):
998
+ for group in groups:
999
+ # Plot the statistics
1000
+ plt.figure(figsize=(10, 6))
1001
+
1002
+ max_value = 0
1003
+ for model in result.keys():
1004
+ if group not in model.lower():
1005
+ continue
1006
+ print(f"model: {model}")
1007
+ df = result[model]["df_overall"]
1008
+ repetition_penalties = [
1009
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1010
+ ]
1011
+
1012
+ mean_score = [
1013
+ math.log10(10 + df["total_repetitions"].mean())
1014
+ for df in result[model]["df_list_repetition_penalty"]
1015
+ ]
1016
+
1017
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1018
+
1019
+ new_max = max(mean_score)
1020
+ if new_max > max_value:
1021
+ max_value = new_max
1022
+
1023
+ max_value = max_value * 1.05
1024
+ if max_value < 1.5:
1025
+ max_value = 1.5
1026
+ # set ylimit
1027
+ plt.ylim(1, max_value)
1028
+
1029
+ # show grid
1030
+ plt.grid(True)
1031
+ plt.xlabel("Repetition Penalties")
1032
+ plt.ylabel("Repetition Factors")
1033
+ plt.title("Repetition Factors vs Repetition Penalties")
1034
+ plt.legend()
1035
+
1036
+ plt.show()
1037
+
1038
+
1039
+ def plot_repetition_factors_by_group(result, group_filter=None):
1040
+ markers = ["D", "o", "s", "x"]
1041
+ colors = ["blue", "orange", "green", "red"]
1042
+
1043
+ # Plot the statistics
1044
+ plt.figure(figsize=(10, 6))
1045
+ index = 0
1046
+ max_value = 0
1047
+
1048
+ for model in result.keys():
1049
+ if group_filter is not None and group_filter not in model:
1050
+ continue
1051
+
1052
+ print(f"model: {model}")
1053
+
1054
+ df = result[model]["df_overall"]
1055
+ repetition_penalties = [
1056
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1057
+ ]
1058
+
1059
+ # Calculate the statistics
1060
+ mean_score = [
1061
+ math.log10(10 + df["total_repetitions"].mean())
1062
+ for df in result[model]["df_list_repetition_penalty"]
1063
+ ]
1064
+ if len(mean_score) != len(repetition_penalties):
1065
+ print(
1066
+ f"model: {model} has different length of repetition penalties and mean score"
1067
+ )
1068
+ print("repetition_panelties:", len(repetition_panelties))
1069
+ print("mean_score:", len(mean_score))
1070
+ continue
1071
+
1072
+ new_max = max(mean_score)
1073
+ if new_max > max_value:
1074
+ max_value = new_max
1075
+
1076
+ sns.lineplot(
1077
+ x=repetition_penalties,
1078
+ y=mean_score,
1079
+ label=model,
1080
+ marker=markers[index],
1081
+ color=colors[index],
1082
+ )
1083
+
1084
+ index += 1
1085
+
1086
+ max_value = max_value * 1.05
1087
+ if max_value < 1.5:
1088
+ max_value = 1.5
1089
+ # set ylimit
1090
+ plt.ylim(1, max_value)
1091
+ max_value = 0
1092
+
1093
+ plt.xlabel("Repetition Penalties")
1094
+ plt.ylabel("Repetition Factors")
1095
+ plt.title("Repetition Factors vs Repetition Penalties")
1096
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1097
+
1098
+ plt.show()
1099
+
1100
+
1101
+ ms_marco_csv_result_files = [
1102
+ "data/results/gemma-1.1-2b-it_mm_true_false.csv",
1103
+ "data/results/gemma-1.1-2b-it_mm_true.csv",
1104
+ "data/results/gemma-1.1-2b-it_mm_true_false_non_rag.csv",
1105
+ "data/results/Phi-3-mini-128k-instruct_mm_false.csv",
1106
+ "data/results/Phi-3-mini-128k-instruct_mm_true.csv",
1107
+ "data/results/Phi-3-mini-128k-instruct_mm_non_rag.csv",
1108
+ "data/results/gemma-1.1-7b-it_mm_false.csv",
1109
+ "data/results/gemma-1.1-7b-it_mm_true.csv",
1110
+ "data/results/gemma-1.1-7b-it_mm_non_rag.csv",
1111
+ "data/results/Llama-2-7b-chat-hf_mm_true_false.csv",
1112
+ "data/results/Llama-2-7b-chat-hf_mm_true.csv",
1113
+ "data/results/Llama-2-7b-chat-hf_mm_true_false_non_rag.csv",
1114
+ "data/results/Mistral-7B-Instruct-v0.2_mm_false.csv",
1115
+ "data/results/Mistral-7B-Instruct-v0.2_mm_true.csv",
1116
+ "data/results/Mistral-7B-Instruct-v0.2_mm_non_rag.csv",
1117
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false.csv",
1118
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true.csv",
1119
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false_non_rag.csv",
1120
+ "data/results/Llama-2-13b-chat-hf_mm_false.csv",
1121
+ "data/results/Llama-2-13b-chat-hf_mm_true.csv",
1122
+ "data/results/Llama-2-13b-chat-hf_mm_non_rag.csv",
1123
+ "data/results/Llama-2-70b-chat-hf_mm_false.csv",
1124
+ "data/results/Llama-2-70b-chat-hf_mm_true.csv",
1125
+ "data/results/Llama-2-70b-chat-hf_mm_non_rag.csv",
1126
+ "data/results/Meta-Llama-3-70B-Instruct_mm_false.csv",
1127
+ "data/results/Meta-Llama-3-70B-Instruct_mm_true.csv",
1128
+ "data/results/Meta-Llama-3-70B-Instruct_mm_non_rag.csv",
1129
+ ]
1130
+
1131
+ webqsp_csv_result_files = []
1132
+ webqsp_model_result_counts = {}
1133
+
1134
+
1135
+ def find_model_name(file_path):
1136
+ df = pd.read_csv(file_path, comment="#", on_bad_lines="warn")
1137
+ return df["model"][0]
1138
+
1139
+
1140
+ def add_file(file):
1141
+ model_name = find_model_name(file)
1142
+ if "(generic prompt)" not in model_name:
1143
+ webqsp_csv_result_files.append(file)
1144
+ if model_name not in webqsp_model_result_counts:
1145
+ webqsp_model_result_counts[model_name] = 1
1146
+ else:
1147
+ webqsp_model_result_counts[model_name] += 1
1148
+
1149
+
1150
+ last_model_name = None
1151
+ non_rag_index = 0
1152
+
1153
+ for csv_result_file in rag_csv_result_files:
1154
+ try:
1155
+ model_name = find_model_name(csv_result_file)
1156
+ # print(f"processing model: {model_name} - {csv_result_file}")
1157
+
1158
+ if last_model_name != model_name and last_model_name is not None:
1159
+ while non_rag_index < len(non_rag_csv_result_files):
1160
+ # print(f"processing non-rag file - {file}")
1161
+ file = non_rag_csv_result_files[non_rag_index]
1162
+ non_model_name = find_model_name(file)
1163
+ if non_model_name.startswith(last_model_name):
1164
+ add_file(file)
1165
+ non_rag_index += 1
1166
+ else:
1167
+ break
1168
+
1169
+ add_file(csv_result_file)
1170
+ last_model_name = model_name
1171
+ except FileNotFoundError as e:
1172
+ print("\terror processing file: ", csv_result_file, e)
1173
+ continue
1174
+
1175
+ for file in non_rag_csv_result_files[non_rag_index:]:
1176
+ add_file(file)
1177
+
1178
+
1179
+ def calc_rap_scores(result, precision="precision", recall="recall"):
1180
+ newline_score = [
1181
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1182
+ ]
1183
+
1184
+ repetition_score = [
1185
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1186
+ ]
1187
+
1188
+ if precision in result["df_list_repetition_penalty"][0].columns:
1189
+ precision = [
1190
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1191
+ ]
1192
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1193
+ else:
1194
+ precision = result["df_overall"][precision]
1195
+ recall = result["df_overall"][recall]
1196
+
1197
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
1198
+
1199
+ rap = [
1200
+ f / math.log10(10 + n + r)
1201
+ for f, n, r in zip(f1, newline_score, repetition_score)
1202
+ ]
1203
+
1204
+ return newline_score, repetition_score, f1, rap
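+ # RAP is the F1-style combination of the two chosen metrics divided by
+ # log10(10 + mean newline score + mean repetition score), mirroring calc_adjusted_performance above.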
1205
+
1206
+
1207
+ def load_webqsp_result(csv_result_files, force_recalculate=False):
1208
+ model_name_exts = {
1209
+ "true": "(RAG - Chat Template)",
1210
+ "wd": "(RAG - Generic Prompt)",
1211
+ "rag": "(Non-RAG)",
1212
+ }
1213
+
1214
+ result = {}
1215
+ for i, csv_result_file in enumerate(csv_result_files):
1216
+ try:
1217
+ df = pd.read_csv(csv_result_file)
1218
+ parts = re.split(r"[_\.]", csv_result_file)
1219
+ if parts[-2] in model_name_exts.keys():
1220
+ key = parts[-2]
1221
+ elif csv_result_file in non_rag_csv_result_files:
1222
+ key = "rag"
1223
+ else:
1224
+ key = "wd"
1225
+ model_name = f'{df["model"][0]}{model_name_exts[key]}'
1226
+ dfs = [
1227
+ calculate_performance_score(
1228
+ csv_result_file,
1229
+ repetition_penalty,
1230
+ force_recalculate=force_recalculate,
1231
+ )
1232
+ for repetition_penalty in df["repetition_penalty"]
1233
+ ]
1234
+
1235
+ result[model_name] = {
1236
+ "df_overall": df,
1237
+ "df_list_repetition_penalty": dfs,
1238
+ "file": csv_result_file,
1239
+ }
1240
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1241
+ result[model_name]
1242
+ )
1243
+ df["newline_score"] = newline_score
1244
+ df["repetition_score"] = repetition_score
1245
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1246
+ df["perf"] = perf
1247
+ df["rap"] = rap
1248
+ except Exception as e:
1249
+ print(f"Error: {e}")
1250
+
1251
+ return result
1252
+
1253
+
1254
+ def load_ms_marco_result(csv_result_files, force_recalculate=False):
1255
+ model_name_exts = {
1256
+ "true": "(RAG - Chat Template)",
1257
+ "false": "(RAG - Generic Prompt)",
1258
+ "rag": "(Non-RAG)",
1259
+ }
1260
+
1261
+ result = {}
1262
+ for csv_result_file in csv_result_files:
1263
+ try:
1264
+ df = pd.read_csv(csv_result_file)
1265
+
1266
+ parts = re.split(r"[_\.]", csv_result_file)
1267
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1268
+
1269
+ print(f"\tmodel_name: {model_name}")
1270
+ dfs = [
1271
+ load_for_repetition_penalty_ms_macro(
1272
+ csv_result_file,
1273
+ repetition_penalty,
1274
+ force_recalculate=force_recalculate,
1275
+ )
1276
+ for repetition_penalty in df["repetition_penalty"]
1277
+ ]
1278
+ result[model_name] = {
1279
+ "df_overall": df,
1280
+ "df_list_repetition_penalty": dfs,
1281
+ "file": csv_result_file,
1282
+ }
1283
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1284
+ result[model_name],
1285
+ precision="bleu1",
1286
+ recall="rougeL",
1287
+ )
1288
+ df["newline_score"] = newline_score
1289
+ df["repetition_score"] = repetition_score
1290
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1291
+ df["perf"] = perf
1292
+ df["rap"] = rap
1293
+ except Exception as e:
1294
+ print(f"Error: {e}")
1295
+
1296
+ return result
eval_modules/calc_repetitions_v5.py ADDED
@@ -0,0 +1,1383 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+ import evaluate
11
+
12
+ meteor = evaluate.load("meteor")
13
+
14
+ print(f"loading: {__file__}")
15
+
16
+ # final version
17
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
18
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
19
+
20
+
21
+ def del_excessive_whitespaces(text, debug=False):
22
+ count = 0
23
+
24
+ if isinstance(text, str):
25
+ if debug:
26
+ print("----detect excessive whitespaces----")
27
+ count = len(text)
28
+ text = pattern_excessive_whitespaces.sub("", text)
29
+ count -= len(text)
30
+ if debug and count:
31
+ print(f"removed excessive whitespaces: {count}")
32
+ return text, count
33
+
34
+
35
+ # final version for repetition detection
36
+ def detect_text_repetitions(text, debug=False):
37
+ count = 0
38
+
39
+ if isinstance(text, str):
40
+ if debug:
41
+ print("----detect text repetitions----")
42
+ matches = pattern_text_repetitions.finditer(text)
43
+ for match in matches:
44
+ if debug:
45
+ print(match)
46
+ for groupNum in range(0, len(match.groups())):
47
+ groupNum = groupNum + 1
48
+ print(
49
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
50
+ groupNum=groupNum,
51
+ start=match.start(groupNum),
52
+ end=match.end(groupNum),
53
+ group=match.group(groupNum),
54
+ )
55
+ )
56
+
57
+ start, end = match.span()
58
+ count += end - start
59
+
60
+ return count
61
+
62
+
63
+ def detect_repetitions(text, debug=False):
64
+ text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
65
+ count_text_repetitions = detect_text_repetitions(text, debug=debug)
66
+ total_repetitions = count_excessive_whitespaces + count_text_repetitions
67
+
68
+ result = (count_excessive_whitespaces, count_text_repetitions, total_repetitions)
69
+
70
+ if debug:
71
+ print(result)
72
+ return result
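+ # In this version excessive whitespace is stripped first and the repetition pattern runs on
+ # the cleaned text, so the two counts do not overlap.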
73
+
74
+
75
+ def detect_scores(text, debug=False):
76
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
77
+ text, debug=debug
78
+ )
79
+ return pd.Series([newline_score, repetition_score, total_repetitions])
80
+
81
+
82
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
83
+ print(f"loading result file: {result_file}")
84
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
85
+
86
+ if (
87
+ force_recalculate
88
+ or "newline_score" not in df.columns
89
+ or "repetition_score" not in df.columns
90
+ or "total_repetitions" not in df.columns
91
+ ):
92
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
93
+ "answer"
94
+ ].apply(detect_scores)
95
+ df.to_csv(result_file, index=False)
96
+
97
+ return df
98
+
99
+
100
+ def replace_last(source_string, old_string, new_string):
101
+ head, _sep, tail = source_string.rpartition(old_string)
102
+ return head + new_string + tail
103
+
104
+
105
+ def load_for_repetition_penalty(
106
+ csv_result_file, repetition_penalty, force_recalculate=False
107
+ ):
108
+ result_file = replace_last(
109
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
110
+ )
111
+ return load_with_newline_and_repetition_scores(
112
+ result_file, force_recalculate=force_recalculate
113
+ )
114
+
115
+
116
+ def calc_adjusted_performance(f, r):
117
+ return f / math.log10(10 + r)
118
+
119
+
120
+ def calculate_adjusted_performance(row):
121
+ r = row["total_repetitions"]
122
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
123
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
124
+ return pd.Series([adjusted_precision, adjusted_recall])
125
+
126
+
127
+ def load_performance_df(csv_result_file, repetition_penalty):
128
+ result_file = replace_last(
129
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
130
+ )
131
+ result_file = result_file.replace("/results/", "/eval/")
132
+ print(f"loading json file: {result_file}")
133
+ df = pd.read_json(result_file)
134
+
135
+ return df
136
+
137
+
138
+ def calculate_performance_score_v1(
139
+ csv_result_file, repetition_penalty, force_recalculate=False
140
+ ):
141
+ result_file = replace_last(
142
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
143
+ )
144
+ print(f"loading result file: {result_file}")
145
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
146
+
147
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
148
+ df.drop(
149
+ columns=[
150
+ "precision",
151
+ "recall",
152
+ "f1",
153
+ "f2",
154
+ "entities_in_answer",
155
+ "entities_in_question",
156
+ ],
157
+ errors="ignore",
158
+ inplace=True,
159
+ )
160
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
161
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
162
+ perf_df = filtered_df.reset_index(drop=True)
163
+ print(f"perf_df len: {len(perf_df)}")
164
+ # print(perf_df.head())
165
+
166
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
167
+
168
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
169
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
170
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
171
+
172
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
173
+ calculate_adjusted_performance, axis=1
174
+ )
175
+
176
+ df.to_csv(result_file, index=False)
177
+ print(f"performance scores saved to result file: {result_file}")
178
+
179
+ print(f"df len: {len(df)}")
180
+
181
+ return df
182
+
183
+
184
+ ref_df = pd.read_csv(
185
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
186
+ )
187
+
188
+
189
+ def calculate_performance_score(
190
+ csv_result_file, repetition_penalty, force_recalculate=False
191
+ ):
192
+ result_file = replace_last(
193
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
194
+ )
195
+
196
+ re_creating = False
197
+ if os.path.exists(result_file):
198
+ print(f"loading result file: {result_file}")
199
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
200
+ else:
201
+ print(f"re-creating result file: {result_file}")
202
+ df = pd.DataFrame()
203
+ force_recalculate = True
204
+
205
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
206
+ df.drop(
207
+ columns=[
208
+ "precision",
209
+ "recall",
210
+ "f1",
211
+ "f2",
212
+ "entities_in_answer",
213
+ "entities_in_question",
214
+ "word_count",
215
+ ],
216
+ errors="ignore",
217
+ inplace=True,
218
+ )
219
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
220
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
221
+ perf_df = filtered_df.reset_index(drop=True)
222
+ print(f"perf_df len: {len(perf_df)}")
223
+
224
+ if len(perf_df) != len(ref_df):
225
+ print(f"error: len(perf_df) != {len(ref_df)}")
226
+ missing_ids = [
227
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
228
+ ]
229
+ print(f"missing_ids: {missing_ids}")
230
+
231
+ # print(perf_df.head())
232
+
233
+ df["id"] = perf_df["id"]
234
+ df["question"] = perf_df["question"]
235
+ df["answer"] = perf_df["pred_answer"]
236
+ df["word_count"] = df["answer"].apply(
237
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
238
+ )
239
+ df["ground_truth"] = perf_df["ground_truth"]
240
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
241
+ "answer"
242
+ ].apply(detect_scores)
243
+
244
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
245
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
246
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
247
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
248
+
249
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
250
+ calculate_adjusted_performance, axis=1
251
+ )
252
+
253
+ df.to_csv(result_file, index=False)
254
+ print(f"performance scores saved to result file: {result_file}")
255
+
256
+ print(f"df len: {len(df)}")
257
+
258
+ return df
259
+
260
+
261
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
262
+ newline_score = [
263
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
264
+ ]
265
+
266
+ repetition_score = [
267
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
268
+ ]
269
+
270
+ precision = [
271
+ f / math.log10(10 + n + r)
272
+ for f, n, r in zip(precision, newline_score, repetition_score)
273
+ ]
274
+ recall = [
275
+ f / math.log10(10 + n + r)
276
+ for f, n, r in zip(recall, newline_score, repetition_score)
277
+ ]
278
+
279
+ return precision, recall
280
+
281
+
282
+ def plot_performance_scores(
283
+ result,
284
+ models=None,
285
+ title="Performance",
286
+ ):
287
+ if models is None:
288
+ models = result.keys()
289
+ for model in models:
290
+ print(f"model: {model}")
291
+ df = result[model]["df_overall"]
292
+
293
+ # Calculate the statistics
294
+ precision = [
295
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
296
+ ]
297
+ recall = [
298
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
299
+ ]
300
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
301
+ best_f1 = max(f1)
302
+ best_f1_index = f1.index(best_f1)
303
+
304
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
305
+ result[model], precision, recall
306
+ )
307
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
308
+
309
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
310
+ best_afrp = max(afrp)
311
+ best_afrp_index = afrp.index(best_afrp)
312
+
313
+ adjusted_precision = [
314
+ df["adjusted_precision"].mean()
315
+ for df in result[model]["df_list_repetition_penalty"]
316
+ ]
317
+ adjusted_recall = [
318
+ df["adjusted_recall"].mean()
319
+ for df in result[model]["df_list_repetition_penalty"]
320
+ ]
321
+ afrp2 = [
322
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
323
+ ]
324
+ best_afrp2 = max(afrp2)
325
+ best_afrp2_index = afrp2.index(best_afrp2)
326
+
327
+ repetition_penalties = list(df["repetition_penalty"])
328
+
329
+ # line plot for precision, recall, f1
330
+ plt.figure(figsize=(10, 6))
331
+
332
+ plt.axvspan(
333
+ repetition_penalties[best_f1_index] - 0.01,
334
+ repetition_penalties[best_f1_index] + 0.01,
335
+ alpha=0.5,
336
+ edgecolor="none",
337
+ facecolor="blue",
338
+ )
339
+
340
+ # plt.axvspan(
341
+ # repetition_penalties[best_afrp2_index] - 0.01,
342
+ # repetition_penalties[best_afrp2_index] + 0.01,
343
+ # alpha=0.5,
344
+ # edgecolor="none",
345
+ # facecolor="green",
346
+ # )
347
+
348
+ plt.axvspan(
349
+ repetition_penalties[best_afrp_index] - 0.01,
350
+ repetition_penalties[best_afrp_index] + 0.01,
351
+ alpha=0.5,
352
+ edgecolor="none",
353
+ facecolor="orange",
354
+ )
355
+
356
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
357
+ # plt.plot(
358
+ # repetition_penalties,
359
+ # afrp2,
360
+ # label="Per-question RAP - F1",
361
+ # marker="s",
362
+ # color="green",
363
+ # )
364
+ plt.plot(
365
+ repetition_penalties,
366
+ afrp,
367
+ label="RAP - F1",
368
+ marker="o",
369
+ color="orange",
370
+ )
371
+ plt.xlabel("Repetition Penalties")
372
+ plt.ylabel("Score")
373
+ # plt.xlim(0.99, 1.31)
374
+ # y in percentage
375
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
376
+ plt.title(f"{model} {title}")
377
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
378
+
379
+ plt.show()
380
+
381
+
382
+ def plot_best_afrp(
383
+ result,
384
+ models=None,
385
+ title="Models with Best RAP - F1",
386
+ ref_result=None,
387
+ ):
388
+ # Initialize lists to store the statistics
389
+ model_names = []
390
+ best_f1 = []
391
+ best_afrp = []
392
+ best_repetition_penalty = []
393
+ best_mtr = []
394
+
395
+ if models is None:
396
+ models = result.keys()
397
+ for model in models:
398
+ print(f"model: {model}")
399
+ df = result[model]["df_overall"]
400
+
401
+ # Calculate the statistics
402
+ precision = [
403
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
404
+ ]
405
+ recall = [
406
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
407
+ ]
408
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
409
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
410
+
411
+ newline_score = [
412
+ df["newline_score"].mean()
413
+ for df in result[model]["df_list_repetition_penalty"]
414
+ ]
415
+ # print(f"newline_score: {newline_score}")
416
+
417
+ repetition_score = [
418
+ df["repetition_score"].mean()
419
+ for df in result[model]["df_list_repetition_penalty"]
420
+ ]
421
+ # print(f"repetition_score: {repetition_score}")
422
+
423
+ afrp = [
424
+ f / math.log10(10 + n + r)
425
+ for f, n, r in zip(f1, newline_score, repetition_score)
426
+ ]
427
+
428
+ best_afrp.append(max(afrp))
429
+ best_afrp_index = afrp.index(best_afrp[-1])
430
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
431
+
432
+ best_f1.append(f1[best_afrp_index])
433
+ best_mtr.append(
434
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
435
+ )
436
+
437
+ # print(
438
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
439
+ # )
440
+
441
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
442
+
443
+ model_names.append(
444
+ f"{model} (RP={best_repetition_penalty[-1]})"
445
+ ) # Add the model name to the list
446
+
447
+ if ref_result is not None:
448
+ print("ref_result:", ref_result)
449
+ for model in ref_result.keys():
450
+ model_names.append(model)
451
+ df = pd.read_csv(ref_result[model])
452
+ # df = df[df["id"].isin(wikidata_df["id"])]
453
+
454
+ p = df["precision"].mean()
455
+ r = df["recall"].mean()
456
+
457
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
458
+ best_f1.append(f1)
459
+ best_afrp.append(f1)
460
+ best_mtr.append(0)
461
+
462
+ print("model_names:", model_names)
463
+ # print("best_f1:", best_f1)
464
+ # print("best_afrp:", best_afrp)
465
+
466
+ # Create a DataFrame with the statistics
467
+ data = pd.DataFrame(
468
+ {
469
+ "Model": model_names,
470
+ "RAP - F1": best_afrp,
471
+ "F1": best_f1,
472
+ }
473
+ )
474
+
475
+ # Melt the DataFrame to a long format
476
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
477
+
478
+ # Pivot the DataFrame to a wide format
479
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
480
+
481
+ # make sure the columns are following the order of the models
482
+ data_pivoted = data_pivoted[model_names]
483
+
484
+ # make sure three groups in the order of precision, recall, f1
485
+ data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
486
+
487
+ # Plot the statistics
488
+ plt.figure(figsize=(15, 6))
489
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
490
+ plt.title(title)
491
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
492
+
493
+ # Set the rotation of the x-axis labels to 0 degrees
494
+ plt.xticks(rotation=0)
495
+
496
+ # Format the y-axis to display as percentage
497
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
498
+
499
+ # get the max value of the y-axis
500
+ a1 = max(best_afrp)
501
+ a2 = max(best_f1)
502
+
503
+ max_value = max([a1, a2]) * 1.12
504
+ print("max_value:", max_value)
505
+
506
+ # Set the y-axis limit to the max value plus ~12% headroom
507
+ ax.set_ylim(0, max_value)
508
+
509
+ # Add the values above each bar
510
+ for p in ax.patches:
511
+ ax.annotate(
512
+ f"{p.get_height() * 100:.1f}",
513
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
514
+ ha="center",
515
+ va="bottom",
516
+ xytext=(0, 10),
517
+ textcoords="offset points",
518
+ rotation=90,
519
+ )
520
+
521
+ plt.show()
522
+ return data_pivoted, best_mtr
523
+
524
+
525
+ def plot_best_performance(
526
+ result,
527
+ models=None,
528
+ title="Models with Best F1 Score",
529
+ adjusted_f1=False,
530
+ ref_result=None,
531
+ ):
532
+ # Initialize lists to store the statistics
533
+ model_names = []
534
+ best_precision = []
535
+ best_recall = []
536
+ best_f1 = []
537
+ best_repetition_penalty = []
538
+ best_mtr = []
539
+
540
+ if models is None:
541
+ models = result.keys()
542
+ for model in models:
543
+ print(f"model: {model}")
544
+ df = result[model]["df_overall"]
545
+
546
+ # Calculate the statistics
547
+ precision = [
548
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+ recall = [
551
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
552
+ ]
553
+ newline_score = [
554
+ df["newline_score"].mean()
555
+ for df in result[model]["df_list_repetition_penalty"]
556
+ ]
557
+
558
+ repetition_score = [
559
+ df["repetition_score"].mean()
560
+ for df in result[model]["df_list_repetition_penalty"]
561
+ ]
562
+
563
+ if adjusted_f1:
564
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
565
+ result[model], precision, recall
566
+ )
567
+
568
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
569
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(precision, recall)]
570
+
571
+ best_f1.append(max(f1))
572
+ best_f1_index = f1.index(best_f1[-1])
573
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
574
+
575
+ best_precision.append(precision[best_f1_index])
576
+ best_recall.append(recall[best_f1_index])
577
+ best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
578
+
579
+ print(
580
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
581
+ )
582
+
583
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
584
+
585
+ model_names.append(
586
+ f"{model} (RP={best_repetition_penalty[-1]})"
587
+ ) # Add the model name to the list
588
+
589
+ # print sum for columns: newline_score, repetition_score
590
+ print(
591
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
592
+ )
593
+
594
+ if ref_result is not None:
595
+ print("ref_result:", ref_result)
596
+ for model in ref_result.keys():
597
+ model_names.append(model)
598
+ df = pd.read_csv(ref_result[model])
599
+ # df = df[df["id"].isin(wikidata_df["id"])]
600
+
601
+ best_precision.append(df["precision"].mean())
602
+ best_recall.append(df["recall"].mean())
603
+ f1 = (
604
+ 2
605
+ * (best_precision[-1] * best_recall[-1])
606
+ / (best_precision[-1] + best_recall[-1])
607
+ )
608
+ # best_f1.append(df["f1"].mean())
609
+ best_f1.append(f1)
610
+ best_mtr.append(0)
611
+
612
+ # Create a DataFrame with the statistics
613
+ data = (
614
+ pd.DataFrame(
615
+ {
616
+ "Model": model_names,
617
+ "Adjusted Precision with RP": best_precision,
618
+ "Adjusted Recall with RP": best_recall,
619
+ "Adjusted F1 with RP": best_f1,
620
+ }
621
+ )
622
+ if adjusted_f1
623
+ else pd.DataFrame(
624
+ {
625
+ "Model": model_names,
626
+ "Precision": best_precision,
627
+ "Recall": best_recall,
628
+ "F1": best_f1,
629
+ }
630
+ )
631
+ )
632
+ columns = list(data.columns)
633
+
634
+ # Melt the DataFrame to a long format
635
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
636
+
637
+ # Pivot the DataFrame to a wide format
638
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
639
+
640
+ # make sure the columns are following the order of the models
641
+ data_pivoted = data_pivoted[model_names]
642
+
643
+ # keep the metric rows in the order: precision, recall, F1
644
+ data_pivoted = data_pivoted.reindex(columns[1:])
645
+
646
+ # Plot the statistics
647
+ plt.figure(figsize=(10, 6))
648
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
649
+ plt.title(title)
650
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
651
+
652
+ # Set the rotation of the x-axis labels to 0 degrees
653
+ plt.xticks(rotation=0)
654
+
655
+ # Format the y-axis to display as percentage
656
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
657
+
658
+ # get the max value of the y-axis
659
+ a1 = max(best_precision)
660
+ a2 = max(best_recall)
661
+ a3 = max(best_f1)
662
+
663
+ max_value = max([a1, a2, a3]) * 1.12
664
+ print("max_value:", max_value)
665
+
666
+ # Set the y-axis limit with some headroom above the tallest bar
667
+ ax.set_ylim(0, max_value)
668
+
669
+ # Add the values above each bar
670
+ for p in ax.patches:
671
+ ax.annotate(
672
+ f"{p.get_height() * 100:.1f}",
673
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
674
+ ha="center",
675
+ va="bottom",
676
+ xytext=(0, 10),
677
+ textcoords="offset points",
678
+ rotation=90,
679
+ )
680
+
681
+ plt.show()
682
+ return data_pivoted, best_mtr
683
+
684
+
685
+ def plot_best_performance_ms_macro(
686
+ result,
687
+ models=None,
688
+ title="Models with Best RAP - Performance",
689
+ ref_result=None,
690
+ skip_generic_prompt=False,
691
+ include_adjusted_performance=True,
692
+ ):
693
+ # Initialize lists to store the statistics
694
+ model_names = []
695
+ best_f1 = []
696
+ best_afrp = []
697
+ best_repetition_penalty = []
698
+ best_bleu1 = []
699
+ best_rougeL = []
700
+ best_mtr = []
701
+
702
+ if models is None:
703
+ models = result.keys()
704
+ for model in models:
705
+ if skip_generic_prompt and "generic prompt" in model:
706
+ continue
707
+ print(f"model: {model}")
708
+ df = result[model]["df_overall"]
709
+
710
+ # Calculate the statistics
711
+ bleu1 = list(df["bleu1"])
712
+ rougeL = list(df["rougeL"])
713
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
714
+
715
+ newline_score = [
716
+ df["newline_score"].mean()
717
+ for df in result[model]["df_list_repetition_penalty"]
718
+ ]
719
+ # print(f"newline_score: {newline_score}")
720
+
721
+ repetition_score = [
722
+ df["repetition_score"].mean()
723
+ for df in result[model]["df_list_repetition_penalty"]
724
+ ]
725
+ # print(f"repetition_score: {repetition_score}")
726
+
727
+ afrp = [
728
+ f / math.log10(10 + n + r)
729
+ for f, n, r in zip(f1, newline_score, repetition_score)
730
+ ]
731
+
732
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
733
+ best_afrp_index = (
734
+ afrp.index(best_afrp[-1])
735
+ if include_adjusted_performance
736
+ else f1.index(best_afrp[-1])
737
+ )
738
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
739
+
740
+ best_f1.append(f1[best_afrp_index])
741
+ best_bleu1.append(bleu1[best_afrp_index])
742
+ best_rougeL.append(rougeL[best_afrp_index])
743
+ best_mtr.append(
744
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
745
+ )
746
+
747
+ # print(
748
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
749
+ # )
750
+
751
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
752
+
753
+ model_names.append(
754
+ f"{model} (RP={best_repetition_penalty[-1]})"
755
+ ) # Add the model name to the list
756
+
757
+ if ref_result is not None:
758
+ print("ref_result:", ref_result)
759
+ for model in ref_result.keys():
760
+ model_names.append(model)
761
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
762
+ # df = df[df["id"].isin(wikidata_df["id"])]
763
+
764
+ p = df["bleu1"][0]
765
+ best_bleu1.append(p)
766
+
767
+ r = df["rougeL"][0]
768
+ best_rougeL.append(r)
769
+
770
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
771
+ best_f1.append(f1)
772
+ best_afrp.append(f1)
773
+ best_mtr.append(0)
774
+
775
+ # print("model_names:", model_names)
776
+ # print("best_f1:", best_f1)
777
+ # print("best_afrp:", best_afrp)
778
+
779
+ # Create a DataFrame with the statistics
780
+ data = (
781
+ pd.DataFrame(
782
+ {
783
+ "Model": model_names,
784
+ "RAP - Perf Score": best_afrp,
785
+ "Overall Perf Score": best_f1,
786
+ }
787
+ )
788
+ if include_adjusted_performance
789
+ else pd.DataFrame(
790
+ {
791
+ "Model": model_names,
792
+ "Bleu-1": best_bleu1,
793
+ "Rouge-L": best_rougeL,
794
+ "Overall Perf Score": best_f1,
795
+ }
796
+ )
797
+ )
798
+
799
+ # Melt the DataFrame to a long format
800
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
801
+
802
+ # Pivot the DataFrame to a wide format
803
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
804
+
805
+ # make sure the columns are following the order of the models
806
+ data_pivoted = data_pivoted[model_names]
807
+
808
+ columns = list(data.columns)
809
+ data_pivoted = data_pivoted.reindex(columns[1:])
810
+
811
+ # Plot the statistics
812
+ plt.figure(figsize=(10, 6))
813
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
814
+ plt.title(title)
815
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
816
+
817
+ # Set the rotation of the x-axis labels to 0 degrees
818
+ plt.xticks(rotation=0)
819
+
820
+ # Format the y-axis to display as percentage
821
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
822
+
823
+ # get the max value of the y-axis
824
+ a1 = max(best_afrp)
825
+ a2 = max(best_f1)
826
+ a3 = max(best_bleu1)
827
+ a4 = max(best_rougeL)
828
+
829
+ max_value = (
830
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
831
+ )
832
+ print("max_value:", max_value)
833
+
834
+ # Set the y-axis limit with some headroom above the tallest bar
835
+ ax.set_ylim(0, max_value)
836
+
837
+ # Add the values above each bar
838
+ for p in ax.patches:
839
+ ax.annotate(
840
+ f"{p.get_height() * 100:.1f}",
841
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
842
+ ha="center",
843
+ va="bottom",
844
+ xytext=(0, 10),
845
+ textcoords="offset points",
846
+ rotation=90,
847
+ )
848
+
849
+ plt.show()
850
+ return data_pivoted, best_mtr
851
+
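# --- Editor's note: illustrative sketch, not part of this commit ---
# The "Overall Perf Score" used above is the harmonic mean of Bleu-1 and
# Rouge-L, guarded against the degenerate all-zero case; the values below
# are hypothetical.
bleu1_example, rougeL_example = 0.42, 0.38
overall_perf_example = (
    2 * bleu1_example * rougeL_example / (bleu1_example + rougeL_example)
    if bleu1_example + rougeL_example > 0
    else 0.0
)
print(f"Overall Perf Score: {overall_perf_example:.4f}")  # ~0.3990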
852
+
853
+ all_open_source_models = [
854
+ "gemma-1.1-2b-it",
855
+ "Phi-3-mini-128k-instruct",
856
+ "gemma-1.1-7b-it",
857
+ "Llama-2-7b-chat-hf",
858
+ "Mistral-7B-Instruct-v0.2",
859
+ "Meta-Llama-3-8B-Instruct",
860
+ "Llama-2-13b-chat-hf",
861
+ "Llama-2-70b-chat-hf",
862
+ "Meta-Llama-3-70B-Instruct",
863
+ ]
864
+
865
+
866
+ non_rag_csv_result_files = [
867
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
868
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
869
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
870
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
871
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
872
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
873
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
874
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
875
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
876
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
877
+ ]
878
+
879
+ rag_csv_result_files = [
880
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
881
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
882
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
883
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
884
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
885
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
886
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
887
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
888
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
889
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
890
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
891
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
892
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
893
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
894
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
895
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
896
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
897
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
898
+ ]
899
+
900
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
901
+
902
+
903
+ def load_for_repetition_penalty_ms_macro(
904
+ csv_result_file, repetition_penalty, force_recalculate=False
905
+ ):
906
+ result_file = replace_last(
907
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
908
+ )
909
+ df = load_with_newline_and_repetition_scores(
910
+ result_file, force_recalculate=force_recalculate
911
+ )
912
+
913
+ if len(df) != len(df_ms_macro):
914
+ print(f"error: len(df) != {len(df_ms_macro)}")
915
+ missing_ids = [
916
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
917
+ ]
918
+ print(f"missing_ids: {missing_ids}")
919
+
920
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
921
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
922
+ print("ground_truth updated for:", result_file)
923
+ df.to_csv(result_file, index=False)
924
+ return df
925
+
926
+
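# --- Editor's note: illustrative sketch, not part of this commit ---
# load_for_repetition_penalty_ms_macro reads one result file per repetition
# penalty by rewriting the trailing ".csv". The helper below only approximates
# what replace_last (defined earlier in this module) is assumed to do.
def _replace_last_sketch(text, old, new):
    head, sep, tail = text.rpartition(old)
    return head + new + tail if sep else text

print(_replace_last_sketch("data/results/gemma-1.1-2b-it_mm_true.csv", ".csv", f"_RP_{1.1:.3f}.csv"))
# -> data/results/gemma-1.1-2b-it_mm_true_RP_1.100.csv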
927
+ # MS MARCO
928
+ def plot_performance_scores_ms_macro(
929
+ result,
930
+ models=None,
931
+ title="Performance",
932
+ ):
933
+ if models is None:
934
+ models = result.keys()
935
+ for model in models:
936
+ print(f"model: {model}")
937
+ df = result[model]["df_overall"]
938
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
939
+
940
+ # Calculate the statistics
941
+ bleu1 = list(df["bleu1"])
942
+ rougeL = list(df["rougeL"])
943
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
944
+ best_f1 = max(f1)
945
+ best_f1_index = f1.index(best_f1)
946
+
947
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
948
+ result[model], bleu1, rougeL
949
+ )
950
+ afrp = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
951
+
952
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
953
+ best_afrp = max(afrp)
954
+ best_afrp_index = afrp.index(best_afrp)
955
+
956
+ repetition_penalties = list(df["repetition_penalty"])
957
+
958
+ # line plot for precision, recall, f1
959
+ plt.figure(figsize=(10, 6))
960
+
961
+ plt.axvspan(
962
+ repetition_penalties[best_f1_index] - 0.01,
963
+ repetition_penalties[best_f1_index] + 0.01,
964
+ alpha=0.5,
965
+ edgecolor="none",
966
+ facecolor="blue",
967
+ )
968
+
969
+ plt.axvspan(
970
+ repetition_penalties[best_afrp_index] - 0.01,
971
+ repetition_penalties[best_afrp_index] + 0.01,
972
+ alpha=0.5,
973
+ edgecolor="none",
974
+ facecolor="orange",
975
+ )
976
+
977
+ plt.plot(
978
+ repetition_penalties,
979
+ f1,
980
+ label="Overall Perf Score",
981
+ marker="D",
982
+ color="blue",
983
+ )
984
+ plt.plot(
985
+ repetition_penalties,
986
+ afrp,
987
+ label="RAP - Perf Score",
988
+ marker="o",
989
+ color="orange",
990
+ )
991
+
992
+ plt.xlabel("Repetition Penalties")
993
+ plt.ylabel("Score")
994
+ # plt.xlim(0.99, 1.31)
995
+ # y in percentage
996
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
997
+ plt.title(f"{model} {title}")
998
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
999
+
1000
+ plt.show()
1001
+
1002
+
1003
+ def plot_repetition_factors(result, groups):
1004
+ for group in groups:
1005
+ # Plot the statistics
1006
+ plt.figure(figsize=(10, 6))
1007
+
1008
+ max_value = 0
1009
+ for model in result.keys():
1010
+ if group not in model.lower():
1011
+ continue
1012
+ print(f"model: {model}")
1013
+ df = result[model]["df_overall"]
1014
+ repetition_penalties = [
1015
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1016
+ ]
1017
+
1018
+ mean_score = [
1019
+ # math.log10(10 + df["total_repetitions"].mean())
1020
+ df["total_repetitions"].mean()
1021
+ for df in result[model]["df_list_repetition_penalty"]
1022
+ ]
1023
+
1024
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1025
+
1026
+ new_max = max(mean_score)
1027
+ if new_max > max_value:
1028
+ max_value = new_max
1029
+
1030
+ max_value = max_value * 1.05
1031
+ # if max_value < 1.5:
1032
+ # max_value = 1.5
1033
+ # set ylimit
1034
+ plt.ylim(0, max_value)
1035
+
1036
+ # show grid
1037
+ plt.grid(True)
1038
+ plt.xlabel("Repetition Penalties")
1039
+ plt.ylabel("Mean Total Repetitions")
1040
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1041
+ plt.legend()
1042
+
1043
+ plt.show()
1044
+
1045
+
1046
+ def plot_repetition_factors_by_group(result, group_filter=None):
1047
+ markers = ["D", "o", "s", "x"]
1048
+ colors = ["blue", "orange", "green", "red"]
1049
+
1050
+ # Plot the statistics
1051
+ plt.figure(figsize=(10, 6))
1052
+ index = 0
1053
+ max_value = 0
1054
+
1055
+ for model in result.keys():
1056
+ if group_filter is not None and group_filter not in model:
1057
+ continue
1058
+
1059
+ print(f"model: {model}")
1060
+
1061
+ df = result[model]["df_overall"]
1062
+ repetition_penalties = [
1063
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1064
+ ]
1065
+
1066
+ # Calculate the statistics
1067
+ mean_score = [
1068
+ # math.log10(10 + df["total_repetitions"].mean())
1069
+ df["total_repetitions"].mean()
1070
+ for df in result[model]["df_list_repetition_penalty"]
1071
+ ]
1072
+ if len(mean_score) != len(repetition_penalties):
1073
+ print(
1074
+ f"model: {model} has different length of repetition penalties and mean score"
1075
+ )
1076
+ print("repetition_panelties:", len(repetition_panelties))
1077
+ print("mean_score:", len(mean_score))
1078
+ continue
1079
+
1080
+ new_max = max(mean_score)
1081
+ if new_max > max_value:
1082
+ max_value = new_max
1083
+
1084
+ sns.lineplot(
1085
+ x=repetition_penalties,
1086
+ y=mean_score,
1087
+ label=model,
1088
+ marker=markers[index],
1089
+ color=colors[index],
1090
+ )
1091
+
1092
+ index += 1
1093
+
1094
+ max_value = max_value * 1.05
1095
+ # if max_value < 1.5:
1096
+ # max_value = 1.5
1097
+ # set ylimit
1098
+ plt.ylim(0, max_value)
1099
+ max_value = 0
1100
+
1101
+ plt.xlabel("Repetition Penalties")
1102
+ plt.ylabel("Mean Total Repetitions")
1103
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1104
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1105
+
1106
+ plt.show()
1107
+
1108
+
1109
+ ms_marco_csv_result_files = [
1110
+ "data/results/gemma-1.1-2b-it_mm_true_false.csv",
1111
+ "data/results/gemma-1.1-2b-it_mm_true.csv",
1112
+ "data/results/gemma-1.1-2b-it_mm_true_false_non_rag.csv",
1113
+ "data/results/Phi-3-mini-128k-instruct_mm_false.csv",
1114
+ "data/results/Phi-3-mini-128k-instruct_mm_true.csv",
1115
+ "data/results/Phi-3-mini-128k-instruct_mm_non_rag.csv",
1116
+ "data/results/gemma-1.1-7b-it_mm_false.csv",
1117
+ "data/results/gemma-1.1-7b-it_mm_true.csv",
1118
+ "data/results/gemma-1.1-7b-it_mm_non_rag.csv",
1119
+ "data/results/Llama-2-7b-chat-hf_mm_true_false.csv",
1120
+ "data/results/Llama-2-7b-chat-hf_mm_true.csv",
1121
+ "data/results/Llama-2-7b-chat-hf_mm_true_false_non_rag.csv",
1122
+ "data/results/Mistral-7B-Instruct-v0.2_mm_false.csv",
1123
+ "data/results/Mistral-7B-Instruct-v0.2_mm_true.csv",
1124
+ "data/results/Mistral-7B-Instruct-v0.2_mm_non_rag.csv",
1125
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false.csv",
1126
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true.csv",
1127
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false_non_rag.csv",
1128
+ "data/results/Llama-2-13b-chat-hf_mm_false.csv",
1129
+ "data/results/Llama-2-13b-chat-hf_mm_true.csv",
1130
+ "data/results/Llama-2-13b-chat-hf_mm_non_rag.csv",
1131
+ "data/results/Llama-2-70b-chat-hf_mm_false.csv",
1132
+ "data/results/Llama-2-70b-chat-hf_mm_true.csv",
1133
+ "data/results/Llama-2-70b-chat-hf_mm_non_rag.csv",
1134
+ "data/results/Meta-Llama-3-70B-Instruct_mm_false.csv",
1135
+ "data/results/Meta-Llama-3-70B-Instruct_mm_true.csv",
1136
+ "data/results/Meta-Llama-3-70B-Instruct_mm_non_rag.csv",
1137
+ ]
1138
+
1139
+ webqsp_csv_result_files = []
1140
+ webqsp_model_result_counts = {}
1141
+
1142
+
1143
+ def find_model_name(file_path):
1144
+ df = pd.read_csv(file_path, comment="#", on_bad_lines="warn")
1145
+ return df["model"][0]
1146
+
1147
+
1148
+ def add_file(file):
1149
+ model_name = find_model_name(file)
1150
+ if "(generic prompt)" not in model_name:
1151
+ webqsp_csv_result_files.append(file)
1152
+ if model_name not in webqsp_model_result_counts:
1153
+ webqsp_model_result_counts[model_name] = 1
1154
+ else:
1155
+ webqsp_model_result_counts[model_name] += 1
1156
+
1157
+
1158
+ last_model_name = None
1159
+ non_rag_index = 0
1160
+
1161
+ for csv_result_file in rag_csv_result_files:
1162
+ try:
1163
+ model_name = find_model_name(csv_result_file)
1164
+ # print(f"processing model: {model_name} - {csv_result_file}")
1165
+
1166
+ if last_model_name != model_name and last_model_name is not None:
1167
+ while non_rag_index < len(non_rag_csv_result_files):
1168
+ # print(f"processing non-rag file - {file}")
1169
+ file = non_rag_csv_result_files[non_rag_index]
1170
+ non_model_name = find_model_name(file)
1171
+ if non_model_name.startswith(last_model_name):
1172
+ add_file(file)
1173
+ non_rag_index += 1
1174
+ else:
1175
+ break
1176
+
1177
+ add_file(csv_result_file)
1178
+ last_model_name = model_name
1179
+ except FileNotFoundError as e:
1180
+ print("\terror processing file: ", csv_result_file, e)
1181
+ continue
1182
+
1183
+ for file in non_rag_csv_result_files[non_rag_index:]:
1184
+ add_file(file)
1185
+
1186
+
1187
+ def calc_rap_scores(result, precision="precision", recall="recall"):
1188
+ newline_score = [
1189
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1190
+ ]
1191
+
1192
+ repetition_score = [
1193
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1194
+ ]
1195
+
1196
+ if precision in result["df_list_repetition_penalty"][0].columns:
1197
+ precision = [
1198
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1199
+ ]
1200
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1201
+ else:
1202
+ precision = result["df_overall"][precision]
1203
+ recall = result["df_overall"][recall]
1204
+
1205
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(precision, recall)]
1206
+
1207
+ rap = [
1208
+ f / math.log10(10 + n + r)
1209
+ for f, n, r in zip(f1, newline_score, repetition_score)
1210
+ ]
1211
+
1212
+ return newline_score, repetition_score, f1, rap
1213
+
1214
+
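# --- Editor's note: illustrative sketch, not part of this commit ---
# RAP divides the performance score by log10(10 + mean repeated characters):
# a clean answer (0 repetitions) keeps its score since log10(10) == 1, while
# heavy repetition shrinks it. The numbers below are hypothetical.
import math

f1_example = 0.60        # hypothetical mean F1 at one repetition penalty
newline_chars = 15.0     # hypothetical mean newline_score
repeated_chars = 35.0    # hypothetical mean repetition_score
rap_example = f1_example / math.log10(10 + newline_chars + repeated_chars)
print(f"RAP = {rap_example:.4f}")  # 0.60 / log10(60) ≈ 0.3374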
1215
+ def load_webqsp_result(csv_result_files, force_recalculate=False):
1216
+ model_name_exts = {
1217
+ "true": "(RAG - Chat Template)",
1218
+ "wd": "(RAG - Generic Prompt)",
1219
+ "rag": "(Non-RAG)",
1220
+ }
1221
+
1222
+ result = {}
1223
+ for i, csv_result_file in enumerate(csv_result_files):
1224
+ try:
1225
+ df = pd.read_csv(csv_result_file)
1226
+ parts = re.split(r"[_\.]", csv_result_file)
1227
+ if parts[-2] in model_name_exts.keys():
1228
+ key = parts[-2]
1229
+ elif csv_result_file in non_rag_csv_result_files:
1230
+ key = "rag"
1231
+ else:
1232
+ key = "wd"
1233
+ model_name = f'{df["model"][0]}{model_name_exts[key]}'
1234
+ dfs = [
1235
+ calculate_performance_score(
1236
+ csv_result_file,
1237
+ repetition_penalty,
1238
+ force_recalculate=force_recalculate,
1239
+ )
1240
+ for repetition_penalty in df["repetition_penalty"]
1241
+ ]
1242
+
1243
+ answer_lens = []
1244
+ for df_rpp in dfs:
1245
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1246
+ lambda x: len(x) if isinstance(x, str) else 0
1247
+ )
1248
+ answer_lens.append(df_rpp["answer_len"].mean())
1249
+
1250
+ result[model_name] = {
1251
+ "df_overall": df,
1252
+ "df_list_repetition_penalty": dfs,
1253
+ "file": csv_result_file,
1254
+ }
1255
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1256
+ result[model_name]
1257
+ )
1258
+ df["newline_score"] = newline_score
1259
+ df["repetition_score"] = repetition_score
1260
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1261
+ df["answer_len"] = answer_lens
1262
+ df["perf"] = perf
1263
+ df["rap"] = rap
1264
+ except Exception as e:
1265
+ print(f"Error: {e}")
1266
+
1267
+ return result
1268
+
1269
+
1270
+ def load_ms_marco_result(csv_result_files, force_recalculate=False):
1271
+ model_name_exts = {
1272
+ "true": "(RAG - Chat Template)",
1273
+ "false": "(RAG - Generic Prompt)",
1274
+ "rag": "(Non-RAG)",
1275
+ }
1276
+
1277
+ result = {}
1278
+ for csv_result_file in csv_result_files:
1279
+ try:
1280
+ df = pd.read_csv(csv_result_file)
1281
+
1282
+ parts = re.split(r"[_\.]", csv_result_file)
1283
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1284
+
1285
+ print(f"\tmodel_name: {model_name}")
1286
+ dfs = [
1287
+ load_for_repetition_penalty_ms_macro(
1288
+ csv_result_file,
1289
+ repetition_penalty,
1290
+ force_recalculate=force_recalculate,
1291
+ )
1292
+ for repetition_penalty in df["repetition_penalty"]
1293
+ ]
1294
+
1295
+ answer_lens = []
1296
+ for df_rpp in dfs:
1297
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1298
+ lambda x: len(x) if isinstance(x, str) else 0
1299
+ )
1300
+ answer_lens.append(df_rpp["answer_len"].mean())
1301
+
1302
+ result[model_name] = {
1303
+ "df_overall": df,
1304
+ "df_list_repetition_penalty": dfs,
1305
+ "file": csv_result_file,
1306
+ }
1307
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1308
+ result[model_name],
1309
+ precision="bleu1",
1310
+ recall="rougeL",
1311
+ )
1312
+ df["newline_score"] = newline_score
1313
+ df["repetition_score"] = repetition_score
1314
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1315
+ df["answer_len"] = answer_lens
1316
+ df["perf"] = perf
1317
+ df["rap"] = rap
1318
+ except Exception as e:
1319
+ print(f"Error: {e}")
1320
+
1321
+ return result
1322
+
1323
+
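# --- Editor's note: illustrative sketch, not part of this commit ---
# The RAG/Non-RAG label comes from the token just before ".csv" in the result
# file name: "true" -> chat template, "false" -> generic prompt, and "rag"
# (the tail of "_non_rag") -> Non-RAG.
import re

parts_example = re.split(r"[_\.]", "data/results/gemma-1.1-2b-it_mm_true.csv")
print(parts_example[-2])  # -> "true", mapped to "(RAG - Chat Template)"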
1324
+ def load_ms_marco_result_v2(csv_result_files, force_recalculate=False):
1325
+ model_name_exts = {
1326
+ "true": "(RAG - Chat Template)",
1327
+ "false": "(RAG - Generic Prompt)",
1328
+ "rag": "(Non-RAG)",
1329
+ }
1330
+
1331
+ result = {}
1332
+ for csv_result_file in csv_result_files:
1333
+ try:
1334
+ df = pd.read_csv(csv_result_file)
1335
+
1336
+ parts = re.split(r"[_\.]", csv_result_file)
1337
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1338
+
1339
+ print(f"\tmodel_name: {model_name}")
1340
+ dfs = [
1341
+ load_for_repetition_penalty_ms_macro(
1342
+ csv_result_file,
1343
+ repetition_penalty,
1344
+ force_recalculate=force_recalculate,
1345
+ )
1346
+ for repetition_penalty in df["repetition_penalty"]
1347
+ ]
1348
+
1349
+ answer_lens = []
1350
+ for df_rpp in dfs:
1351
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1352
+ lambda x: len(x) if isinstance(x, str) else 0
1353
+ )
1354
+ answer_lens.append(df_rpp["answer_len"].mean())
1355
+ df["answer_len"] = answer_lens
1356
+
1357
+ meteor_scores = []
1358
+ for df_rpp in dfs:
1359
+ meteor_score = meteor.compute(
1360
+ predictions=df_rpp["answer"], references=df_rpp["ground_truth"]
1361
+ )["meteor"]
1362
+ meteor_scores.append(meteor_score)
1363
+ df["meteor_scores"] = meteor_scores
1364
+
1365
+ result[model_name] = {
1366
+ "df_overall": df,
1367
+ "df_list_repetition_penalty": dfs,
1368
+ "file": csv_result_file,
1369
+ }
1370
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1371
+ result[model_name],
1372
+ precision="meteor_scores",
1373
+ recall="meteor_scores",
1374
+ )
1375
+ df["newline_score"] = newline_score
1376
+ df["repetition_score"] = repetition_score
1377
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1378
+ df["perf"] = perf
1379
+ df["rap"] = rap
1380
+ except Exception as e:
1381
+ print(f"Error: {e}")
1382
+
1383
+ return result
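# --- Editor's note: illustrative sketch, not part of this commit ---
# load_ms_marco_result_v2 scores every per-repetition-penalty dataframe with
# METEOR from the Hugging Face `evaluate` library; the module-level `meteor`
# object used above is assumed to come from evaluate.load("meteor").
import evaluate

meteor_metric = evaluate.load("meteor")
meteor_value = meteor_metric.compute(
    predictions=["Paris is the capital of France."],
    references=["The capital of France is Paris."],
)["meteor"]
print(f"METEOR: {meteor_value:.4f}")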
notebooks/00_Repetition_Algorithms_Comparison.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/04_RAPGeT_v2.ipynb ADDED
The diff for this file is too large to render. See raw diff