dh-mc committed on
Commit 04522ca
1 Parent(s): 8f1a330

added comparison of different repetition detection methods

eval_modules/calc_repetitions.py CHANGED
@@ -7,6 +7,9 @@ import matplotlib.pyplot as plt
 import matplotlib.ticker as mtick
 import seaborn as sns
 import nltk
+import evaluate
+
+meteor = evaluate.load("meteor")
 
 print(f"loading: {__file__}")
 
@@ -1316,3 +1319,65 @@ def load_ms_marco_result(csv_result_files, force_recalculate=False):
             print(f"Error: {e}")
 
     return result
+
+
+def load_ms_marco_result_v2(csv_result_files, force_recalculate=False):
+    model_name_exts = {
+        "true": "(RAG - Chat Template)",
+        "false": "(RAG - Generic Prompt)",
+        "rag": "(Non-RAG)",
+    }
+
+    result = {}
+    for csv_result_file in csv_result_files:
+        try:
+            df = pd.read_csv(csv_result_file)
+
+            parts = re.split(r"[_\.]", csv_result_file)
+            model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
+
+            print(f"\tmodel_name: {model_name}")
+            dfs = [
+                load_for_repetition_penalty_ms_macro(
+                    csv_result_file,
+                    repetition_penalty,
+                    force_recalculate=force_recalculate,
+                )
+                for repetition_penalty in df["repetition_penalty"]
+            ]
+
+            answer_lens = []
+            for df_rpp in dfs:
+                df_rpp["answer_len"] = df_rpp["answer"].apply(
+                    lambda x: len(x) if isinstance(x, str) else 0
+                )
+                answer_lens.append(df_rpp["answer_len"].mean())
+            df["answer_len"] = answer_lens
+
+            meteor_scores = []
+            for df_rpp in dfs:
+                meteor_score = meteor.compute(
+                    predictions=df_rpp["answer"], references=df_rpp["ground_truth"]
+                )["meteor"]
+                meteor_scores.append(meteor_score)
+            df["meteor_scores"] = meteor_scores
+
+            result[model_name] = {
+                "df_overall": df,
+                "df_list_repetition_penalty": dfs,
+                "file": csv_result_file,
+            }
+            newline_score, repetition_score, perf, rap = calc_rap_scores(
+                result[model_name],
+                precision="meteor_scores",
+                recall="meteor_scores",
+            )
+            df["newline_score"] = newline_score
+            df["repetition_score"] = repetition_score
+            df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
+            df["perf"] = perf
+            df["rap"] = rap
+        except Exception as e:
+            print(f"Error: {e}")
+
+    return result
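For orientation, a minimal sketch of how the new loader might be driven. The CSV paths below are hypothetical, the snippet assumes the module has been imported (or is run inside this file), and calc_rap_scores is assumed to be defined elsewhere in the module:

# Illustrative only: placeholder result files following the *_true.csv / *_false.csv naming
csv_result_files = [
    "./data/results/some-model_mm_true.csv",
    "./data/results/some-model_mm_false.csv",
]

result = load_ms_marco_result_v2(csv_result_files)

for model_name, entry in result.items():
    overall = entry["df_overall"]
    # one METEOR score and one repetition count per repetition_penalty value
    print(model_name)
    print(overall[["repetition_penalty", "meteor_scores", "total_repetitions", "rap"]])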
eval_modules/calc_repetitions_v1.py ADDED
@@ -0,0 +1,929 @@
+import re
+import math
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import seaborn as sns
+
+# final version
+pattern_abnormal_newlines = re.compile(r"\n{5,}")
+pattern_text_repetitions = re.compile(r"\b(\w.+?)\b(\1+)", re.M | re.DOTALL)
+exception_pattern = re.compile(r"(\w+\.)\1")
+
+
+# final version for repetition detection
+def detect_repetitions(
+    text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+):
+    subtotals = [0, 0]
+
+    if isinstance(text, str):
+        patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+        for i, pattern in enumerate(patterns):
+            if debug:
+                print(
+                    f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                )
+            matches = pattern.finditer(text)
+            for match in matches:
+                if debug:
+                    print(match)
+                    for groupNum in range(0, len(match.groups())):
+                        groupNum = groupNum + 1
+                        print(
+                            "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                groupNum=groupNum,
+                                start=match.start(groupNum),
+                                end=match.end(groupNum),
+                                group=match.group(groupNum),
+                            )
+                        )
+
+                if exception_pattern.match(match[0]):
+                    if debug:
+                        print("ignored: ", match[0])
+                    continue
+
+                start, end = match.span()
+                subtotals[i] += end - start
+
+    result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+    if debug:
+        print(result)
+    return result
+
+
+def detect_abnormal_newlines(text, debug=False):
+    return detect_repetitions(text, debug=debug)[0]
+
+
+def detect_text_repetitions(text, debug=False):
+    return detect_repetitions(text, debug=debug)[1]
+
+
+def detect_scores(text, debug=False):
+    newline_score, repetition_score, total_repetitions = detect_repetitions(
+        text, debug=debug
+    )
+    return pd.Series([newline_score, repetition_score, total_repetitions])
+
+
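A small sketch of how this first (v1) detector behaves on a hand-made string; the sample text is invented for illustration, and the second score is the total length of text covered by detected repetitions:

sample = "The answer is 42. The answer is 42. The answer is 42."
newline_chars, repeated_chars, total = detect_repetitions(sample)
# no run of five or more newlines, so newline_chars stays 0;
# repeated_chars counts the characters spanned by the repeated phrase
print(newline_chars, repeated_chars, total)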
+def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
+    print(f"loading result file: {result_file}")
+    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
+
+    if (
+        force_recalculate
+        or "newline_score" not in df.columns
+        or "repetition_score" not in df.columns
+        or "total_repetitions" not in df.columns
+    ):
+        df[["newline_score", "repetition_score", "total_repetitions"]] = df[
+            "answer"
+        ].apply(detect_scores)
+        df.to_csv(result_file, index=False)
+
+    return df
+
+
+def replace_last(source_string, old_string, new_string):
+    head, _sep, tail = source_string.rpartition(old_string)
+    return head + new_string + tail
+
+
+def load_for_repetition_penalty(
+    csv_result_file, repetition_penalty, force_recalculate=False
+):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
+    )
+    return load_with_newline_and_repetition_scores(
+        result_file, force_recalculate=force_recalculate
+    )
+
+
+def calc_adjusted_performance(f, r):
+    return f / math.log10(10 + r)
+
+
+def calculate_adjusted_performance(row):
+    r = row["total_repetitions"]
+    adjusted_precision = calc_adjusted_performance(row["precision"], r)
+    adjusted_recall = calc_adjusted_performance(row["recall"], r)
+    return pd.Series([adjusted_precision, adjusted_recall])
+
+
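A quick worked example of the damping used by calc_adjusted_performance, just to make the scale concrete: the divisor is log10(10 + r), so a clean answer keeps its score and roughly 90 repeated characters halve it.

# Illustrative values only
print(calc_adjusted_performance(0.8, 0))    # log10(10)   = 1.0 -> 0.8
print(calc_adjusted_performance(0.8, 90))   # log10(100)  = 2.0 -> 0.4
print(calc_adjusted_performance(0.8, 990))  # log10(1000) = 3.0 -> ~0.267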
+def load_performance_df(csv_result_file, repetition_penalty):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
+    )
+    result_file = result_file.replace("/results/", "/eval/")
+    print(f"loading json file: {result_file}")
+    df = pd.read_json(result_file)
+
+    return df
+
+
+def calculate_performance_score(
+    csv_result_file, repetition_penalty, force_recalculate=False
+):
+    result_file = replace_last(
+        csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
+    )
+    print(f"loading result file: {result_file}")
+    df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
+
+    if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
+        df.drop(
+            columns=[
+                "precision",
+                "recall",
+                "f1",
+                "f2",
+                "entities_in_answer",
+                "entities_in_question",
+            ],
+            errors="ignore",
+            inplace=True,
+        )
+        perf_df = load_performance_df(csv_result_file, repetition_penalty)
+        filtered_df = perf_df[perf_df["id"].isin(df["id"])]
+        perf_df = filtered_df.reset_index(drop=True)
+        print(f"perf_df len: {len(perf_df)}")
+        # print(perf_df.head())
+
+        df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
+
+        df["precision"] = perf_df["score"].apply(lambda x: x[0])
+        df["recall"] = perf_df["score"].apply(lambda x: x[1])
+
+        df[["adjusted_precision", "adjusted_recall"]] = df.apply(
+            calculate_adjusted_performance, axis=1
+        )
+
+        df.to_csv(result_file, index=False)
+        print(f"performance scores saved to result file: {result_file}")
+
+    print(f"df len: {len(df)}")
+
+    return df
+
+
+def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
+    newline_score = [
+        df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
+    ]
+    print(f"newline_score: {newline_score}")
+
+    repetition_score = [
+        df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
+    ]
+    print(f"repetition_score: {repetition_score}")
+
+    precision = [
+        f / math.log10(10 + n + r)
+        for f, n, r in zip(precision, newline_score, repetition_score)
+    ]
+    recall = [
+        f / math.log10(10 + n + r)
+        for f, n, r in zip(recall, newline_score, repetition_score)
+    ]
+
+    return precision, recall
+
196
+
197
+ def plot_performance_scores(
198
+ result,
199
+ models=None,
200
+ title="Performance",
201
+ ):
202
+
203
+ if models is None:
204
+ models = result.keys()
205
+ for model in models:
206
+ print(f"model: {model}")
207
+ df = result[model]["df_overall"]
208
+
209
+ # Calculate the statistics
210
+ precision = [
211
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
212
+ ]
213
+ recall = [
214
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
215
+ ]
216
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
217
+ best_f1 = max(f1)
218
+ best_f1_index = f1.index(best_f1)
219
+
220
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
221
+ result[model], precision, recall
222
+ )
223
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
224
+
225
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
226
+ best_afrp = max(afrp)
227
+ best_afrp_index = afrp.index(best_afrp)
228
+
229
+ adjusted_precision = [
230
+ df["adjusted_precision"].mean()
231
+ for df in result[model]["df_list_repetition_penalty"]
232
+ ]
233
+ adjusted_recall = [
234
+ df["adjusted_recall"].mean()
235
+ for df in result[model]["df_list_repetition_penalty"]
236
+ ]
237
+ afrp2 = [
238
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
239
+ ]
240
+ best_afrp2 = max(afrp2)
241
+ best_afrp2_index = afrp2.index(best_afrp2)
242
+
243
+ repetition_penalties = list(df["repetition_penalty"])
244
+
245
+ # line plot for F1 and the repetition-adjusted F1 variants
246
+ plt.figure(figsize=(10, 6))
247
+
248
+ plt.axvspan(
249
+ repetition_penalties[best_f1_index] - 0.01,
250
+ repetition_penalties[best_f1_index] + 0.01,
251
+ alpha=0.5,
252
+ edgecolor="none",
253
+ facecolor="blue",
254
+ )
255
+
256
+ plt.axvspan(
257
+ repetition_penalties[best_afrp2_index] - 0.01,
258
+ repetition_penalties[best_afrp2_index] + 0.01,
259
+ alpha=0.5,
260
+ edgecolor="none",
261
+ facecolor="green",
262
+ )
263
+
264
+ plt.axvspan(
265
+ repetition_penalties[best_afrp_index] - 0.01,
266
+ repetition_penalties[best_afrp_index] + 0.01,
267
+ alpha=0.5,
268
+ edgecolor="none",
269
+ facecolor="orange",
270
+ )
271
+
272
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
273
+ plt.plot(
274
+ repetition_penalties,
275
+ afrp2,
276
+ label="Per-question RF Adjusted F1",
277
+ marker="s",
278
+ color="green",
279
+ )
280
+ plt.plot(
281
+ repetition_penalties,
282
+ afrp,
283
+ label="Overall RF Adjusted F1",
284
+ marker="o",
285
+ color="orange",
286
+ )
287
+ plt.xlabel("Repetition Penalties")
288
+ plt.ylabel("Score")
289
+ plt.xlim(0.99, 1.31)
290
+ # y in percentage
291
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
292
+ plt.title(f"{model} {title}")
293
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
294
+
295
+ plt.show()
296
+
297
+
298
+ def plot_best_afrp(
299
+ result,
300
+ models=None,
301
+ title="Models with Best Repetition Factor Adjusted F1",
302
+ ref_result=None,
303
+ ):
304
+ # Initialize lists to store the statistics
305
+ model_names = []
306
+ best_f1 = []
307
+ best_afrp = []
308
+ best_repetition_penalty = []
309
+
310
+ if models is None:
311
+ models = result.keys()
312
+ for model in models:
313
+ print(f"model: {model}")
314
+ df = result[model]["df_overall"]
315
+
316
+ # Calculate the statistics
317
+ precision = [
318
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
319
+ ]
320
+ recall = [
321
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
322
+ ]
323
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
324
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
325
+
326
+ newline_score = [
327
+ df["newline_score"].mean()
328
+ for df in result[model]["df_list_repetition_penalty"]
329
+ ]
330
+ print(f"newline_score: {newline_score}")
331
+
332
+ repetition_score = [
333
+ df["repetition_score"].mean()
334
+ for df in result[model]["df_list_repetition_penalty"]
335
+ ]
336
+ print(f"repetition_score: {repetition_score}")
337
+
338
+ afrp = [
339
+ f / math.log10(10 + n + r)
340
+ for f, n, r in zip(f1, newline_score, repetition_score)
341
+ ]
342
+
343
+ best_afrp.append(max(afrp))
344
+ best_afrp_index = afrp.index(best_afrp[-1])
345
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
346
+
347
+ best_f1.append(f1[best_afrp_index])
348
+
349
+ print(
350
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
351
+ )
352
+
353
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
354
+
355
+ model_names.append(
356
+ f"{model} (RP={best_repetition_penalty[-1]})"
357
+ ) # Add the model name to the list
358
+
359
+ if ref_result is not None:
360
+ print("ref_result:", ref_result)
361
+ for model in ref_result.keys():
362
+ model_names.append(model)
363
+ df = pd.read_csv(ref_result[model])
364
+ # df = df[df["id"].isin(wikidata_df["id"])]
365
+
366
+ p = df["precision"].mean()
367
+ r = df["recall"].mean()
368
+
369
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
370
+ best_f1.append(f1)
371
+ best_afrp.append(f1)
372
+
373
+ print("model_names:", model_names)
374
+ print("best_f1:", best_f1)
375
+ print("best_afrp:", best_afrp)
376
+
377
+ # Create a DataFrame with the statistics
378
+ data = pd.DataFrame(
379
+ {
380
+ "Model": model_names,
381
+ "Repetition Factor Adjusted F1": best_afrp,
382
+ "F1": best_f1,
383
+ }
384
+ )
385
+
386
+ # Melt the DataFrame to a long format
387
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
388
+
389
+ # Pivot the DataFrame to a wide format
390
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
391
+
392
+ # make sure the columns are following the order of the models
393
+ data_pivoted = data_pivoted[model_names]
394
+
395
+ # keep the metric rows in order: adjusted F1 first, then F1
396
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
397
+
398
+ # Plot the statistics
399
+ plt.figure(figsize=(10, 6))
400
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
401
+ plt.title(title)
402
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
403
+
404
+ # Set the rotation of the x-axis labels to 0 degrees
405
+ plt.xticks(rotation=0)
406
+
407
+ # Format the y-axis to display as percentage
408
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
409
+
410
+ # get the max value of the y-axis
411
+ a1 = max(best_afrp)
412
+ a2 = max(best_f1)
413
+
414
+ max_value = max([a1, a2]) * 1.12
415
+ print("max_value:", max_value)
416
+
417
+ # Set the y-axis limit with some headroom above the tallest bar
418
+ ax.set_ylim(0, max_value)
419
+
420
+ # Add the values above each bar
421
+ for p in ax.patches:
422
+ ax.annotate(
423
+ f"{p.get_height() * 100:.1f}",
424
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
425
+ ha="center",
426
+ va="bottom",
427
+ xytext=(0, 10),
428
+ textcoords="offset points",
429
+ rotation=90,
430
+ )
431
+
432
+ plt.show()
433
+
434
+
435
+ def plot_best_performance(
436
+ result,
437
+ models=None,
438
+ title="Models with Best F1 Score",
439
+ adjusted_f1=False,
440
+ ref_result=None,
441
+ ):
442
+ # Initialize lists to store the statistics
443
+ model_names = []
444
+ best_precision = []
445
+ best_recall = []
446
+ best_f1 = []
447
+ best_repetition_penalty = []
448
+
449
+ if models is None:
450
+ models = result.keys()
451
+ for model in models:
452
+ print(f"model: {model}")
453
+ df = result[model]["df_overall"]
454
+
455
+ # Calculate the statistics
456
+ precision = [
457
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
458
+ ]
459
+ recall = [
460
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
461
+ ]
462
+
463
+ if adjusted_f1:
464
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
465
+ result[model], precision, recall
466
+ )
467
+
468
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
469
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
470
+
471
+ best_f1.append(max(f1))
472
+ best_f1_index = f1.index(best_f1[-1])
473
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
474
+
475
+ best_precision.append(precision[best_f1_index])
476
+ best_recall.append(recall[best_f1_index])
477
+
478
+ print(
479
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
480
+ )
481
+
482
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
483
+
484
+ model_names.append(
485
+ f"{model} (RP={best_repetition_penalty[-1]})"
486
+ ) # Add the model name to the list
487
+
488
+ # print sum for columns: newline_score, repetition_score
489
+ print(
490
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
491
+ )
492
+
493
+ if ref_result is not None:
494
+ print("ref_result:", ref_result)
495
+ for model in ref_result.keys():
496
+ model_names.append(model)
497
+ df = pd.read_csv(ref_result[model])
498
+ # df = df[df["id"].isin(wikidata_df["id"])]
499
+
500
+ best_precision.append(df["precision"].mean())
501
+ best_recall.append(df["recall"].mean())
502
+ f1 = (
503
+ 2
504
+ * (best_precision[-1] * best_recall[-1])
505
+ / (best_precision[-1] + best_recall[-1])
506
+ )
507
+ # best_f1.append(df["f1"].mean())
508
+ best_f1.append(f1)
509
+
510
+ # Create a DataFrame with the statistics
511
+ data = (
512
+ pd.DataFrame(
513
+ {
514
+ "Model": model_names,
515
+ "Adjusted Precision with RP": best_precision,
516
+ "Adjusted Recall with RP": best_recall,
517
+ "Adjusted F1 with RP": best_f1,
518
+ }
519
+ )
520
+ if adjusted_f1
521
+ else pd.DataFrame(
522
+ {
523
+ "Model": model_names,
524
+ "Precision": best_precision,
525
+ "Recall": best_recall,
526
+ "F1": best_f1,
527
+ }
528
+ )
529
+ )
530
+ columns = list(data.columns)
531
+
532
+ # Melt the DataFrame to a long format
533
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
534
+
535
+ # Pivot the DataFrame to a wide format
536
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
537
+
538
+ # make sure the columns are following the order of the models
539
+ data_pivoted = data_pivoted[model_names]
540
+
541
+ # make sure three groups in the order of precision, recall, f1
542
+ data_pivoted = data_pivoted.reindex(columns[1:])
543
+
544
+ # Plot the statistics
545
+ plt.figure(figsize=(10, 6))
546
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
547
+ plt.title(title)
548
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
549
+
550
+ # Set the rotation of the x-axis labels to 0 degrees
551
+ plt.xticks(rotation=0)
552
+
553
+ # Format the y-axis to display as percentage
554
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
555
+
556
+ # get the max value of the y-axis
557
+ a1 = max(best_precision)
558
+ a2 = max(best_recall)
559
+ a3 = max(best_f1)
560
+
561
+ max_value = max([a1, a2, a3]) * 1.12
562
+ print("max_value:", max_value)
563
+
564
+ # Set the y-axis limit with some headroom above the tallest bar
565
+ ax.set_ylim(0, max_value)
566
+
567
+ # Add the values above each bar
568
+ for p in ax.patches:
569
+ ax.annotate(
570
+ f"{p.get_height() * 100:.1f}",
571
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
572
+ ha="center",
573
+ va="bottom",
574
+ xytext=(0, 10),
575
+ textcoords="offset points",
576
+ rotation=90,
577
+ )
578
+
579
+ plt.show()
580
+
581
+
582
+ def plot_best_performance_ms_macro(
583
+ result,
584
+ models=None,
585
+ title="Models with Best Repetition Factor Adjusted Performance",
586
+ ref_result=None,
587
+ skip_generic_prompt=False,
588
+ include_adjusted_performance=True,
589
+ ):
590
+ # Initialize lists to store the statistics
591
+ model_names = []
592
+ best_f1 = []
593
+ best_afrp = []
594
+ best_repetition_penalty = []
595
+ best_bleu1 = []
596
+ best_rougeL = []
597
+
598
+ if models is None:
599
+ models = result.keys()
600
+ for model in models:
601
+ if skip_generic_prompt and "generic prompt" in model:
602
+ continue
603
+ print(f"model: {model}")
604
+ df = result[model]["df_overall"]
605
+
606
+ # Calculate the statistics
607
+ bleu1 = [x for x in df["bleu1"]]
608
+ rougeL = [x for x in df["rougeL"]]
609
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
610
+
611
+ newline_score = [
612
+ df["newline_score"].mean()
613
+ for df in result[model]["df_list_repetition_penalty"]
614
+ ]
615
+ print(f"newline_score: {newline_score}")
616
+
617
+ repetition_score = [
618
+ df["repetition_score"].mean()
619
+ for df in result[model]["df_list_repetition_penalty"]
620
+ ]
621
+ print(f"repetition_score: {repetition_score}")
622
+
623
+ afrp = [
624
+ f / math.log10(10 + n + r)
625
+ for f, n, r in zip(f1, newline_score, repetition_score)
626
+ ]
627
+
628
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
629
+ best_afrp_index = (
630
+ afrp.index(best_afrp[-1])
631
+ if include_adjusted_performance
632
+ else f1.index(best_afrp[-1])
633
+ )
634
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
635
+
636
+ best_f1.append(f1[best_afrp_index])
637
+ best_bleu1.append(bleu1[best_afrp_index])
638
+ best_rougeL.append(rougeL[best_afrp_index])
639
+
640
+ print(
641
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
642
+ )
643
+
644
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
645
+
646
+ model_names.append(
647
+ f"{model} (RP={best_repetition_penalty[-1]})"
648
+ ) # Add the model name to the list
649
+
650
+ if ref_result is not None:
651
+ print("ref_result:", ref_result)
652
+ for model in ref_result.keys():
653
+ model_names.append(model)
654
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
655
+ # df = df[df["id"].isin(wikidata_df["id"])]
656
+
657
+ p = df["bleu1"][0]
658
+ best_bleu1.append(p)
659
+
660
+ r = df["rougeL"][0]
661
+ best_rougeL.append(r)
662
+
663
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
664
+ best_f1.append(f1)
665
+ best_afrp.append(f1)
666
+
667
+ print("model_names:", model_names)
668
+ print("best_f1:", best_f1)
669
+ print("best_afrp:", best_afrp)
670
+
671
+ # Create a DataFrame with the statistics
672
+ data = (
673
+ pd.DataFrame(
674
+ {
675
+ "Model": model_names,
676
+ "Repetition Factor Adjusted Perf Score": best_afrp,
677
+ "Overall Perf Score": best_f1,
678
+ }
679
+ )
680
+ if include_adjusted_performance
681
+ else pd.DataFrame(
682
+ {
683
+ "Model": model_names,
684
+ "Bleu-1": best_bleu1,
685
+ "Rouge-L": best_rougeL,
686
+ "Overall Perf Score": best_f1,
687
+ }
688
+ )
689
+ )
690
+
691
+ # Melt the DataFrame to a long format
692
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
693
+
694
+ # Pivot the DataFrame to a wide format
695
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
696
+
697
+ # make sure the columns are following the order of the models
698
+ data_pivoted = data_pivoted[model_names]
699
+
700
+ columns = list(data.columns)
701
+ data_pivoted = data_pivoted.reindex(columns[1:])
702
+
703
+ # Plot the statistics
704
+ plt.figure(figsize=(10, 6))
705
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
706
+ plt.title(title)
707
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
708
+
709
+ # Set the rotation of the x-axis labels to 0 degrees
710
+ plt.xticks(rotation=0)
711
+
712
+ # Format the y-axis to display as percentage
713
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
714
+
715
+ # get the max value of the y-axis
716
+ a1 = max(best_afrp)
717
+ a2 = max(best_f1)
718
+ a3 = max(best_bleu1)
719
+ a4 = max(best_rougeL)
720
+
721
+ max_value = (
722
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
723
+ )
724
+ print("max_value:", max_value)
725
+
726
+ # Set the y-axis limit with some headroom above the tallest bar
727
+ ax.set_ylim(0, max_value)
728
+
729
+ # Add the values above each bar
730
+ for p in ax.patches:
731
+ ax.annotate(
732
+ f"{p.get_height() * 100:.1f}",
733
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
734
+ ha="center",
735
+ va="bottom",
736
+ xytext=(0, 10),
737
+ textcoords="offset points",
738
+ rotation=90,
739
+ )
740
+
741
+ plt.show()
742
+
743
+
744
+ non_rag_csv_result_files = [
745
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
746
+ # "./data/results/Tune_2024-04-12_17-14-28.csv", # Orca-2-7b
747
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
748
+ # "./data/results/Tune_2024-04-15_12-43-48.csv", # Llama-2-7b-chat-hf(cwq)
749
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
750
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
751
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
752
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
753
+ # "./data/results/Tune_2024-04-17_04-23-15.csv", # gemma-1.1-2b-it(cwq)
754
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-2b-it
755
+ # "./data/results/Tune_2024-04-18_21-56-52.csv", # gemma-1.1-7b-it
756
+ # "./data/results/Tune_2024-04-19_08-14-49.csv", # gemma-1.1-7b-it(cwq)
757
+ # "./data/results/Tune_2024-04-17_23-52-04.csv", # Orca-2-13b
758
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
759
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
760
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
761
+ # "./data/results/llama-3-70b-instruct-awq_wd_non_rag.csv", # Llama-3-70b-instruct-awq
762
+ ]
763
+
764
+ rag_csv_result_files = [
765
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
766
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
767
+ # "./data/results/Tune_2024-03-19_19-13-36.csv", # Orca-2-7b
768
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
769
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
770
+ # "./data/results/Tune_2024-04-15_14-52-31.csv", # Llama-2-7b-chat-hf(cwq)
771
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
772
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
773
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
774
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
775
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
776
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-7b-it(true)
777
+ # "./data/results/Tune_2024-04-20_13-12-43.csv", # gemma-1.1-2b-it
778
+ # "./data/results/Tune_2024-04-16_06-48-32.csv", # gemma-1.1-2b-it(cwq)
779
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
780
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
781
+ # "./data/results/Tune_2024-04-18_13-18-38.csv", # gemma-1.1-7b-it
782
+ # "./data/results/Tune_2024-04-19_04-26-33.csv", # gemma-1.1-7b-it(cwq)
783
+ # "./data/results/Orca-2-13b_wd.csv", # Orca-2-13b
784
+ # "./data/results/Tune_2024-03-22_09-28-56.csv", # Orca-2-13b
785
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
786
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
787
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
788
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
789
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
790
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
791
+ ]
792
+
793
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
794
+
795
+
796
+ def load_for_repetition_penalty_ms_macro(
797
+ csv_result_file, repetition_penalty, force_recalculate=False
798
+ ):
799
+ result_file = replace_last(
800
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
801
+ )
802
+ df = load_with_newline_and_repetition_scores(
803
+ result_file, force_recalculate=force_recalculate
804
+ )
805
+
806
+ if df["ground_truth"][0] != df_ms_macro["wellFormedAnswers"][0]:
807
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
808
+ print("ground_truth updated for:", result_file)
809
+ df.to_csv(result_file, index=False)
810
+ return df
811
+
812
+
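For reference, the per-penalty result files read by load_for_repetition_penalty_ms_macro follow the naming convention produced by replace_last above. A tiny sketch with a hypothetical base path:

base = "./data/results/some-model_mm_true.csv"  # placeholder path
print(replace_last(base, ".csv", f"_RP_{1.1:.3f}.csv"))
# -> ./data/results/some-model_mm_true_RP_1.100.csv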
813
+ # MS MARCO
814
+ def plot_performance_scores_ms_macro(
815
+ result,
816
+ models=None,
817
+ title="Performance",
818
+ ):
819
+
820
+ if models is None:
821
+ models = result.keys()
822
+ for model in models:
823
+ print(f"model: {model}")
824
+ df = result[model]["df_overall"]
825
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
826
+
827
+ # Calculate the statistics
828
+ bleu1 = list(df["bleu1"])
829
+ rougeL = list(df["rougeL"])
830
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
831
+ best_f1 = max(f1)
832
+ best_f1_index = f1.index(best_f1)
833
+
834
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
835
+ result[model], bleu1, rougeL
836
+ )
837
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
838
+
839
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
840
+ best_afrp = max(afrp)
841
+ best_afrp_index = afrp.index(best_afrp)
842
+
843
+ repetition_penalties = list(df["repetition_penalty"])
844
+
845
+ # line plot for the overall and repetition-adjusted performance scores
846
+ plt.figure(figsize=(10, 6))
847
+
848
+ plt.axvspan(
849
+ repetition_penalties[best_f1_index] - 0.01,
850
+ repetition_penalties[best_f1_index] + 0.01,
851
+ alpha=0.5,
852
+ edgecolor="none",
853
+ facecolor="blue",
854
+ )
855
+
856
+ plt.axvspan(
857
+ repetition_penalties[best_afrp_index] - 0.01,
858
+ repetition_penalties[best_afrp_index] + 0.01,
859
+ alpha=0.5,
860
+ edgecolor="none",
861
+ facecolor="orange",
862
+ )
863
+
864
+ plt.plot(
865
+ repetition_penalties,
866
+ f1,
867
+ label="Overall Perf Score",
868
+ marker="D",
869
+ color="blue",
870
+ )
871
+ plt.plot(
872
+ repetition_penalties,
873
+ afrp,
874
+ label="RF Adjusted Perf Score",
875
+ marker="o",
876
+ color="orange",
877
+ )
878
+
879
+ plt.xlabel("Repetition Penalties")
880
+ plt.ylabel("Score")
881
+ plt.xlim(0.99, 1.31)
882
+ # y in percentage
883
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
884
+ plt.title(f"{model} {title}")
885
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
886
+
887
+ plt.show()
888
+
889
+
890
+ def plot_repetition_factors(result, groups):
891
+ for group in groups:
892
+ # Plot the statistics
893
+ plt.figure(figsize=(10, 6))
894
+
895
+ max_value = 0
896
+ for model in result.keys():
897
+ if group not in model.lower():
898
+ continue
899
+ print(f"model: {model}")
900
+ df = result[model]["df_overall"]
901
+ repetition_penalties = [
902
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
903
+ ]
904
+
905
+ mean_score = [
906
+ math.log10(10 + df["total_repetitions"].mean())
907
+ for df in result[model]["df_list_repetition_penalty"]
908
+ ]
909
+
910
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
911
+
912
+ new_max = max(mean_score)
913
+ if new_max > max_value:
914
+ max_value = new_max
915
+
916
+ max_value = max_value * 1.05
917
+ if max_value < 1.5:
918
+ max_value = 1.5
919
+ # set ylimit
920
+ plt.ylim(1, max_value)
921
+
922
+ # show grid
923
+ plt.grid(True)
924
+ plt.xlabel("Repetition Penalties")
925
+ plt.ylabel("Repetition Factors")
926
+ plt.title("Repetition Factors vs Repetition Penalties")
927
+ plt.legend()
928
+
929
+ plt.show()
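The plotting helpers in this module all expect a dict keyed by model name, holding a "df_overall" summary frame plus a "df_list_repetition_penalty" list with one scored frame per repetition penalty. A minimal, hypothetical driver for the repetition-factor plot (placeholder model name and path; it assumes the base CSV carries a repetition_penalty column and that the data files this module reads at import time are present):

csv_result_file = "./data/results/some-model_wd.csv"  # placeholder path
df_overall = pd.read_csv(csv_result_file)
dfs = [
    load_for_repetition_penalty(csv_result_file, rp)
    for rp in df_overall["repetition_penalty"]
]
result = {
    "some-model (RAG)": {
        "df_overall": df_overall,
        "df_list_repetition_penalty": dfs,
        "file": csv_result_file,
    }
}
plot_repetition_factors(result, groups=["some-model"])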
eval_modules/calc_repetitions_v2.py ADDED
@@ -0,0 +1,1087 @@
+import os
+import re
+import math
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+import seaborn as sns
+import nltk
+
+# final version
+pattern_abnormal_newlines = re.compile(r"\n{5,}")
+pattern_text_repetitions = re.compile(r"(.{5,}?)(\1+)", re.M | re.DOTALL)
+exception_patterns = [re.compile(r"(\w+\.?)\1")]
+
+
+# final version for repetition detection
+def detect_repetitions(
+    text, debug=False, pattern_text_repetitions=pattern_text_repetitions
+):
+    subtotals = [0, 0]
+
+    if isinstance(text, str):
+        patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
+        for i, pattern in enumerate(patterns):
+            if debug:
+                print(
+                    f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
+                )
+            matches = pattern.finditer(text)
+            for match in matches:
+                if i > 0:
+                    ignored = False
+                    for exception_pattern in exception_patterns:
+                        if exception_pattern.match(match[0]):
+                            if debug:
+                                print("ignored: ", match[0])
+                            ignored = True
+                            break
+                    if ignored:
+                        continue
+
+                if debug:
+                    print(match)
+                    for groupNum in range(0, len(match.groups())):
+                        groupNum = groupNum + 1
+                        print(
+                            "Group {groupNum} found at {start}-{end}: `{group}`".format(
+                                groupNum=groupNum,
+                                start=match.start(groupNum),
+                                end=match.end(groupNum),
+                                group=match.group(groupNum),
+                            )
+                        )
+
+                start, end = match.span()
+                subtotals[i] += end - start
+
+            if i == 0 and subtotals[i] > 0:
+                text = pattern.sub("", text)
+                if debug:
+                    print(f"removed abnormal newlines: {subtotals[i]}")
+
+    result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
+
+    if debug:
+        print(result)
+    return result
+
+
+def detect_abnormal_newlines(text, debug=False):
+    return detect_repetitions(text, debug=debug)[0]
+
+
+def detect_text_repetitions(text, debug=False):
+    return detect_repetitions(text, debug=debug)[1]
+
+
+def detect_scores(text, debug=False):
+    newline_score, repetition_score, total_repetitions = detect_repetitions(
+        text, debug=debug
+    )
+    return pd.Series([newline_score, repetition_score, total_repetitions])
+
+
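This is the "comparison of different repetition detection methods" named in the commit message: v1 anchors the repeated unit at a word boundary (r"\b(\w.+?)\b(\1+)") and checks a single exception pattern, while v2 only requires the unit to be at least five characters (r"(.{5,}?)(\1+)"), checks a list of exception patterns for the text-repetition pass, and strips the abnormal-newline runs it has already counted before continuing. A small side-by-side sketch, assuming both modules are importable and that the ./data files they read at import time exist:

import eval_modules.calc_repetitions_v1 as v1
import eval_modules.calc_repetitions_v2 as v2

sample = "no no no no " * 3 + "\n" * 7 + "same sentence repeated. same sentence repeated."
print("v1:", v1.detect_repetitions(sample))  # (newline_score, repetition_score, total)
print("v2:", v2.detect_repetitions(sample))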
86
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
87
+ print(f"loading result file: {result_file}")
88
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
89
+
90
+ if (
91
+ force_recalculate
92
+ or "newline_score" not in df.columns
93
+ or "repetition_score" not in df.columns
94
+ or "total_repetitions" not in df.columns
95
+ ):
96
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
97
+ "answer"
98
+ ].apply(detect_scores)
99
+ df.to_csv(result_file, index=False)
100
+
101
+ return df
102
+
103
+
104
+ def replace_last(source_string, old_string, new_string):
105
+ head, _sep, tail = source_string.rpartition(old_string)
106
+ return head + new_string + tail
107
+
108
+
109
+ def load_for_repetition_penalty(
110
+ csv_result_file, repetition_penalty, force_recalculate=False
111
+ ):
112
+ result_file = replace_last(
113
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
114
+ )
115
+ return load_with_newline_and_repetition_scores(
116
+ result_file, force_recalculate=force_recalculate
117
+ )
118
+
119
+
120
+ def calc_adjusted_performance(f, r):
121
+ return f / math.log10(10 + r)
122
+
123
+
124
+ def calculate_adjusted_performance(row):
125
+ r = row["total_repetitions"]
126
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
127
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
128
+ return pd.Series([adjusted_precision, adjusted_recall])
129
+
130
+
131
+ def load_performance_df(csv_result_file, repetition_penalty):
132
+ result_file = replace_last(
133
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
134
+ )
135
+ result_file = result_file.replace("/results/", "/eval/")
136
+ print(f"loading json file: {result_file}")
137
+ df = pd.read_json(result_file)
138
+
139
+ return df
140
+
141
+
142
+ def calculate_performance_score_v1(
143
+ csv_result_file, repetition_penalty, force_recalculate=False
144
+ ):
145
+ result_file = replace_last(
146
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
147
+ )
148
+ print(f"loading result file: {result_file}")
149
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
150
+
151
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
152
+ df.drop(
153
+ columns=[
154
+ "precision",
155
+ "recall",
156
+ "f1",
157
+ "f2",
158
+ "entities_in_answer",
159
+ "entities_in_question",
160
+ ],
161
+ errors="ignore",
162
+ inplace=True,
163
+ )
164
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
165
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
166
+ perf_df = filtered_df.reset_index(drop=True)
167
+ print(f"perf_df len: {len(perf_df)}")
168
+ # print(perf_df.head())
169
+
170
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
171
+
172
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
173
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
174
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
175
+
176
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
177
+ calculate_adjusted_performance, axis=1
178
+ )
179
+
180
+ df.to_csv(result_file, index=False)
181
+ print(f"performance scores saved to result file: {result_file}")
182
+
183
+ print(f"df len: {len(df)}")
184
+
185
+ return df
186
+
187
+
188
+ ref_df = pd.read_csv(
189
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
190
+ )
191
+
192
+
193
+ def calculate_performance_score(
194
+ csv_result_file, repetition_penalty, force_recalculate=False
195
+ ):
196
+ result_file = replace_last(
197
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
198
+ )
199
+
200
+ re_creating = False
201
+ if os.path.exists(result_file):
202
+ print(f"loading result file: {result_file}")
203
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
204
+ else:
205
+ print(f"re-creating result file: {result_file}")
206
+ df = pd.DataFrame()
207
+ force_recalculate = True
208
+
209
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
210
+ df.drop(
211
+ columns=[
212
+ "precision",
213
+ "recall",
214
+ "f1",
215
+ "f2",
216
+ "entities_in_answer",
217
+ "entities_in_question",
218
+ "word_count",
219
+ ],
220
+ errors="ignore",
221
+ inplace=True,
222
+ )
223
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
224
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
225
+ perf_df = filtered_df.reset_index(drop=True)
226
+ print(f"perf_df len: {len(perf_df)}")
227
+
228
+ if len(perf_df) != len(ref_df):
229
+ print(f"error: len(perf_df) != {len(ref_df)}")
230
+ missing_ids = [
231
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
232
+ ]
233
+ print(f"missing_ids: {missing_ids}")
234
+
235
+ # print(perf_df.head())
236
+
237
+ df["id"] = perf_df["id"]
238
+ df["question"] = perf_df["question"]
239
+ df["answer"] = perf_df["pred_answer"]
240
+ df["word_count"] = df["answer"].apply(
241
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
242
+ )
243
+ df["ground_truth"] = perf_df["ground_truth"]
244
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
245
+ "answer"
246
+ ].apply(detect_scores)
247
+
248
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
249
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
250
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
251
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
252
+
253
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
254
+ calculate_adjusted_performance, axis=1
255
+ )
256
+
257
+ df.to_csv(result_file, index=False)
258
+ print(f"performance scores saved to result file: {result_file}")
259
+
260
+ print(f"df len: {len(df)}")
261
+
262
+ return df
263
+
264
+
265
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
266
+ newline_score = [
267
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
268
+ ]
269
+ print(f"newline_score: {newline_score}")
270
+
271
+ repetition_score = [
272
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
273
+ ]
274
+ print(f"repetition_score: {repetition_score}")
275
+
276
+ precision = [
277
+ f / math.log10(10 + n + r)
278
+ for f, n, r in zip(precision, newline_score, repetition_score)
279
+ ]
280
+ recall = [
281
+ f / math.log10(10 + n + r)
282
+ for f, n, r in zip(recall, newline_score, repetition_score)
283
+ ]
284
+
285
+ return precision, recall
286
+
287
+
288
+ def plot_performance_scores(
289
+ result,
290
+ models=None,
291
+ title="Performance",
292
+ ):
293
+
294
+ if models is None:
295
+ models = result.keys()
296
+ for model in models:
297
+ print(f"model: {model}")
298
+ df = result[model]["df_overall"]
299
+
300
+ # Calculate the statistics
301
+ precision = [
302
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
303
+ ]
304
+ recall = [
305
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
306
+ ]
307
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
308
+ best_f1 = max(f1)
309
+ best_f1_index = f1.index(best_f1)
310
+
311
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
312
+ result[model], precision, recall
313
+ )
314
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
315
+
316
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
317
+ best_afrp = max(afrp)
318
+ best_afrp_index = afrp.index(best_afrp)
319
+
320
+ adjusted_precision = [
321
+ df["adjusted_precision"].mean()
322
+ for df in result[model]["df_list_repetition_penalty"]
323
+ ]
324
+ adjusted_recall = [
325
+ df["adjusted_recall"].mean()
326
+ for df in result[model]["df_list_repetition_penalty"]
327
+ ]
328
+ afrp2 = [
329
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
330
+ ]
331
+ best_afrp2 = max(afrp2)
332
+ best_afrp2_index = afrp2.index(best_afrp2)
333
+
334
+ repetition_penalties = list(df["repetition_penalty"])
335
+
336
+ # line plot for F1 and the repetition-adjusted F1
337
+ plt.figure(figsize=(10, 6))
338
+
339
+ plt.axvspan(
340
+ repetition_penalties[best_f1_index] - 0.01,
341
+ repetition_penalties[best_f1_index] + 0.01,
342
+ alpha=0.5,
343
+ edgecolor="none",
344
+ facecolor="blue",
345
+ )
346
+
347
+ # plt.axvspan(
348
+ # repetition_penalties[best_afrp2_index] - 0.01,
349
+ # repetition_penalties[best_afrp2_index] + 0.01,
350
+ # alpha=0.5,
351
+ # edgecolor="none",
352
+ # facecolor="green",
353
+ # )
354
+
355
+ plt.axvspan(
356
+ repetition_penalties[best_afrp_index] - 0.01,
357
+ repetition_penalties[best_afrp_index] + 0.01,
358
+ alpha=0.5,
359
+ edgecolor="none",
360
+ facecolor="orange",
361
+ )
362
+
363
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
364
+ # plt.plot(
365
+ # repetition_penalties,
366
+ # afrp2,
367
+ # label="Per-question RF Adjusted F1",
368
+ # marker="s",
369
+ # color="green",
370
+ # )
371
+ plt.plot(
372
+ repetition_penalties,
373
+ afrp,
374
+ label="RF Adjusted F1",
375
+ marker="o",
376
+ color="orange",
377
+ )
378
+ plt.xlabel("Repetition Penalties")
379
+ plt.ylabel("Score")
380
+ plt.xlim(0.99, 1.31)
381
+ # y in percentage
382
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
383
+ plt.title(f"{model} {title}")
384
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
385
+
386
+ plt.show()
387
+
388
+
389
+ def plot_best_afrp(
390
+ result,
391
+ models=None,
392
+ title="Models with Best Repetition Factor Adjusted F1",
393
+ ref_result=None,
394
+ ):
395
+ # Initialize lists to store the statistics
396
+ model_names = []
397
+ best_f1 = []
398
+ best_afrp = []
399
+ best_repetition_penalty = []
400
+
401
+ if models is None:
402
+ models = result.keys()
403
+ for model in models:
404
+ print(f"model: {model}")
405
+ df = result[model]["df_overall"]
406
+
407
+ # Calculate the statistics
408
+ precision = [
409
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
410
+ ]
411
+ recall = [
412
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
413
+ ]
414
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
415
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
416
+
417
+ newline_score = [
418
+ df["newline_score"].mean()
419
+ for df in result[model]["df_list_repetition_penalty"]
420
+ ]
421
+ print(f"newline_score: {newline_score}")
422
+
423
+ repetition_score = [
424
+ df["repetition_score"].mean()
425
+ for df in result[model]["df_list_repetition_penalty"]
426
+ ]
427
+ print(f"repetition_score: {repetition_score}")
428
+
429
+ afrp = [
430
+ f / math.log10(10 + n + r)
431
+ for f, n, r in zip(f1, newline_score, repetition_score)
432
+ ]
433
+
434
+ best_afrp.append(max(afrp))
435
+ best_afrp_index = afrp.index(best_afrp[-1])
436
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
437
+
438
+ best_f1.append(f1[best_afrp_index])
439
+
440
+ print(
441
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
442
+ )
443
+
444
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
445
+
446
+ model_names.append(
447
+ f"{model} (RP={best_repetition_penalty[-1]})"
448
+ ) # Add the model name to the list
449
+
450
+ if ref_result is not None:
451
+ print("ref_result:", ref_result)
452
+ for model in ref_result.keys():
453
+ model_names.append(model)
454
+ df = pd.read_csv(ref_result[model])
455
+ # df = df[df["id"].isin(wikidata_df["id"])]
456
+
457
+ p = df["precision"].mean()
458
+ r = df["recall"].mean()
459
+
460
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
461
+ best_f1.append(f1)
462
+ best_afrp.append(f1)
463
+
464
+ print("model_names:", model_names)
465
+ print("best_f1:", best_f1)
466
+ print("best_afrp:", best_afrp)
467
+
468
+ # Create a DataFrame with the statistics
469
+ data = pd.DataFrame(
470
+ {
471
+ "Model": model_names,
472
+ "Repetition Factor Adjusted F1": best_afrp,
473
+ "F1": best_f1,
474
+ }
475
+ )
476
+
477
+ # Melt the DataFrame to a long format
478
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
479
+
480
+ # Pivot the DataFrame to a wide format
481
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
482
+
483
+ # make sure the columns are following the order of the models
484
+ data_pivoted = data_pivoted[model_names]
485
+
486
+ # keep the metric rows in order: adjusted F1 first, then F1
487
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
488
+
489
+ # Plot the statistics
490
+ plt.figure(figsize=(15, 6))
491
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
492
+ plt.title(title)
493
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
494
+
495
+ # Set the rotation of the x-axis labels to 0 degrees
496
+ plt.xticks(rotation=0)
497
+
498
+ # Format the y-axis to display as percentage
499
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
500
+
501
+ # get the max value of the y-axis
502
+ a1 = max(best_afrp)
503
+ a2 = max(best_f1)
504
+
505
+ max_value = max([a1, a2]) * 1.12
506
+ print("max_value:", max_value)
507
+
508
+ # Set the y-axis limit with some headroom above the tallest bar
509
+ ax.set_ylim(0, max_value)
510
+
511
+ # Add the values above each bar
512
+ for p in ax.patches:
513
+ ax.annotate(
514
+ f"{p.get_height() * 100:.1f}",
515
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
516
+ ha="center",
517
+ va="bottom",
518
+ xytext=(0, 10),
519
+ textcoords="offset points",
520
+ rotation=90,
521
+ )
522
+
523
+ plt.show()
524
+
525
+
526
+ def plot_best_performance(
527
+ result,
528
+ models=None,
529
+ title="Models with Best F1 Score",
530
+ adjusted_f1=False,
531
+ ref_result=None,
532
+ ):
533
+ # Initialize lists to store the statistics
534
+ model_names = []
535
+ best_precision = []
536
+ best_recall = []
537
+ best_f1 = []
538
+ best_repetition_penalty = []
539
+
540
+ if models is None:
541
+ models = result.keys()
542
+ for model in models:
543
+ print(f"model: {model}")
544
+ df = result[model]["df_overall"]
545
+
546
+ # Calculate the statistics
547
+ precision = [
548
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+ recall = [
551
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
552
+ ]
553
+
554
+ if adjusted_f1:
555
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
556
+ result[model], precision, recall
557
+ )
558
+
559
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
560
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
561
+
562
+ best_f1.append(max(f1))
563
+ best_f1_index = f1.index(best_f1[-1])
564
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
565
+
566
+ best_precision.append(precision[best_f1_index])
567
+ best_recall.append(recall[best_f1_index])
568
+
569
+ print(
570
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
571
+ )
572
+
573
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
574
+
575
+ model_names.append(
576
+ f"{model} (RP={best_repetition_penalty[-1]})"
577
+ ) # Add the model name to the list
578
+
579
+ # print sum for columns: newline_score, repetition_score
580
+ print(
581
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
582
+ )
583
+
584
+ if ref_result is not None:
585
+ print("ref_result:", ref_result)
586
+ for model in ref_result.keys():
587
+ model_names.append(model)
588
+ df = pd.read_csv(ref_result[model])
589
+ # df = df[df["id"].isin(wikidata_df["id"])]
590
+
591
+ best_precision.append(df["precision"].mean())
592
+ best_recall.append(df["recall"].mean())
593
+ f1 = (
594
+ 2
595
+ * (best_precision[-1] * best_recall[-1])
596
+ / (best_precision[-1] + best_recall[-1])
597
+ )
598
+ # best_f1.append(df["f1"].mean())
599
+ best_f1.append(f1)
600
+
601
+ # Create a DataFrame with the statistics
602
+ data = (
603
+ pd.DataFrame(
604
+ {
605
+ "Model": model_names,
606
+ "Adjusted Precision with RP": best_precision,
607
+ "Adjusted Recall with RP": best_recall,
608
+ "Adjusted F1 with RP": best_f1,
609
+ }
610
+ )
611
+ if adjusted_f1
612
+ else pd.DataFrame(
613
+ {
614
+ "Model": model_names,
615
+ "Precision": best_precision,
616
+ "Recall": best_recall,
617
+ "F1": best_f1,
618
+ }
619
+ )
620
+ )
621
+ columns = list(data.columns)
622
+
623
+ # Melt the DataFrame to a long format
624
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
625
+
626
+ # Pivot the DataFrame to a wide format
627
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
628
+
629
+ # make sure the columns are following the order of the models
630
+ data_pivoted = data_pivoted[model_names]
631
+
632
+ # make sure three groups in the order of precision, recall, f1
633
+ data_pivoted = data_pivoted.reindex(columns[1:])
634
+
635
+ # Plot the statistics
636
+ plt.figure(figsize=(10, 6))
637
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
638
+ plt.title(title)
639
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
640
+
641
+ # Set the rotation of the x-axis labels to 0 degrees
642
+ plt.xticks(rotation=0)
643
+
644
+ # Format the y-axis to display as percentage
645
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
646
+
647
+ # get the max value of the y-axis
648
+ a1 = max(best_precision)
649
+ a2 = max(best_recall)
650
+ a3 = max(best_f1)
651
+
652
+ max_value = max([a1, a2, a3]) * 1.12
653
+ print("max_value:", max_value)
654
+
655
+ # Set the y-axis upper limit slightly above the tallest bar
656
+ ax.set_ylim(0, max_value)
657
+
658
+ # Add the values above each bar
659
+ for p in ax.patches:
660
+ ax.annotate(
661
+ f"{p.get_height() * 100:.1f}",
662
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
663
+ ha="center",
664
+ va="bottom",
665
+ xytext=(0, 10),
666
+ textcoords="offset points",
667
+ rotation=90,
668
+ )
669
+
670
+ plt.show()
671
+
672
+
673
+ def plot_best_performance_ms_macro(
674
+ result,
675
+ models=None,
676
+ title="Models with Best Repetition Factor Adjusted Performance",
677
+ ref_result=None,
678
+ skip_generic_prompt=False,
679
+ include_adjusted_performance=True,
680
+ ):
681
+ # Initialize lists to store the statistics
682
+ model_names = []
683
+ best_f1 = []
684
+ best_afrp = []
685
+ best_repetition_penalty = []
686
+ best_bleu1 = []
687
+ best_rougeL = []
688
+
689
+ if models is None:
690
+ models = result.keys()
691
+ for model in models:
692
+ if skip_generic_prompt and "generic prompt" in model:
693
+ continue
694
+ print(f"model: {model}")
695
+ df = result[model]["df_overall"]
696
+
697
+ # Calculate the statistics
698
+ bleu1 = [x for x in df["bleu1"]]
699
+ rougeL = [x for x in df["rougeL"]]
700
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
701
+
702
+ newline_score = [
703
+ df["newline_score"].mean()
704
+ for df in result[model]["df_list_repetition_penalty"]
705
+ ]
706
+ print(f"newline_score: {newline_score}")
707
+
708
+ repetition_score = [
709
+ df["repetition_score"].mean()
710
+ for df in result[model]["df_list_repetition_penalty"]
711
+ ]
712
+ print(f"repetition_score: {repetition_score}")
713
+
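+ # Repetition-factor adjustment: divide the combined score by log10(10 + mean
+ # newline score + mean repetition score), so answers with no detected
+ # repetition keep their full score and repetitive output is damped.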
714
+ afrp = [
715
+ f / math.log10(10 + n + r)
716
+ for f, n, r in zip(f1, newline_score, repetition_score)
717
+ ]
718
+
719
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
720
+ best_afrp_index = (
721
+ afrp.index(best_afrp[-1])
722
+ if include_adjusted_performance
723
+ else f1.index(best_afrp[-1])
724
+ )
725
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
726
+
727
+ best_f1.append(f1[best_afrp_index])
728
+ best_bleu1.append(bleu1[best_afrp_index])
729
+ best_rougeL.append(rougeL[best_afrp_index])
730
+
731
+ print(
732
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
733
+ )
734
+
735
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
736
+
737
+ model_names.append(
738
+ f"{model} (RP={best_repetition_penalty[-1]})"
739
+ ) # Add the model name to the list
740
+
741
+ if ref_result is not None:
742
+ print("ref_result:", ref_result)
743
+ for model in ref_result.keys():
744
+ model_names.append(model)
745
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
746
+ # df = df[df["id"].isin(wikidata_df["id"])]
747
+
748
+ p = df["bleu1"][0]
749
+ best_bleu1.append(p)
750
+
751
+ r = df["rougeL"][0]
752
+ best_rougeL.append(r)
753
+
754
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
755
+ best_f1.append(f1)
756
+ best_afrp.append(f1)
757
+
758
+ print("model_names:", model_names)
759
+ print("best_f1:", best_f1)
760
+ print("best_afrp:", best_afrp)
761
+
762
+ # Create a DataFrame with the statistics
763
+ data = (
764
+ pd.DataFrame(
765
+ {
766
+ "Model": model_names,
767
+ "Repetition Factor Adjusted Perf Score": best_afrp,
768
+ "Overall Perf Score": best_f1,
769
+ }
770
+ )
771
+ if include_adjusted_performance
772
+ else pd.DataFrame(
773
+ {
774
+ "Model": model_names,
775
+ "Bleu-1": best_bleu1,
776
+ "Rouge-L": best_rougeL,
777
+ "Overall Perf Score": best_f1,
778
+ }
779
+ )
780
+ )
781
+
782
+ # Melt the DataFrame to a long format
783
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
784
+
785
+ # Pivot the DataFrame to a wide format
786
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
787
+
788
+ # make sure the columns are following the order of the models
789
+ data_pivoted = data_pivoted[model_names]
790
+
791
+ columns = list(data.columns)
792
+ data_pivoted = data_pivoted.reindex(columns[1:])
793
+
794
+ # Plot the statistics
795
+ plt.figure(figsize=(10, 6))
796
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
797
+ plt.title(title)
798
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
799
+
800
+ # Set the rotation of the x-axis labels to 0 degrees
801
+ plt.xticks(rotation=0)
802
+
803
+ # Format the y-axis to display as percentage
804
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
805
+
806
+ # get the max value of the y-axis
807
+ a1 = max(best_afrp)
808
+ a2 = max(best_f1)
809
+ a3 = max(best_bleu1)
810
+ a4 = max(best_rougeL)
811
+
812
+ max_value = (
813
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
814
+ )
815
+ print("max_value:", max_value)
816
+
817
+ # Set the y-axis upper limit slightly above the tallest bar
818
+ ax.set_ylim(0, max_value)
819
+
820
+ # Add the values above each bar
821
+ for p in ax.patches:
822
+ ax.annotate(
823
+ f"{p.get_height() * 100:.1f}",
824
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
825
+ ha="center",
826
+ va="bottom",
827
+ xytext=(0, 10),
828
+ textcoords="offset points",
829
+ rotation=90,
830
+ )
831
+
832
+ plt.show()
833
+
834
+
835
+ all_open_source_models = [
836
+ "gemma-1.1-2b-it",
837
+ "Phi-3-mini-128k-instruct",
838
+ "gemma-1.1-7b-it",
839
+ "Llama-2-7b-chat-hf",
840
+ "Mistral-7B-Instruct-v0.2",
841
+ "Meta-Llama-3-8B-Instruct",
842
+ "Llama-2-13b-chat-hf",
843
+ "Llama-2-70b-chat-hf",
844
+ "Meta-Llama-3-70B-Instruct",
845
+ ]
846
+
847
+
848
+ non_rag_csv_result_files = [
849
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
850
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
851
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
852
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
853
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
854
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
855
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
856
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
857
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
858
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
859
+ ]
860
+
861
+ rag_csv_result_files = [
862
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
863
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
864
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
865
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
866
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
867
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
868
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
869
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
870
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
871
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
872
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
873
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
874
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
875
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
876
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
877
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
878
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
879
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
880
+ ]
881
+
882
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
883
+
884
+
885
+ def load_for_repetition_penalty_ms_macro(
886
+ csv_result_file, repetition_penalty, force_recalculate=False
887
+ ):
888
+ result_file = replace_last(
889
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
890
+ )
891
+ df = load_with_newline_and_repetition_scores(
892
+ result_file, force_recalculate=force_recalculate
893
+ )
894
+
895
+ if len(df) != len(df_ms_macro):
896
+ print(f"error: len(df) != {len(df_ms_macro)}")
897
+ missing_ids = [
898
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
899
+ ]
900
+ print(f"missing_ids: {missing_ids}")
901
+
902
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
903
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
904
+ print("ground_truth updated for:", result_file)
905
+ df.to_csv(result_file, index=False)
906
+ return df
907
+
908
+
909
+ # MS MARCO
910
+ def plot_performance_scores_ms_macro(
911
+ result,
912
+ models=None,
913
+ title="Performance",
914
+ ):
915
+
916
+ if models is None:
917
+ models = result.keys()
918
+ for model in models:
919
+ print(f"model: {model}")
920
+ df = result[model]["df_overall"]
921
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
922
+
923
+ # Calculate the statistics
924
+ bleu1 = list(df["bleu1"])
925
+ rougeL = list(df["rougeL"])
926
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
927
+ best_f1 = max(f1)
928
+ best_f1_index = f1.index(best_f1)
929
+
930
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
931
+ result[model], bleu1, rougeL
932
+ )
933
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
934
+
935
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
936
+ best_afrp = max(afrp)
937
+ best_afrp_index = afrp.index(best_afrp)
938
+
939
+ repetition_penalties = list(df["repetition_penalty"])
940
+
941
+ # line plot for precision, recall, f1
942
+ plt.figure(figsize=(10, 6))
943
+
944
+ plt.axvspan(
945
+ repetition_penalties[best_f1_index] - 0.01,
946
+ repetition_penalties[best_f1_index] + 0.01,
947
+ alpha=0.5,
948
+ edgecolor="none",
949
+ facecolor="blue",
950
+ )
951
+
952
+ plt.axvspan(
953
+ repetition_penalties[best_afrp_index] - 0.01,
954
+ repetition_penalties[best_afrp_index] + 0.01,
955
+ alpha=0.5,
956
+ edgecolor="none",
957
+ facecolor="orange",
958
+ )
959
+
960
+ plt.plot(
961
+ repetition_penalties,
962
+ f1,
963
+ label="Overall Perf Score",
964
+ marker="D",
965
+ color="blue",
966
+ )
967
+ plt.plot(
968
+ repetition_penalties,
969
+ afrp,
970
+ label="RF Adjusted Perf Score",
971
+ marker="o",
972
+ color="orange",
973
+ )
974
+
975
+ plt.xlabel("Repetition Penalties")
976
+ plt.ylabel("Score")
977
+ plt.xlim(0.99, 1.31)
978
+ # y in percentage
979
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
980
+ plt.title(f"{model} {title}")
981
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
982
+
983
+ plt.show()
984
+
985
+
986
+ def plot_repetition_factors(result, groups):
987
+ for group in groups:
988
+ # Plot the statistics
989
+ plt.figure(figsize=(10, 6))
990
+
991
+ max_value = 0
992
+ for model in result.keys():
993
+ if not group in model.lower():
994
+ continue
995
+ print(f"model: {model}")
996
+ df = result[model]["df_overall"]
997
+ repetition_penalties = [
998
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
999
+ ]
1000
+
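+ # The y-axis "repetition factor" is log10(10 + mean total_repetitions), which
+ # equals 1.0 when a model produces no repetitions and grows slowly as they increase.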
1001
+ mean_score = [
1002
+ math.log10(10 + df["total_repetitions"].mean())
1003
+ for df in result[model]["df_list_repetition_penalty"]
1004
+ ]
1005
+
1006
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1007
+
1008
+ new_max = max(mean_score)
1009
+ if new_max > max_value:
1010
+ max_value = new_max
1011
+
1012
+ max_value = max_value * 1.05
1013
+ if max_value < 1.5:
1014
+ max_value = 1.5
1015
+ # set ylimit
1016
+ plt.ylim(1, max_value)
1017
+
1018
+ # show grid
1019
+ plt.grid(True)
1020
+ plt.xlabel("Repetition Penalties")
1021
+ plt.ylabel("Repetition Factors")
1022
+ plt.title("Repetition Factors vs Repetition Penalties")
1023
+ plt.legend()
1024
+
1025
+ plt.show()
1026
+
1027
+
1028
+ def plot_repetition_factors_by_group(result, group_filter=None):
1029
+ markers = ["D", "o", "s", "x"]
1030
+ colors = ["blue", "orange", "green", "red"]
1031
+
1032
+ # Plot the statistics
1033
+ plt.figure(figsize=(10, 6))
1034
+ index = 0
1035
+ max_value = 0
1036
+
1037
+ for model in result.keys():
1038
+ if group_filter is not None and group_filter not in model:
1039
+ continue
1040
+
1041
+ print(f"model: {model}")
1042
+
1043
+ df = result[model]["df_overall"]
1044
+ repetition_penalties = [
1045
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1046
+ ]
1047
+
1048
+ # Calculate the statistics
1049
+ mean_score = [
1050
+ math.log10(10 + df["total_repetitions"].mean())
1051
+ for df in result[model]["df_list_repetition_penalty"]
1052
+ ]
1053
+ if len(mean_score) != len(repetition_penalties):
1054
+ print(
1055
+ f"model: {model} has different length of repetition penalties and mean score"
1056
+ )
1057
+ print("repetition_panelties:", len(repetition_panelties))
1058
+ print("mean_score:", len(mean_score))
1059
+ continue
1060
+
1061
+ new_max = max(mean_score)
1062
+ if new_max > max_value:
1063
+ max_value = new_max
1064
+
1065
+ sns.lineplot(
1066
+ x=repetition_penalties,
1067
+ y=mean_score,
1068
+ label=model,
1069
+ marker=markers[index],
1070
+ color=colors[index],
1071
+ )
1072
+
1073
+ index += 1
1074
+
1075
+ max_value = max_value * 1.05
1076
+ if max_value < 1.5:
1077
+ max_value = 1.5
1078
+ # set ylimit
1079
+ plt.ylim(1, max_value)
1080
+ max_value = 0
1081
+
1082
+ plt.xlabel("Repetition Penalties")
1083
+ plt.ylabel("Repetition Factors")
1084
+ plt.title("Repetition Factors vs Repetition Penalties")
1085
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1086
+
1087
+ plt.show()
eval_modules/calc_repetitions_v3.py ADDED
@@ -0,0 +1,1095 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+
11
+ # final version
12
+ pattern_abnormal_newlines = re.compile(r"\n{5,}")
13
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
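+ # pattern_abnormal_newlines flags runs of five or more consecutive newlines;
+ # pattern_text_repetitions flags any chunk of at least five characters repeated
+ # back-to-back, optionally separated by whitespace. The exception patterns below
+ # whitelist benign repeats: a single word echoed at the end of the text,
+ # "wink wink / nudge nudge"-style asides, and trailing whitespace.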
14
+ exception_patterns = [
15
+ re.compile(r"(\w+\.?)(\1)+$"),
16
+ re.compile(r"\W*(wink|nudge|Virginia)\W*((\1)\W*)+$"),
17
+ re.compile(r"\s+$"),
18
+ ]
19
+
20
+
21
+ # final version for repetition detection
22
+ def detect_repetitions(
23
+ text, debug=False, pattern_text_repetitions=pattern_text_repetitions
24
+ ):
25
+ subtotals = [0, 0]
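+ # subtotals[0] accumulates the characters matched as abnormal newlines,
+ # subtotals[1] the characters matched as text repetitions; every match adds the
+ # full length of its span.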
26
+
27
+ if isinstance(text, str):
28
+ patterns = [pattern_abnormal_newlines, pattern_text_repetitions]
29
+ for i, pattern in enumerate(patterns):
30
+ if debug:
31
+ print(
32
+ f"----detect {'abnormal newlines' if i == 0 else 'text repetitions'}----"
33
+ )
34
+ matches = pattern.finditer(text)
35
+ for match in matches:
36
+ if i > 0:
37
+ ignored = False
38
+ for exception_pattern in exception_patterns:
39
+ match_ex = exception_pattern.match(match[0])
40
+ if match_ex:
41
+ if debug:
42
+ print("ignored: ", match[0])
43
+ print("exception: ", match_ex)
44
+ ignored = True
45
+ break
46
+ if ignored:
47
+ continue
48
+
49
+ if debug:
50
+ print(match)
51
+ for groupNum in range(0, len(match.groups())):
52
+ groupNum = groupNum + 1
53
+ print(
54
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
55
+ groupNum=groupNum,
56
+ start=match.start(groupNum),
57
+ end=match.end(groupNum),
58
+ group=match.group(groupNum),
59
+ )
60
+ )
61
+
62
+ start, end = match.span()
63
+ subtotals[i] += end - start
64
+
65
+ if i == 0:
66
+ text = text.strip()
67
+ if subtotals[i] > 0:
68
+ text = pattern.sub("", text)
69
+ if debug:
70
+ print(f"removed abnormal newlines: {subtotals[i]}")
71
+
72
+ result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
73
+
74
+ if debug:
75
+ print(result)
76
+ return result
77
+
78
+
79
+ def detect_abnormal_newlines(text, debug=False):
80
+ return detect_repetitions(text, debug=debug)[0]
81
+
82
+
83
+ def detect_text_repetitions(text, debug=False):
84
+ return detect_repetitions(text, debug=debug)[1]
85
+
86
+
87
+ def detect_scores(text, debug=False):
88
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
89
+ text, debug=debug
90
+ )
91
+ return pd.Series([newline_score, repetition_score, total_repetitions])
92
+
93
+
94
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
95
+ print(f"loading result file: {result_file}")
96
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
97
+
98
+ if (
99
+ force_recalculate
100
+ or "newline_score" not in df.columns
101
+ or "repetition_score" not in df.columns
102
+ or "total_repetitions" not in df.columns
103
+ ):
104
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
105
+ "answer"
106
+ ].apply(detect_scores)
107
+ df.to_csv(result_file, index=False)
108
+
109
+ return df
110
+
111
+
112
+ def replace_last(source_string, old_string, new_string):
113
+ head, _sep, tail = source_string.rpartition(old_string)
114
+ return head + new_string + tail
115
+
116
+
117
+ def load_for_repetition_penalty(
118
+ csv_result_file, repetition_penalty, force_recalculate=False
119
+ ):
120
+ result_file = replace_last(
121
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
122
+ )
123
+ return load_with_newline_and_repetition_scores(
124
+ result_file, force_recalculate=force_recalculate
125
+ )
126
+
127
+
128
+ def calc_adjusted_performance(f, r):
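+ # Adjusted performance divides the raw score f by log10(10 + r), where r is the
+ # total repetition length; with r == 0 the divisor is 1 and the score is unchanged.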
129
+ return f / math.log10(10 + r)
130
+
131
+
132
+ def calculate_adjusted_performance(row):
133
+ r = row["total_repetitions"]
134
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
135
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
136
+ return pd.Series([adjusted_precision, adjusted_recall])
137
+
138
+
139
+ def load_performance_df(csv_result_file, repetition_penalty):
140
+ result_file = replace_last(
141
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
142
+ )
143
+ result_file = result_file.replace("/results/", "/eval/")
144
+ print(f"loading json file: {result_file}")
145
+ df = pd.read_json(result_file)
146
+
147
+ return df
148
+
149
+
150
+ def calculate_performance_score_v1(
151
+ csv_result_file, repetition_penalty, force_recalculate=False
152
+ ):
153
+ result_file = replace_last(
154
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
155
+ )
156
+ print(f"loading result file: {result_file}")
157
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
158
+
159
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
160
+ df.drop(
161
+ columns=[
162
+ "precision",
163
+ "recall",
164
+ "f1",
165
+ "f2",
166
+ "entities_in_answer",
167
+ "entities_in_question",
168
+ ],
169
+ errors="ignore",
170
+ inplace=True,
171
+ )
172
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
173
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
174
+ perf_df = filtered_df.reset_index(drop=True)
175
+ print(f"perf_df len: {len(perf_df)}")
176
+ # print(perf_df.head())
177
+
178
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
179
+
180
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
181
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
182
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
183
+
184
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
185
+ calculate_adjusted_performance, axis=1
186
+ )
187
+
188
+ df.to_csv(result_file, index=False)
189
+ print(f"performance scores saved to result file: {result_file}")
190
+
191
+ print(f"df len: {len(df)}")
192
+
193
+ return df
194
+
195
+
196
+ ref_df = pd.read_csv(
197
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
198
+ )
199
+
200
+
201
+ def calculate_performance_score(
202
+ csv_result_file, repetition_penalty, force_recalculate=False
203
+ ):
204
+ result_file = replace_last(
205
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
206
+ )
207
+
208
+ re_creating = False
209
+ if os.path.exists(result_file):
210
+ print(f"loading result file: {result_file}")
211
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
212
+ else:
213
+ print(f"re-creating result file: {result_file}")
214
+ df = pd.DataFrame()
215
+ force_recalculate = True
216
+
217
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
218
+ df.drop(
219
+ columns=[
220
+ "precision",
221
+ "recall",
222
+ "f1",
223
+ "f2",
224
+ "entities_in_answer",
225
+ "entities_in_question",
226
+ "word_count",
227
+ ],
228
+ errors="ignore",
229
+ inplace=True,
230
+ )
231
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
232
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
233
+ perf_df = filtered_df.reset_index(drop=True)
234
+ print(f"perf_df len: {len(perf_df)}")
235
+
236
+ if len(perf_df) != len(ref_df):
237
+ print(f"error: len(perf_df) != {len(ref_df)}")
238
+ missing_ids = [
239
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
240
+ ]
241
+ print(f"missing_ids: {missing_ids}")
242
+
243
+ # print(perf_df.head())
244
+
245
+ df["id"] = perf_df["id"]
246
+ df["question"] = perf_df["question"]
247
+ df["answer"] = perf_df["pred_answer"]
248
+ df["word_count"] = df["answer"].apply(
249
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
250
+ )
251
+ df["ground_truth"] = perf_df["ground_truth"]
252
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
253
+ "answer"
254
+ ].apply(detect_scores)
255
+
256
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
257
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
258
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
259
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
260
+
261
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
262
+ calculate_adjusted_performance, axis=1
263
+ )
264
+
265
+ df.to_csv(result_file, index=False)
266
+ print(f"performance scores saved to result file: {result_file}")
267
+
268
+ print(f"df len: {len(df)}")
269
+
270
+ return df
271
+
272
+
273
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
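+ # Scale mean precision and recall at each repetition-penalty setting by
+ # 1 / log10(10 + mean newline score + mean repetition score).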
274
+ newline_score = [
275
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
276
+ ]
277
+ print(f"newline_score: {newline_score}")
278
+
279
+ repetition_score = [
280
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
281
+ ]
282
+ print(f"repetition_score: {repetition_score}")
283
+
284
+ precision = [
285
+ f / math.log10(10 + n + r)
286
+ for f, n, r in zip(precision, newline_score, repetition_score)
287
+ ]
288
+ recall = [
289
+ f / math.log10(10 + n + r)
290
+ for f, n, r in zip(recall, newline_score, repetition_score)
291
+ ]
292
+
293
+ return precision, recall
294
+
295
+
296
+ def plot_performance_scores(
297
+ result,
298
+ models=None,
299
+ title="Performance",
300
+ ):
301
+
302
+ if models is None:
303
+ models = result.keys()
304
+ for model in models:
305
+ print(f"model: {model}")
306
+ df = result[model]["df_overall"]
307
+
308
+ # Calculate the statistics
309
+ precision = [
310
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
311
+ ]
312
+ recall = [
313
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
314
+ ]
315
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
316
+ best_f1 = max(f1)
317
+ best_f1_index = f1.index(best_f1)
318
+
319
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
320
+ result[model], precision, recall
321
+ )
322
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
323
+
324
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
325
+ best_afrp = max(afrp)
326
+ best_afrp_index = afrp.index(best_afrp)
327
+
328
+ adjusted_precision = [
329
+ df["adjusted_precision"].mean()
330
+ for df in result[model]["df_list_repetition_penalty"]
331
+ ]
332
+ adjusted_recall = [
333
+ df["adjusted_recall"].mean()
334
+ for df in result[model]["df_list_repetition_penalty"]
335
+ ]
336
+ afrp2 = [
337
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
338
+ ]
339
+ best_afrp2 = max(afrp2)
340
+ best_afrp2_index = afrp2.index(best_afrp2)
341
+
342
+ repetition_penalties = list(df["repetition_penalty"])
343
+
344
+ # line plot for precision, recall, f1
345
+ plt.figure(figsize=(10, 6))
346
+
347
+ plt.axvspan(
348
+ repetition_penalties[best_f1_index] - 0.01,
349
+ repetition_penalties[best_f1_index] + 0.01,
350
+ alpha=0.5,
351
+ edgecolor="none",
352
+ facecolor="blue",
353
+ )
354
+
355
+ # plt.axvspan(
356
+ # repetition_penalties[best_afrp2_index] - 0.01,
357
+ # repetition_penalties[best_afrp2_index] + 0.01,
358
+ # alpha=0.5,
359
+ # edgecolor="none",
360
+ # facecolor="green",
361
+ # )
362
+
363
+ plt.axvspan(
364
+ repetition_penalties[best_afrp_index] - 0.01,
365
+ repetition_penalties[best_afrp_index] + 0.01,
366
+ alpha=0.5,
367
+ edgecolor="none",
368
+ facecolor="orange",
369
+ )
370
+
371
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
372
+ # plt.plot(
373
+ # repetition_penalties,
374
+ # afrp2,
375
+ # label="Per-question RF Adjusted F1",
376
+ # marker="s",
377
+ # color="green",
378
+ # )
379
+ plt.plot(
380
+ repetition_penalties,
381
+ afrp,
382
+ label="RF Adjusted F1",
383
+ marker="o",
384
+ color="orange",
385
+ )
386
+ plt.xlabel("Repetition Penalties")
387
+ plt.ylabel("Score")
388
+ plt.xlim(0.99, 1.31)
389
+ # y in percentage
390
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
391
+ plt.title(f"{model} {title}")
392
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
393
+
394
+ plt.show()
395
+
396
+
397
+ def plot_best_afrp(
398
+ result,
399
+ models=None,
400
+ title="Models with Best Repetition Factor Adjusted F1",
401
+ ref_result=None,
402
+ ):
403
+ # Initialize lists to store the statistics
404
+ model_names = []
405
+ best_f1 = []
406
+ best_afrp = []
407
+ best_repetition_penalty = []
408
+
409
+ if models is None:
410
+ models = result.keys()
411
+ for model in models:
412
+ print(f"model: {model}")
413
+ df = result[model]["df_overall"]
414
+
415
+ # Calculate the statistics
416
+ precision = [
417
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
418
+ ]
419
+ recall = [
420
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
421
+ ]
422
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
423
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
424
+
425
+ newline_score = [
426
+ df["newline_score"].mean()
427
+ for df in result[model]["df_list_repetition_penalty"]
428
+ ]
429
+ print(f"newline_score: {newline_score}")
430
+
431
+ repetition_score = [
432
+ df["repetition_score"].mean()
433
+ for df in result[model]["df_list_repetition_penalty"]
434
+ ]
435
+ print(f"repetition_score: {repetition_score}")
436
+
437
+ afrp = [
438
+ f / math.log10(10 + n + r)
439
+ for f, n, r in zip(f1, newline_score, repetition_score)
440
+ ]
441
+
442
+ best_afrp.append(max(afrp))
443
+ best_afrp_index = afrp.index(best_afrp[-1])
444
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
445
+
446
+ best_f1.append(f1[best_afrp_index])
447
+
448
+ print(
449
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
450
+ )
451
+
452
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
453
+
454
+ model_names.append(
455
+ f"{model} (RP={best_repetition_penalty[-1]})"
456
+ ) # Add the model name to the list
457
+
458
+ if ref_result is not None:
459
+ print("ref_result:", ref_result)
460
+ for model in ref_result.keys():
461
+ model_names.append(model)
462
+ df = pd.read_csv(ref_result[model])
463
+ # df = df[df["id"].isin(wikidata_df["id"])]
464
+
465
+ p = df["precision"].mean()
466
+ r = df["recall"].mean()
467
+
468
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
469
+ best_f1.append(f1)
470
+ best_afrp.append(f1)
471
+
472
+ print("model_names:", model_names)
473
+ print("best_f1:", best_f1)
474
+ print("best_afrp:", best_afrp)
475
+
476
+ # Create a DataFrame with the statistics
477
+ data = pd.DataFrame(
478
+ {
479
+ "Model": model_names,
480
+ "Repetition Factor Adjusted F1": best_afrp,
481
+ "F1": best_f1,
482
+ }
483
+ )
484
+
485
+ # Melt the DataFrame to a long format
486
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
487
+
488
+ # Pivot the DataFrame to a wide format
489
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
490
+
491
+ # make sure the columns are following the order of the models
492
+ data_pivoted = data_pivoted[model_names]
493
+
494
+ # keep the two metrics in a fixed order: adjusted F1 first, then F1
495
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
496
+
497
+ # Plot the statistics
498
+ plt.figure(figsize=(15, 6))
499
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
500
+ plt.title(title)
501
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
502
+
503
+ # Set the rotation of the x-axis labels to 0 degrees
504
+ plt.xticks(rotation=0)
505
+
506
+ # Format the y-axis to display as percentage
507
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
508
+
509
+ # get the max value of the y-axis
510
+ a1 = max(best_afrp)
511
+ a2 = max(best_f1)
512
+
513
+ max_value = max([a1, a2]) * 1.12
514
+ print("max_value:", max_value)
515
+
516
+ # Set the y-axis upper limit slightly above the tallest bar
517
+ ax.set_ylim(0, max_value)
518
+
519
+ # Add the values above each bar
520
+ for p in ax.patches:
521
+ ax.annotate(
522
+ f"{p.get_height() * 100:.1f}",
523
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
524
+ ha="center",
525
+ va="bottom",
526
+ xytext=(0, 10),
527
+ textcoords="offset points",
528
+ rotation=90,
529
+ )
530
+
531
+ plt.show()
532
+
533
+
534
+ def plot_best_performance(
535
+ result,
536
+ models=None,
537
+ title="Models with Best F1 Score",
538
+ adjusted_f1=False,
539
+ ref_result=None,
540
+ ):
541
+ # Initialize lists to store the statistics
542
+ model_names = []
543
+ best_precision = []
544
+ best_recall = []
545
+ best_f1 = []
546
+ best_repetition_penalty = []
547
+
548
+ if models is None:
549
+ models = result.keys()
550
+ for model in models:
551
+ print(f"model: {model}")
552
+ df = result[model]["df_overall"]
553
+
554
+ # Calculate the statistics
555
+ precision = [
556
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
557
+ ]
558
+ recall = [
559
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
560
+ ]
561
+
562
+ if adjusted_f1:
563
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
564
+ result[model], precision, recall
565
+ )
566
+
567
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
568
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
569
+
570
+ best_f1.append(max(f1))
571
+ best_f1_index = f1.index(best_f1[-1])
572
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
573
+
574
+ best_precision.append(precision[best_f1_index])
575
+ best_recall.append(recall[best_f1_index])
576
+
577
+ print(
578
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
579
+ )
580
+
581
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
582
+
583
+ model_names.append(
584
+ f"{model} (RP={best_repetition_penalty[-1]})"
585
+ ) # Add the model name to the list
586
+
587
+ # print sum for columns: newline_score, repetition_score
588
+ print(
589
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
590
+ )
591
+
592
+ if ref_result is not None:
593
+ print("ref_result:", ref_result)
594
+ for model in ref_result.keys():
595
+ model_names.append(model)
596
+ df = pd.read_csv(ref_result[model])
597
+ # df = df[df["id"].isin(wikidata_df["id"])]
598
+
599
+ best_precision.append(df["precision"].mean())
600
+ best_recall.append(df["recall"].mean())
601
+ f1 = (
602
+ 2
603
+ * (best_precision[-1] * best_recall[-1])
604
+ / (best_precision[-1] + best_recall[-1])
605
+ )
606
+ # best_f1.append(df["f1"].mean())
607
+ best_f1.append(f1)
608
+
609
+ # Create a DataFrame with the statistics
610
+ data = (
611
+ pd.DataFrame(
612
+ {
613
+ "Model": model_names,
614
+ "Adjusted Precision with RP": best_precision,
615
+ "Adjusted Recall with RP": best_recall,
616
+ "Adjusted F1 with RP": best_f1,
617
+ }
618
+ )
619
+ if adjusted_f1
620
+ else pd.DataFrame(
621
+ {
622
+ "Model": model_names,
623
+ "Precision": best_precision,
624
+ "Recall": best_recall,
625
+ "F1": best_f1,
626
+ }
627
+ )
628
+ )
629
+ columns = list(data.columns)
630
+
631
+ # Melt the DataFrame to a long format
632
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
633
+
634
+ # Pivot the DataFrame to a wide format
635
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
636
+
637
+ # make sure the columns are following the order of the models
638
+ data_pivoted = data_pivoted[model_names]
639
+
640
+ # make sure three groups in the order of precision, recall, f1
641
+ data_pivoted = data_pivoted.reindex(columns[1:])
642
+
643
+ # Plot the statistics
644
+ plt.figure(figsize=(10, 6))
645
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
646
+ plt.title(title)
647
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
648
+
649
+ # Set the rotation of the x-axis labels to 0 degrees
650
+ plt.xticks(rotation=0)
651
+
652
+ # Format the y-axis to display as percentage
653
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
654
+
655
+ # get the max value of the y-axis
656
+ a1 = max(best_precision)
657
+ a2 = max(best_recall)
658
+ a3 = max(best_f1)
659
+
660
+ max_value = max([a1, a2, a3]) * 1.12
661
+ print("max_value:", max_value)
662
+
663
+ # Set the y-axis upper limit slightly above the tallest bar
664
+ ax.set_ylim(0, max_value)
665
+
666
+ # Add the values above each bar
667
+ for p in ax.patches:
668
+ ax.annotate(
669
+ f"{p.get_height() * 100:.1f}",
670
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
671
+ ha="center",
672
+ va="bottom",
673
+ xytext=(0, 10),
674
+ textcoords="offset points",
675
+ rotation=90,
676
+ )
677
+
678
+ plt.show()
679
+
680
+
681
+ def plot_best_performance_ms_macro(
682
+ result,
683
+ models=None,
684
+ title="Models with Best Repetition Factor Adjusted Performance",
685
+ ref_result=None,
686
+ skip_generic_prompt=False,
687
+ include_adjusted_performance=True,
688
+ ):
689
+ # Initialize lists to store the statistics
690
+ model_names = []
691
+ best_f1 = []
692
+ best_afrp = []
693
+ best_repetition_penalty = []
694
+ best_bleu1 = []
695
+ best_rougeL = []
696
+
697
+ if models is None:
698
+ models = result.keys()
699
+ for model in models:
700
+ if skip_generic_prompt and "generic prompt" in model:
701
+ continue
702
+ print(f"model: {model}")
703
+ df = result[model]["df_overall"]
704
+
705
+ # Calculate the statistics
706
+ bleu1 = [x for x in df["bleu1"]]
707
+ rougeL = [x for x in df["rougeL"]]
708
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
709
+
710
+ newline_score = [
711
+ df["newline_score"].mean()
712
+ for df in result[model]["df_list_repetition_penalty"]
713
+ ]
714
+ print(f"newline_score: {newline_score}")
715
+
716
+ repetition_score = [
717
+ df["repetition_score"].mean()
718
+ for df in result[model]["df_list_repetition_penalty"]
719
+ ]
720
+ print(f"repetition_score: {repetition_score}")
721
+
722
+ afrp = [
723
+ f / math.log10(10 + n + r)
724
+ for f, n, r in zip(f1, newline_score, repetition_score)
725
+ ]
726
+
727
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
728
+ best_afrp_index = (
729
+ afrp.index(best_afrp[-1])
730
+ if include_adjusted_performance
731
+ else f1.index(best_afrp[-1])
732
+ )
733
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
734
+
735
+ best_f1.append(f1[best_afrp_index])
736
+ best_bleu1.append(bleu1[best_afrp_index])
737
+ best_rougeL.append(rougeL[best_afrp_index])
738
+
739
+ print(
740
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
741
+ )
742
+
743
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
744
+
745
+ model_names.append(
746
+ f"{model} (RP={best_repetition_penalty[-1]})"
747
+ ) # Add the model name to the list
748
+
749
+ if ref_result is not None:
750
+ print("ref_result:", ref_result)
751
+ for model in ref_result.keys():
752
+ model_names.append(model)
753
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
754
+ # df = df[df["id"].isin(wikidata_df["id"])]
755
+
756
+ p = df["bleu1"][0]
757
+ best_bleu1.append(p)
758
+
759
+ r = df["rougeL"][0]
760
+ best_rougeL.append(r)
761
+
762
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
763
+ best_f1.append(f1)
764
+ best_afrp.append(f1)
765
+
766
+ print("model_names:", model_names)
767
+ print("best_f1:", best_f1)
768
+ print("best_afrp:", best_afrp)
769
+
770
+ # Create a DataFrame with the statistics
771
+ data = (
772
+ pd.DataFrame(
773
+ {
774
+ "Model": model_names,
775
+ "Repetition Factor Adjusted Perf Score": best_afrp,
776
+ "Overall Perf Score": best_f1,
777
+ }
778
+ )
779
+ if include_adjusted_performance
780
+ else pd.DataFrame(
781
+ {
782
+ "Model": model_names,
783
+ "Bleu-1": best_bleu1,
784
+ "Rouge-L": best_rougeL,
785
+ "Overall Perf Score": best_f1,
786
+ }
787
+ )
788
+ )
789
+
790
+ # Melt the DataFrame to a long format
791
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
792
+
793
+ # Pivot the DataFrame to a wide format
794
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
795
+
796
+ # make sure the columns are following the order of the models
797
+ data_pivoted = data_pivoted[model_names]
798
+
799
+ columns = list(data.columns)
800
+ data_pivoted = data_pivoted.reindex(columns[1:])
801
+
802
+ # Plot the statistics
803
+ plt.figure(figsize=(10, 6))
804
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
805
+ plt.title(title)
806
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
807
+
808
+ # Set the rotation of the x-axis labels to 0 degrees
809
+ plt.xticks(rotation=0)
810
+
811
+ # Format the y-axis to display as percentage
812
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
813
+
814
+ # get the max value of the y-axis
815
+ a1 = max(best_afrp)
816
+ a2 = max(best_f1)
817
+ a3 = max(best_bleu1)
818
+ a4 = max(best_rougeL)
819
+
820
+ max_value = (
821
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
822
+ )
823
+ print("max_value:", max_value)
824
+
825
+ # Set the y-axis upper limit slightly above the tallest bar
826
+ ax.set_ylim(0, max_value)
827
+
828
+ # Add the values above each bar
829
+ for p in ax.patches:
830
+ ax.annotate(
831
+ f"{p.get_height() * 100:.1f}",
832
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
833
+ ha="center",
834
+ va="bottom",
835
+ xytext=(0, 10),
836
+ textcoords="offset points",
837
+ rotation=90,
838
+ )
839
+
840
+ plt.show()
841
+
842
+
843
+ all_open_source_models = [
844
+ "gemma-1.1-2b-it",
845
+ "Phi-3-mini-128k-instruct",
846
+ "gemma-1.1-7b-it",
847
+ "Llama-2-7b-chat-hf",
848
+ "Mistral-7B-Instruct-v0.2",
849
+ "Meta-Llama-3-8B-Instruct",
850
+ "Llama-2-13b-chat-hf",
851
+ "Llama-2-70b-chat-hf",
852
+ "Meta-Llama-3-70B-Instruct",
853
+ ]
854
+
855
+
856
+ non_rag_csv_result_files = [
857
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
858
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
859
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
860
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
861
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
862
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
863
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
864
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
865
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
866
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
867
+ ]
868
+
869
+ rag_csv_result_files = [
870
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
871
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
872
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
873
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
874
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
875
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
876
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
877
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
878
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
879
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
880
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
881
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
882
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
883
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
884
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
885
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
886
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
887
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
888
+ ]
889
+
890
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
891
+
892
+
893
+ def load_for_repetition_penalty_ms_macro(
894
+ csv_result_file, repetition_penalty, force_recalculate=False
895
+ ):
896
+ result_file = replace_last(
897
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
898
+ )
899
+ df = load_with_newline_and_repetition_scores(
900
+ result_file, force_recalculate=force_recalculate
901
+ )
902
+
903
+ if len(df) != len(df_ms_macro):
904
+ print(f"error: len(df) != {len(df_ms_macro)}")
905
+ missing_ids = [
906
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
907
+ ]
908
+ print(f"missing_ids: {missing_ids}")
909
+
910
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
911
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
912
+ print("ground_truth updated for:", result_file)
913
+ df.to_csv(result_file, index=False)
914
+ return df
915
+
916
+
917
+ # MS MARCO
918
+ def plot_performance_scores_ms_macro(
919
+ result,
920
+ models=None,
921
+ title="Performance",
922
+ ):
923
+
924
+ if models is None:
925
+ models = result.keys()
926
+ for model in models:
927
+ print(f"model: {model}")
928
+ df = result[model]["df_overall"]
929
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
930
+
931
+ # Calculate the statistics
932
+ bleu1 = list(df["bleu1"])
933
+ rougeL = list(df["rougeL"])
934
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
935
+ best_f1 = max(f1)
936
+ best_f1_index = f1.index(best_f1)
937
+
938
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
939
+ result[model], bleu1, rougeL
940
+ )
941
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
942
+
943
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
944
+ best_afrp = max(afrp)
945
+ best_afrp_index = afrp.index(best_afrp)
946
+
947
+ repetition_penalties = list(df["repetition_penalty"])
948
+
949
+ # line plot for precision, recall, f1
950
+ plt.figure(figsize=(10, 6))
951
+
952
+ plt.axvspan(
953
+ repetition_penalties[best_f1_index] - 0.01,
954
+ repetition_penalties[best_f1_index] + 0.01,
955
+ alpha=0.5,
956
+ edgecolor="none",
957
+ facecolor="blue",
958
+ )
959
+
960
+ plt.axvspan(
961
+ repetition_penalties[best_afrp_index] - 0.01,
962
+ repetition_penalties[best_afrp_index] + 0.01,
963
+ alpha=0.5,
964
+ edgecolor="none",
965
+ facecolor="orange",
966
+ )
967
+
968
+ plt.plot(
969
+ repetition_penalties,
970
+ f1,
971
+ label="Overall Perf Score",
972
+ marker="D",
973
+ color="blue",
974
+ )
975
+ plt.plot(
976
+ repetition_penalties,
977
+ afrp,
978
+ label="RF Adjusted Perf Score",
979
+ marker="o",
980
+ color="orange",
981
+ )
982
+
983
+ plt.xlabel("Repetition Penalties")
984
+ plt.ylabel("Score")
985
+ plt.xlim(0.99, 1.31)
986
+ # y in percentage
987
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
988
+ plt.title(f"{model} {title}")
989
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
990
+
991
+ plt.show()
992
+
993
+
994
+ def plot_repetition_factors(result, groups):
995
+ for group in groups:
996
+ # Plot the statistics
997
+ plt.figure(figsize=(10, 6))
998
+
999
+ max_value = 0
1000
+ for model in result.keys():
1001
+ if not group in model.lower():
1002
+ continue
1003
+ print(f"model: {model}")
1004
+ df = result[model]["df_overall"]
1005
+ repetition_penalties = [
1006
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1007
+ ]
1008
+
1009
+ mean_score = [
1010
+ math.log10(10 + df["total_repetitions"].mean())
1011
+ for df in result[model]["df_list_repetition_penalty"]
1012
+ ]
1013
+
1014
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1015
+
1016
+ new_max = max(mean_score)
1017
+ if new_max > max_value:
1018
+ max_value = new_max
1019
+
1020
+ max_value = max_value * 1.05
1021
+ if max_value < 1.5:
1022
+ max_value = 1.5
1023
+ # set ylimit
1024
+ plt.ylim(1, max_value)
1025
+
1026
+ # show grid
1027
+ plt.grid(True)
1028
+ plt.xlabel("Repetition Penalties")
1029
+ plt.ylabel("Repetition Factors")
1030
+ plt.title("Repetition Factors vs Repetition Penalties")
1031
+ plt.legend()
1032
+
1033
+ plt.show()
1034
+
1035
+
1036
+ def plot_repetition_factors_by_group(result, group_filter=None):
1037
+ markers = ["D", "o", "s", "x"]
1038
+ colors = ["blue", "orange", "green", "red"]
1039
+
1040
+ # Plot the statistics
1041
+ plt.figure(figsize=(10, 6))
1042
+ index = 0
1043
+ max_value = 0
1044
+
1045
+ for model in result.keys():
1046
+ if group_filter is not None and group_filter not in model:
1047
+ continue
1048
+
1049
+ print(f"model: {model}")
1050
+
1051
+ df = result[model]["df_overall"]
1052
+ repetition_penalties = [
1053
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1054
+ ]
1055
+
1056
+ # Calculate the statistics
1057
+ mean_score = [
1058
+ math.log10(10 + df["total_repetitions"].mean())
1059
+ for df in result[model]["df_list_repetition_penalty"]
1060
+ ]
1061
+ if len(mean_score) != len(repetition_penalties):
1062
+ print(
1063
+ f"model: {model} has different length of repetition penalties and mean score"
1064
+ )
1065
+ print("repetition_panelties:", len(repetition_panelties))
1066
+ print("mean_score:", len(mean_score))
1067
+ continue
1068
+
1069
+ new_max = max(mean_score)
1070
+ if new_max > max_value:
1071
+ max_value = new_max
1072
+
1073
+ sns.lineplot(
1074
+ x=repetition_penalties,
1075
+ y=mean_score,
1076
+ label=model,
1077
+ marker=markers[index],
1078
+ color=colors[index],
1079
+ )
1080
+
1081
+ index += 1
1082
+
1083
+ max_value = max_value * 1.05
1084
+ if max_value < 1.5:
1085
+ max_value = 1.5
1086
+ # set ylimit
1087
+ plt.ylim(1, max_value)
1088
+ max_value = 0
1089
+
1090
+ plt.xlabel("Repetition Penalties")
1091
+ plt.ylabel("Repetition Factors")
1092
+ plt.title("Repetition Factors vs Repetition Penalties")
1093
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1094
+
1095
+ plt.show()
eval_modules/calc_repetitions_v4.py ADDED
@@ -0,0 +1,1296 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+
11
+ print(f"loading: {__file__}")
12
+
13
+ # final version
14
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
15
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
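+ # Here the first pattern flags runs of five or more consecutive whitespace
+ # characters (not only newlines), and unlike calc_repetitions_v3.py there is no
+ # exception whitelist, so every back-to-back repeat is counted.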
16
+
17
+
18
+ # final version for repetition detection
19
+ def detect_repetitions(text, debug=False):
20
+ subtotals = [0, 0]
21
+
22
+ if isinstance(text, str):
23
+ patterns = [pattern_excessive_whitespaces, pattern_text_repetitions]
24
+ for i, pattern in enumerate(patterns):
25
+ if debug:
26
+ print(
27
+ f"----detect {'excessive whitespaces' if i == 0 else 'text repetitions'}----"
28
+ )
29
+ matches = pattern.finditer(text)
30
+ for match in matches:
31
+ if debug:
32
+ print(match)
33
+ for groupNum in range(0, len(match.groups())):
34
+ groupNum = groupNum + 1
35
+ print(
36
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
37
+ groupNum=groupNum,
38
+ start=match.start(groupNum),
39
+ end=match.end(groupNum),
40
+ group=match.group(groupNum),
41
+ )
42
+ )
43
+
44
+ start, end = match.span()
45
+ subtotals[i] += end - start
46
+
47
+ if i == 0 and subtotals[i] > 0:
48
+ text = pattern.sub("", text)
49
+ if debug:
50
+ print(f"removed excessive whitespaces: {subtotals[i]}")
51
+
52
+ result = (subtotals[0], subtotals[1], subtotals[0] + subtotals[1])
53
+
54
+ if debug:
55
+ print(result)
56
+ return result
57
+
58
+
59
+ def detect_excessive_whitespaces(text, debug=False):
60
+ return detect_repetitions(text, debug=debug)[0]
61
+
62
+
63
+ def detect_text_repetitions(text, debug=False):
64
+ return detect_repetitions(text, debug=debug)[1]
65
+
66
+
67
+ def detect_scores(text, debug=False):
68
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
69
+ text, debug=debug
70
+ )
71
+ return pd.Series([newline_score, repetition_score, total_repetitions])
72
+
73
+
74
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
75
+ print(f"loading result file: {result_file}")
76
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
77
+
78
+ if (
79
+ force_recalculate
80
+ or "newline_score" not in df.columns
81
+ or "repetition_score" not in df.columns
82
+ or "total_repetitions" not in df.columns
83
+ ):
84
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
85
+ "answer"
86
+ ].apply(detect_scores)
87
+ df.to_csv(result_file, index=False)
88
+
89
+ return df
90
+
91
+
92
+ def replace_last(source_string, old_string, new_string):
93
+ head, _sep, tail = source_string.rpartition(old_string)
94
+ return head + new_string + tail
95
+
96
+
97
+ def load_for_repetition_penalty(
98
+ csv_result_file, repetition_penalty, force_recalculate=False
99
+ ):
100
+ result_file = replace_last(
101
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
102
+ )
103
+ return load_with_newline_and_repetition_scores(
104
+ result_file, force_recalculate=force_recalculate
105
+ )
106
+
107
+
108
+ def calc_adjusted_performance(f, r):
109
+ return f / math.log10(10 + r)
110
+
111
+
112
+ def calculate_adjusted_performance(row):
113
+ r = row["total_repetitions"]
114
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
115
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
116
+ return pd.Series([adjusted_precision, adjusted_recall])
117
+
118
+
119
+ def load_performance_df(csv_result_file, repetition_penalty):
120
+ result_file = replace_last(
121
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
122
+ )
123
+ result_file = result_file.replace("/results/", "/eval/")
124
+ print(f"loading json file: {result_file}")
125
+ df = pd.read_json(result_file)
126
+
127
+ return df
128
+
129
+
130
+ def calculate_performance_score_v1(
131
+ csv_result_file, repetition_penalty, force_recalculate=False
132
+ ):
133
+ result_file = replace_last(
134
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
135
+ )
136
+ print(f"loading result file: {result_file}")
137
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
138
+
139
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
140
+ df.drop(
141
+ columns=[
142
+ "precision",
143
+ "recall",
144
+ "f1",
145
+ "f2",
146
+ "entities_in_answer",
147
+ "entities_in_question",
148
+ ],
149
+ errors="ignore",
150
+ inplace=True,
151
+ )
152
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
153
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
154
+ perf_df = filtered_df.reset_index(drop=True)
155
+ print(f"perf_df len: {len(perf_df)}")
156
+ # print(perf_df.head())
157
+
158
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
159
+
160
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
161
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
162
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
163
+
164
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
165
+ calculate_adjusted_performance, axis=1
166
+ )
167
+
168
+ df.to_csv(result_file, index=False)
169
+ print(f"performance scores saved to result file: {result_file}")
170
+
171
+ print(f"df len: {len(df)}")
172
+
173
+ return df
174
+
175
+
176
+ ref_df = pd.read_csv(
177
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
178
+ )
179
+
180
+
181
+ def calculate_performance_score(
182
+ csv_result_file, repetition_penalty, force_recalculate=False
183
+ ):
184
+ result_file = replace_last(
185
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
186
+ )
187
+
188
+ re_creating = False
189
+ if os.path.exists(result_file):
190
+ print(f"loading result file: {result_file}")
191
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
192
+ else:
193
+ print(f"re-creating result file: {result_file}")
194
+ df = pd.DataFrame()
195
+ force_recalculate = True
196
+
197
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
198
+ df.drop(
199
+ columns=[
200
+ "precision",
201
+ "recall",
202
+ "f1",
203
+ "f2",
204
+ "entities_in_answer",
205
+ "entities_in_question",
206
+ "word_count",
207
+ ],
208
+ errors="ignore",
209
+ inplace=True,
210
+ )
211
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
212
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
213
+ perf_df = filtered_df.reset_index(drop=True)
214
+ print(f"perf_df len: {len(perf_df)}")
215
+
216
+ if len(perf_df) != len(ref_df):
217
+ print(f"error: len(perf_df) != {len(ref_df)}")
218
+ missing_ids = [
219
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
220
+ ]
221
+ print(f"missing_ids: {missing_ids}")
222
+
223
+ # print(perf_df.head())
224
+
225
+ df["id"] = perf_df["id"]
226
+ df["question"] = perf_df["question"]
227
+ df["answer"] = perf_df["pred_answer"]
228
+ df["word_count"] = df["answer"].apply(
229
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
230
+ )
231
+ df["ground_truth"] = perf_df["ground_truth"]
232
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
233
+ "answer"
234
+ ].apply(detect_scores)
235
+
236
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
237
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
238
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
239
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
240
+
241
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
242
+ calculate_adjusted_performance, axis=1
243
+ )
244
+
245
+ df.to_csv(result_file, index=False)
246
+ print(f"performance scores saved to result file: {result_file}")
247
+
248
+ print(f"df len: {len(df)}")
249
+
250
+ return df
251
+
252
+
253
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
254
+ newline_score = [
255
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
256
+ ]
257
+
258
+ repetition_score = [
259
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
260
+ ]
261
+
262
+ precision = [
263
+ f / math.log10(10 + n + r)
264
+ for f, n, r in zip(precision, newline_score, repetition_score)
265
+ ]
266
+ recall = [
267
+ f / math.log10(10 + n + r)
268
+ for f, n, r in zip(recall, newline_score, repetition_score)
269
+ ]
270
+
271
+ return precision, recall
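+ # Precision and recall are dampened by the same log10(10 + mean repetitions) factor
+ # before the callers below recombine them into an F1-style score.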
272
+
273
+
274
+ def plot_performance_scores(
275
+ result,
276
+ models=None,
277
+ title="Performance",
278
+ ):
279
+
280
+ if models is None:
281
+ models = result.keys()
282
+ for model in models:
283
+ print(f"model: {model}")
284
+ df = result[model]["df_overall"]
285
+
286
+ # Calculate the statistics
287
+ precision = [
288
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
289
+ ]
290
+ recall = [
291
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
292
+ ]
293
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
294
+ best_f1 = max(f1)
295
+ best_f1_index = f1.index(best_f1)
296
+
297
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
298
+ result[model], precision, recall
299
+ )
300
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
301
+
302
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
303
+ best_afrp = max(afrp)
304
+ best_afrp_index = afrp.index(best_afrp)
305
+
306
+ adjusted_precision = [
307
+ df["adjusted_precision"].mean()
308
+ for df in result[model]["df_list_repetition_penalty"]
309
+ ]
310
+ adjusted_recall = [
311
+ df["adjusted_recall"].mean()
312
+ for df in result[model]["df_list_repetition_penalty"]
313
+ ]
314
+ afrp2 = [
315
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
316
+ ]
317
+ best_afrp2 = max(afrp2)
318
+ best_afrp2_index = afrp2.index(best_afrp2)
319
+
320
+ repetition_penalties = list(df["repetition_penalty"])
321
+
322
+ # line plot for precision, recall, f1
323
+ plt.figure(figsize=(10, 6))
324
+
325
+ plt.axvspan(
326
+ repetition_penalties[best_f1_index] - 0.01,
327
+ repetition_penalties[best_f1_index] + 0.01,
328
+ alpha=0.5,
329
+ edgecolor="none",
330
+ facecolor="blue",
331
+ )
332
+
333
+ # plt.axvspan(
334
+ # repetition_penalties[best_afrp2_index] - 0.01,
335
+ # repetition_penalties[best_afrp2_index] + 0.01,
336
+ # alpha=0.5,
337
+ # edgecolor="none",
338
+ # facecolor="green",
339
+ # )
340
+
341
+ plt.axvspan(
342
+ repetition_penalties[best_afrp_index] - 0.01,
343
+ repetition_penalties[best_afrp_index] + 0.01,
344
+ alpha=0.5,
345
+ edgecolor="none",
346
+ facecolor="orange",
347
+ )
348
+
349
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
350
+ # plt.plot(
351
+ # repetition_penalties,
352
+ # afrp2,
353
+ # label="Per-question RF Adjusted F1",
354
+ # marker="s",
355
+ # color="green",
356
+ # )
357
+ plt.plot(
358
+ repetition_penalties,
359
+ afrp,
360
+ label="RF Adjusted F1",
361
+ marker="o",
362
+ color="orange",
363
+ )
364
+ plt.xlabel("Repetition Penalties")
365
+ plt.ylabel("Score")
366
+ plt.xlim(0.99, 1.31)
367
+ # y in percentage
368
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
369
+ plt.title(f"{model} {title}")
370
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
371
+
372
+ plt.show()
373
+
374
+
375
+ def plot_best_afrp(
376
+ result,
377
+ models=None,
378
+ title="Models with Best Repetition Factor Adjusted F1",
379
+ ref_result=None,
380
+ ):
381
+ # Initialize lists to store the statistics
382
+ model_names = []
383
+ best_f1 = []
384
+ best_afrp = []
385
+ best_repetition_penalty = []
386
+ best_mtp = []
387
+
388
+ if models is None:
389
+ models = result.keys()
390
+ for model in models:
391
+ print(f"model: {model}")
392
+ df = result[model]["df_overall"]
393
+
394
+ # Calculate the statistics
395
+ precision = [
396
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
397
+ ]
398
+ recall = [
399
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
400
+ ]
401
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
402
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
403
+
404
+ newline_score = [
405
+ df["newline_score"].mean()
406
+ for df in result[model]["df_list_repetition_penalty"]
407
+ ]
408
+ # print(f"newline_score: {newline_score}")
409
+
410
+ repetition_score = [
411
+ df["repetition_score"].mean()
412
+ for df in result[model]["df_list_repetition_penalty"]
413
+ ]
414
+ # print(f"repetition_score: {repetition_score}")
415
+
416
+ afrp = [
417
+ f / math.log10(10 + n + r)
418
+ for f, n, r in zip(f1, newline_score, repetition_score)
419
+ ]
420
+
421
+ best_afrp.append(max(afrp))
422
+ best_afrp_index = afrp.index(best_afrp[-1])
423
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
424
+
425
+ best_f1.append(f1[best_afrp_index])
426
+ best_mtp.append(
427
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
428
+ )
429
+
430
+ # print(
431
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
432
+ # )
433
+
434
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
435
+
436
+ model_names.append(
437
+ f"{model} (RP={best_repetition_penalty[-1]})"
438
+ ) # Add the model name to the list
439
+
440
+ if ref_result is not None:
441
+ print("ref_result:", ref_result)
442
+ for model in ref_result.keys():
443
+ model_names.append(model)
444
+ df = pd.read_csv(ref_result[model])
445
+ # df = df[df["id"].isin(wikidata_df["id"])]
446
+
447
+ p = df["precision"].mean()
448
+ r = df["recall"].mean()
449
+
450
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
451
+ best_f1.append(f1)
452
+ best_afrp.append(f1)
453
+ best_mtp.append(0)
454
+
455
+ print("model_names:", model_names)
456
+ # print("best_f1:", best_f1)
457
+ # print("best_afrp:", best_afrp)
458
+
459
+ # Create a DataFrame with the statistics
460
+ data = pd.DataFrame(
461
+ {
462
+ "Model": model_names,
463
+ "Repetition Factor Adjusted F1": best_afrp,
464
+ "F1": best_f1,
465
+ }
466
+ )
467
+
468
+ # Melt the DataFrame to a long format
469
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
470
+
471
+ # Pivot the DataFrame to a wide format
472
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
473
+
474
+ # make sure the columns are following the order of the models
475
+ data_pivoted = data_pivoted[model_names]
476
+
477
+ # make sure three groups in the order of precision, recall, f1
478
+ data_pivoted = data_pivoted.reindex(["Repetition Factor Adjusted F1", "F1"])
479
+
480
+ # Plot the statistics
481
+ plt.figure(figsize=(15, 6))
482
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
483
+ plt.title(title)
484
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
485
+
486
+ # Set the rotation of the x-axis labels to 0 degrees
487
+ plt.xticks(rotation=0)
488
+
489
+ # Format the y-axis to display as percentage
490
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
491
+
492
+ # get the max value of the y-axis
493
+ a1 = max(best_afrp)
494
+ a2 = max(best_f1)
495
+
496
+ max_value = max([a1, a2]) * 1.12
497
+ print("max_value:", max_value)
498
+
499
+ # Set the y-axis limit to the max value plus ~12% headroom
500
+ ax.set_ylim(0, max_value)
501
+
502
+ # Add the values above each bar
503
+ for p in ax.patches:
504
+ ax.annotate(
505
+ f"{p.get_height() * 100:.1f}",
506
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
507
+ ha="center",
508
+ va="bottom",
509
+ xytext=(0, 10),
510
+ textcoords="offset points",
511
+ rotation=90,
512
+ )
513
+
514
+ plt.show()
515
+ return data_pivoted, best_mtp
516
+
517
+
518
+ def plot_best_performance(
519
+ result,
520
+ models=None,
521
+ title="Models with Best F1 Score",
522
+ adjusted_f1=False,
523
+ ref_result=None,
524
+ ):
525
+ # Initialize lists to store the statistics
526
+ model_names = []
527
+ best_precision = []
528
+ best_recall = []
529
+ best_f1 = []
530
+ best_repetition_penalty = []
531
+ best_mtp = []
532
+
533
+ if models is None:
534
+ models = result.keys()
535
+ for model in models:
536
+ print(f"model: {model}")
537
+ df = result[model]["df_overall"]
538
+
539
+ # Calculate the statistics
540
+ precision = [
541
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
542
+ ]
543
+ recall = [
544
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
545
+ ]
546
+ newline_score = [
547
+ df["newline_score"].mean()
548
+ for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+
551
+ repetition_score = [
552
+ df["repetition_score"].mean()
553
+ for df in result[model]["df_list_repetition_penalty"]
554
+ ]
555
+
556
+ if adjusted_f1:
557
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
558
+ result[model], precision, recall
559
+ )
560
+
561
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
562
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
563
+
564
+ best_f1.append(max(f1))
565
+ best_f1_index = f1.index(best_f1[-1])
566
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
567
+
568
+ best_precision.append(precision[best_f1_index])
569
+ best_recall.append(recall[best_f1_index])
570
+ best_mtp.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
571
+
572
+ print(
573
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
574
+ )
575
+
576
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
577
+
578
+ model_names.append(
579
+ f"{model} (RP={best_repetition_penalty[-1]})"
580
+ ) # Add the model name to the list
581
+
582
+ # print sum for columns: newline_score, repetition_score
583
+ print(
584
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
585
+ )
586
+
587
+ if ref_result is not None:
588
+ print("ref_result:", ref_result)
589
+ for model in ref_result.keys():
590
+ model_names.append(model)
591
+ df = pd.read_csv(ref_result[model])
592
+ # df = df[df["id"].isin(wikidata_df["id"])]
593
+
594
+ best_precision.append(df["precision"].mean())
595
+ best_recall.append(df["recall"].mean())
596
+ f1 = (
597
+ 2
598
+ * (best_precision[-1] * best_recall[-1])
599
+ / (best_precision[-1] + best_recall[-1])
600
+ )
601
+ # best_f1.append(df["f1"].mean())
602
+ best_f1.append(f1)
603
+ best_mtp.append(0)
604
+
605
+ # Create a DataFrame with the statistics
606
+ data = (
607
+ pd.DataFrame(
608
+ {
609
+ "Model": model_names,
610
+ "Adjusted Precision with RP": best_precision,
611
+ "Adjusted Recall with RP": best_recall,
612
+ "Adjusted F1 with RP": best_f1,
613
+ }
614
+ )
615
+ if adjusted_f1
616
+ else pd.DataFrame(
617
+ {
618
+ "Model": model_names,
619
+ "Precision": best_precision,
620
+ "Recall": best_recall,
621
+ "F1": best_f1,
622
+ }
623
+ )
624
+ )
625
+ columns = list(data.columns)
626
+
627
+ # Melt the DataFrame to a long format
628
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
629
+
630
+ # Pivot the DataFrame to a wide format
631
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
632
+
633
+ # make sure the columns are following the order of the models
634
+ data_pivoted = data_pivoted[model_names]
635
+
636
+ # make sure three groups in the order of precision, recall, f1
637
+ data_pivoted = data_pivoted.reindex(columns[1:])
638
+
639
+ # Plot the statistics
640
+ plt.figure(figsize=(10, 6))
641
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
642
+ plt.title(title)
643
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
644
+
645
+ # Set the rotation of the x-axis labels to 0 degrees
646
+ plt.xticks(rotation=0)
647
+
648
+ # Format the y-axis to display as percentage
649
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
650
+
651
+ # get the max value of the y-axis
652
+ a1 = max(best_precision)
653
+ a2 = max(best_recall)
654
+ a3 = max(best_f1)
655
+
656
+ max_value = max([a1, a2, a3]) * 1.12
657
+ print("max_value:", max_value)
658
+
659
+ # Set the y-axis limit to the max value plus ~12% headroom
660
+ ax.set_ylim(0, max_value)
661
+
662
+ # Add the values above each bar
663
+ for p in ax.patches:
664
+ ax.annotate(
665
+ f"{p.get_height() * 100:.1f}",
666
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
667
+ ha="center",
668
+ va="bottom",
669
+ xytext=(0, 10),
670
+ textcoords="offset points",
671
+ rotation=90,
672
+ )
673
+
674
+ plt.show()
675
+ return data_pivoted, best_mtp
676
+
677
+
678
+ def plot_best_performance_ms_macro(
679
+ result,
680
+ models=None,
681
+ title="Models with Best Repetition Factor Adjusted Performance",
682
+ ref_result=None,
683
+ skip_generic_prompt=False,
684
+ include_adjusted_performance=True,
685
+ ):
686
+ # Initialize lists to store the statistics
687
+ model_names = []
688
+ best_f1 = []
689
+ best_afrp = []
690
+ best_repetition_penalty = []
691
+ best_bleu1 = []
692
+ best_rougeL = []
693
+ best_mtp = []
694
+
695
+ if models is None:
696
+ models = result.keys()
697
+ for model in models:
698
+ if skip_generic_prompt and "generic prompt" in model:
699
+ continue
700
+ print(f"model: {model}")
701
+ df = result[model]["df_overall"]
702
+
703
+ # Calculate the statistics
704
+ bleu1 = [x for x in df["bleu1"]]
705
+ rougeL = [x for x in df["rougeL"]]
706
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
707
+
708
+ newline_score = [
709
+ df["newline_score"].mean()
710
+ for df in result[model]["df_list_repetition_penalty"]
711
+ ]
712
+ # print(f"newline_score: {newline_score}")
713
+
714
+ repetition_score = [
715
+ df["repetition_score"].mean()
716
+ for df in result[model]["df_list_repetition_penalty"]
717
+ ]
718
+ # print(f"repetition_score: {repetition_score}")
719
+
720
+ afrp = [
721
+ f / math.log10(10 + n + r)
722
+ for f, n, r in zip(f1, newline_score, repetition_score)
723
+ ]
724
+
725
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
726
+ best_afrp_index = (
727
+ afrp.index(best_afrp[-1])
728
+ if include_adjusted_performance
729
+ else f1.index(best_afrp[-1])
730
+ )
731
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
732
+
733
+ best_f1.append(f1[best_afrp_index])
734
+ best_bleu1.append(bleu1[best_afrp_index])
735
+ best_rougeL.append(rougeL[best_afrp_index])
736
+ best_mtp.append(
737
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
738
+ )
739
+
740
+ # print(
741
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
742
+ # )
743
+
744
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
745
+
746
+ model_names.append(
747
+ f"{model} (RP={best_repetition_penalty[-1]})"
748
+ ) # Add the model name to the list
749
+
750
+ if ref_result is not None:
751
+ print("ref_result:", ref_result)
752
+ for model in ref_result.keys():
753
+ model_names.append(model)
754
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
755
+ # df = df[df["id"].isin(wikidata_df["id"])]
756
+
757
+ p = df["bleu1"][0]
758
+ best_bleu1.append(p)
759
+
760
+ r = df["rougeL"][0]
761
+ best_rougeL.append(r)
762
+
763
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
764
+ best_f1.append(f1)
765
+ best_afrp.append(f1)
766
+ best_mtp.append(0)
767
+
768
+ # print("model_names:", model_names)
769
+ # print("best_f1:", best_f1)
770
+ # print("best_afrp:", best_afrp)
771
+
772
+ # Create a DataFrame with the statistics
773
+ data = (
774
+ pd.DataFrame(
775
+ {
776
+ "Model": model_names,
777
+ "Repetition Factor Adjusted Perf Score": best_afrp,
778
+ "Overall Perf Score": best_f1,
779
+ }
780
+ )
781
+ if include_adjusted_performance
782
+ else pd.DataFrame(
783
+ {
784
+ "Model": model_names,
785
+ "Bleu-1": best_bleu1,
786
+ "Rouge-L": best_rougeL,
787
+ "Overall Perf Score": best_f1,
788
+ }
789
+ )
790
+ )
791
+
792
+ # Melt the DataFrame to a long format
793
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
794
+
795
+ # Pivot the DataFrame to a wide format
796
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
797
+
798
+ # make sure the columns are following the order of the models
799
+ data_pivoted = data_pivoted[model_names]
800
+
801
+ columns = list(data.columns)
802
+ data_pivoted = data_pivoted.reindex(columns[1:])
803
+
804
+ # Plot the statistics
805
+ plt.figure(figsize=(10, 6))
806
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
807
+ plt.title(title)
808
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
809
+
810
+ # Set the rotation of the x-axis labels to 0 degrees
811
+ plt.xticks(rotation=0)
812
+
813
+ # Format the y-axis to display as percentage
814
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
815
+
816
+ # get the max value of the y-axis
817
+ a1 = max(best_afrp)
818
+ a2 = max(best_f1)
819
+ a3 = max(best_bleu1)
820
+ a4 = max(best_rougeL)
821
+
822
+ max_value = (
823
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
824
+ )
825
+ print("max_value:", max_value)
826
+
827
+ # Set the y-axis limit to the max value plus ~12% headroom
828
+ ax.set_ylim(0, max_value)
829
+
830
+ # Add the values above each bar
831
+ for p in ax.patches:
832
+ ax.annotate(
833
+ f"{p.get_height() * 100:.1f}",
834
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
835
+ ha="center",
836
+ va="bottom",
837
+ xytext=(0, 10),
838
+ textcoords="offset points",
839
+ rotation=90,
840
+ )
841
+
842
+ plt.show()
843
+ return data_pivoted, best_mtp
844
+
845
+
846
+ all_open_source_models = [
847
+ "gemma-1.1-2b-it",
848
+ "Phi-3-mini-128k-instruct",
849
+ "gemma-1.1-7b-it",
850
+ "Llama-2-7b-chat-hf",
851
+ "Mistral-7B-Instruct-v0.2",
852
+ "Meta-Llama-3-8B-Instruct",
853
+ "Llama-2-13b-chat-hf",
854
+ "Llama-2-70b-chat-hf",
855
+ "Meta-Llama-3-70B-Instruct",
856
+ ]
857
+
858
+
859
+ non_rag_csv_result_files = [
860
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
861
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
862
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
863
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
864
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
865
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
866
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
867
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
868
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
869
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
870
+ ]
871
+
872
+ rag_csv_result_files = [
873
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
874
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
875
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
876
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
877
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
878
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
879
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
880
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
881
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
882
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
883
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
884
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
885
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
886
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
887
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
888
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
889
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
890
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
891
+ ]
892
+
893
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
894
+
895
+
896
+ def load_for_repetition_penalty_ms_macro(
897
+ csv_result_file, repetition_penalty, force_recalculate=False
898
+ ):
899
+ result_file = replace_last(
900
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
901
+ )
902
+ df = load_with_newline_and_repetition_scores(
903
+ result_file, force_recalculate=force_recalculate
904
+ )
905
+
906
+ if len(df) != len(df_ms_macro):
907
+ print(f"error: len(df) != {len(df_ms_macro)}")
908
+ missing_ids = [
909
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
910
+ ]
911
+ print(f"missing_ids: {missing_ids}")
912
+
913
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
914
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
915
+ print("ground_truth updated for:", result_file)
916
+ df.to_csv(result_file, index=False)
917
+ return df
918
+
919
+
920
+ # MS MARCO
921
+ def plot_performance_scores_ms_macro(
922
+ result,
923
+ models=None,
924
+ title="Performance",
925
+ ):
926
+
927
+ if models is None:
928
+ models = result.keys()
929
+ for model in models:
930
+ print(f"model: {model}")
931
+ df = result[model]["df_overall"]
932
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
933
+
934
+ # Calculate the statistics
935
+ bleu1 = list(df["bleu1"])
936
+ rougeL = list(df["rougeL"])
937
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
938
+ best_f1 = max(f1)
939
+ best_f1_index = f1.index(best_f1)
940
+
941
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
942
+ result[model], bleu1, rougeL
943
+ )
944
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(bleu1, rougeL)]
945
+
946
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
947
+ best_afrp = max(afrp)
948
+ best_afrp_index = afrp.index(best_afrp)
949
+
950
+ repetition_penalties = list(df["repetition_penalty"])
951
+
952
+ # line plot for precision, recall, f1
953
+ plt.figure(figsize=(10, 6))
954
+
955
+ plt.axvspan(
956
+ repetition_penalties[best_f1_index] - 0.01,
957
+ repetition_penalties[best_f1_index] + 0.01,
958
+ alpha=0.5,
959
+ edgecolor="none",
960
+ facecolor="blue",
961
+ )
962
+
963
+ plt.axvspan(
964
+ repetition_penalties[best_afrp_index] - 0.01,
965
+ repetition_penalties[best_afrp_index] + 0.01,
966
+ alpha=0.5,
967
+ edgecolor="none",
968
+ facecolor="orange",
969
+ )
970
+
971
+ plt.plot(
972
+ repetition_penalties,
973
+ f1,
974
+ label="Overall Perf Score",
975
+ marker="D",
976
+ color="blue",
977
+ )
978
+ plt.plot(
979
+ repetition_penalties,
980
+ afrp,
981
+ label="RF Adjusted Perf Score",
982
+ marker="o",
983
+ color="orange",
984
+ )
985
+
986
+ plt.xlabel("Repetition Penalties")
987
+ plt.ylabel("Score")
988
+ plt.xlim(0.99, 1.31)
989
+ # y in percentage
990
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
991
+ plt.title(f"{model} {title}")
992
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
993
+
994
+ plt.show()
995
+
996
+
997
+ def plot_repetition_factors(result, groups):
998
+ for group in groups:
999
+ # Plot the statistics
1000
+ plt.figure(figsize=(10, 6))
1001
+
1002
+ max_value = 0
1003
+ for model in result.keys():
1004
+ if group not in model.lower():
1005
+ continue
1006
+ print(f"model: {model}")
1007
+ df = result[model]["df_overall"]
1008
+ repetition_penalties = [
1009
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1010
+ ]
1011
+
1012
+ mean_score = [
1013
+ math.log10(10 + df["total_repetitions"].mean())
1014
+ for df in result[model]["df_list_repetition_penalty"]
1015
+ ]
1016
+
1017
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1018
+
1019
+ new_max = max(mean_score)
1020
+ if new_max > max_value:
1021
+ max_value = new_max
1022
+
1023
+ max_value = max_value * 1.05
1024
+ if max_value < 1.5:
1025
+ max_value = 1.5
1026
+ # set ylimit
1027
+ plt.ylim(1, max_value)
1028
+
1029
+ # show grid
1030
+ plt.grid(True)
1031
+ plt.xlabel("Repetition Penalties")
1032
+ plt.ylabel("Repetition Factors")
1033
+ plt.title("Repetition Factors vs Repetition Penalties")
1034
+ plt.legend()
1035
+
1036
+ plt.show()
1037
+
1038
+
1039
+ def plot_repetition_factors_by_group(result, group_filter=None):
1040
+ markers = ["D", "o", "s", "x"]
1041
+ colors = ["blue", "orange", "green", "red"]
1042
+
1043
+ # Plot the statistics
1044
+ plt.figure(figsize=(10, 6))
1045
+ index = 0
1046
+ max_value = 0
1047
+
1048
+ for model in result.keys():
1049
+ if group_filter is not None and group_filter not in model:
1050
+ continue
1051
+
1052
+ print(f"model: {model}")
1053
+
1054
+ df = result[model]["df_overall"]
1055
+ repetition_penalties = [
1056
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1057
+ ]
1058
+
1059
+ # Calculate the statistics
1060
+ mean_score = [
1061
+ math.log10(10 + df["total_repetitions"].mean())
1062
+ for df in result[model]["df_list_repetition_penalty"]
1063
+ ]
1064
+ if len(mean_score) != len(repetition_penalties):
1065
+ print(
1066
+ f"model: {model} has different length of repetition penalties and mean score"
1067
+ )
1068
+ print("repetition_panelties:", len(repetition_panelties))
1069
+ print("mean_score:", len(mean_score))
1070
+ continue
1071
+
1072
+ new_max = max(mean_score)
1073
+ if new_max > max_value:
1074
+ max_value = new_max
1075
+
1076
+ sns.lineplot(
1077
+ x=repetition_penalties,
1078
+ y=mean_score,
1079
+ label=model,
1080
+ marker=markers[index],
1081
+ color=colors[index],
1082
+ )
1083
+
1084
+ index += 1
1085
+
1086
+ max_value = max_value * 1.05
1087
+ if max_value < 1.5:
1088
+ max_value = 1.5
1089
+ # set ylimit
1090
+ plt.ylim(1, max_value)
1091
+ max_value = 0
1092
+
1093
+ plt.xlabel("Repetition Penalties")
1094
+ plt.ylabel("Repetition Factors")
1095
+ plt.title("Repetition Factors vs Repetition Penalties")
1096
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1097
+
1098
+ plt.show()
1099
+
1100
+
1101
+ ms_marco_csv_result_files = [
1102
+ "data/results/gemma-1.1-2b-it_mm_true_false.csv",
1103
+ "data/results/gemma-1.1-2b-it_mm_true.csv",
1104
+ "data/results/gemma-1.1-2b-it_mm_true_false_non_rag.csv",
1105
+ "data/results/Phi-3-mini-128k-instruct_mm_false.csv",
1106
+ "data/results/Phi-3-mini-128k-instruct_mm_true.csv",
1107
+ "data/results/Phi-3-mini-128k-instruct_mm_non_rag.csv",
1108
+ "data/results/gemma-1.1-7b-it_mm_false.csv",
1109
+ "data/results/gemma-1.1-7b-it_mm_true.csv",
1110
+ "data/results/gemma-1.1-7b-it_mm_non_rag.csv",
1111
+ "data/results/Llama-2-7b-chat-hf_mm_true_false.csv",
1112
+ "data/results/Llama-2-7b-chat-hf_mm_true.csv",
1113
+ "data/results/Llama-2-7b-chat-hf_mm_true_false_non_rag.csv",
1114
+ "data/results/Mistral-7B-Instruct-v0.2_mm_false.csv",
1115
+ "data/results/Mistral-7B-Instruct-v0.2_mm_true.csv",
1116
+ "data/results/Mistral-7B-Instruct-v0.2_mm_non_rag.csv",
1117
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false.csv",
1118
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true.csv",
1119
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false_non_rag.csv",
1120
+ "data/results/Llama-2-13b-chat-hf_mm_false.csv",
1121
+ "data/results/Llama-2-13b-chat-hf_mm_true.csv",
1122
+ "data/results/Llama-2-13b-chat-hf_mm_non_rag.csv",
1123
+ "data/results/Llama-2-70b-chat-hf_mm_false.csv",
1124
+ "data/results/Llama-2-70b-chat-hf_mm_true.csv",
1125
+ "data/results/Llama-2-70b-chat-hf_mm_non_rag.csv",
1126
+ "data/results/Meta-Llama-3-70B-Instruct_mm_false.csv",
1127
+ "data/results/Meta-Llama-3-70B-Instruct_mm_true.csv",
1128
+ "data/results/Meta-Llama-3-70B-Instruct_mm_non_rag.csv",
1129
+ ]
1130
+
1131
+ webqsp_csv_result_files = []
1132
+ webqsp_model_result_counts = {}
1133
+
1134
+
1135
+ def find_model_name(file_path):
1136
+ df = pd.read_csv(file_path, comment="#", on_bad_lines="warn")
1137
+ return df["model"][0]
1138
+
1139
+
1140
+ def add_file(file):
1141
+ model_name = find_model_name(file)
1142
+ if "(generic prompt)" not in model_name:
1143
+ webqsp_csv_result_files.append(file)
1144
+ if model_name not in webqsp_model_result_counts:
1145
+ webqsp_model_result_counts[model_name] = 1
1146
+ else:
1147
+ webqsp_model_result_counts[model_name] += 1
1148
+
1149
+
1150
+ last_model_name = None
1151
+ non_rag_index = 0
1152
+
1153
+ for csv_result_file in rag_csv_result_files:
1154
+ try:
1155
+ model_name = find_model_name(csv_result_file)
1156
+ # print(f"processing model: {model_name} - {csv_result_file}")
1157
+
1158
+ if last_model_name != model_name and last_model_name is not None:
1159
+ while non_rag_index < len(non_rag_csv_result_files):
1160
+ # print(f"processing non-rag file - {file}")
1161
+ file = non_rag_csv_result_files[non_rag_index]
1162
+ non_model_name = find_model_name(file)
1163
+ if non_model_name.startswith(last_model_name):
1164
+ add_file(file)
1165
+ non_rag_index += 1
1166
+ else:
1167
+ break
1168
+
1169
+ add_file(csv_result_file)
1170
+ last_model_name = model_name
1171
+ except FileNotFoundError as e:
1172
+ print("\terror processing file: ", csv_result_file, e)
1173
+ continue
1174
+
1175
+ for file in non_rag_csv_result_files[non_rag_index:]:
1176
+ add_file(file)
1177
+
1178
+
1179
+ def calc_rap_scores(result, precision="precision", recall="recall"):
1180
+ newline_score = [
1181
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1182
+ ]
1183
+
1184
+ repetition_score = [
1185
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1186
+ ]
1187
+
1188
+ if precision in result["df_list_repetition_penalty"][0].columns:
1189
+ precision = [
1190
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1191
+ ]
1192
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1193
+ else:
1194
+ precision = result["df_overall"][precision]
1195
+ recall = result["df_overall"][recall]
1196
+
1197
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
1198
+
1199
+ rap = [
1200
+ f / math.log10(10 + n + r)
1201
+ for f, n, r in zip(f1, newline_score, repetition_score)
1202
+ ]
1203
+
1204
+ return newline_score, repetition_score, f1, rap
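+ # RAP is the F1-style combination of the two chosen metrics divided by
+ # log10(10 + mean newline score + mean repetition score), mirroring calc_adjusted_performance above.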
1205
+
1206
+
1207
+ def load_webqsp_result(csv_result_files, force_recalculate=False):
1208
+ model_name_exts = {
1209
+ "true": "(RAG - Chat Template)",
1210
+ "wd": "(RAG - Generic Prompt)",
1211
+ "rag": "(Non-RAG)",
1212
+ }
1213
+
1214
+ result = {}
1215
+ for i, csv_result_file in enumerate(csv_result_files):
1216
+ try:
1217
+ df = pd.read_csv(csv_result_file)
1218
+ parts = re.split(r"[_\.]", csv_result_file)
1219
+ if parts[-2] in model_name_exts.keys():
1220
+ key = parts[-2]
1221
+ elif csv_result_file in non_rag_csv_result_files:
1222
+ key = "rag"
1223
+ else:
1224
+ key = "wd"
1225
+ model_name = f'{df["model"][0]}{model_name_exts[key]}'
1226
+ dfs = [
1227
+ calculate_performance_score(
1228
+ csv_result_file,
1229
+ repetition_penalty,
1230
+ force_recalculate=force_recalculate,
1231
+ )
1232
+ for repetition_penalty in df["repetition_penalty"]
1233
+ ]
1234
+
1235
+ result[model_name] = {
1236
+ "df_overall": df,
1237
+ "df_list_repetition_penalty": dfs,
1238
+ "file": csv_result_file,
1239
+ }
1240
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1241
+ result[model_name]
1242
+ )
1243
+ df["newline_score"] = newline_score
1244
+ df["repetition_score"] = repetition_score
1245
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1246
+ df["perf"] = perf
1247
+ df["rap"] = rap
1248
+ except Exception as e:
1249
+ print(f"Error: {e}")
1250
+
1251
+ return result
1252
+
1253
+
1254
+ def load_ms_marco_result(csv_result_files, force_recalculate=False):
1255
+ model_name_exts = {
1256
+ "true": "(RAG - Chat Template)",
1257
+ "false": "(RAG - Generic Prompt)",
1258
+ "rag": "(Non-RAG)",
1259
+ }
1260
+
1261
+ result = {}
1262
+ for csv_result_file in csv_result_files:
1263
+ try:
1264
+ df = pd.read_csv(csv_result_file)
1265
+
1266
+ parts = re.split(r"[_\.]", csv_result_file)
1267
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1268
+
1269
+ print(f"\tmodel_name: {model_name}")
1270
+ dfs = [
1271
+ load_for_repetition_penalty_ms_macro(
1272
+ csv_result_file,
1273
+ repetition_penalty,
1274
+ force_recalculate=force_recalculate,
1275
+ )
1276
+ for repetition_penalty in df["repetition_penalty"]
1277
+ ]
1278
+ result[model_name] = {
1279
+ "df_overall": df,
1280
+ "df_list_repetition_penalty": dfs,
1281
+ "file": csv_result_file,
1282
+ }
1283
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1284
+ result[model_name],
1285
+ precision="bleu1",
1286
+ recall="rougeL",
1287
+ )
1288
+ df["newline_score"] = newline_score
1289
+ df["repetition_score"] = repetition_score
1290
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1291
+ df["perf"] = perf
1292
+ df["rap"] = rap
1293
+ except Exception as e:
1294
+ print(f"Error: {e}")
1295
+
1296
+ return result
eval_modules/calc_repetitions_v5.py ADDED
@@ -0,0 +1,1383 @@
1
+ import os
2
+ import re
3
+ import math
4
+ import pandas as pd
5
+ import numpy as np
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.ticker as mtick
8
+ import seaborn as sns
9
+ import nltk
10
+ import evaluate
11
+
12
+ meteor = evaluate.load("meteor")
13
+
14
+ print(f"loading: {__file__}")
15
+
16
+ # final version
17
+ pattern_excessive_whitespaces = re.compile(r"\s{5,}")
18
+ pattern_text_repetitions = re.compile(r"(.{5}.*)\s*((\1)\s*)+", re.M | re.DOTALL)
19
+
20
+
21
+ def del_excessive_whitespaces(text, debug=False):
22
+ count = 0
23
+
24
+ if isinstance(text, str):
25
+ if debug:
26
+ print("----detect excessive whitespaces----")
27
+ count = len(text)
28
+ text = pattern_excessive_whitespaces.sub("", text)
29
+ count -= len(text)
30
+ if debug and count:
31
+ print(f"removed excessive whitespaces: {count}")
32
+ return text, count
33
+
34
+
35
+ # final version for repetition detection
36
+ def detect_text_repetitions(text, debug=False):
37
+ count = 0
38
+
39
+ if isinstance(text, str):
40
+ if debug:
41
+ print("----detect text repetitions----")
42
+ matches = pattern_text_repetitions.finditer(text)
43
+ for match in matches:
44
+ if debug:
45
+ print(match)
46
+ for groupNum in range(0, len(match.groups())):
47
+ groupNum = groupNum + 1
48
+ print(
49
+ "Group {groupNum} found at {start}-{end}: `{group}`".format(
50
+ groupNum=groupNum,
51
+ start=match.start(groupNum),
52
+ end=match.end(groupNum),
53
+ group=match.group(groupNum),
54
+ )
55
+ )
56
+
57
+ start, end = match.span()
58
+ count += end - start
59
+
60
+ return count
61
+
62
+
63
+ def detect_repetitions(text, debug=False):
64
+ text, count_excessive_whitespaces = del_excessive_whitespaces(text, debug=debug)
65
+ count_text_repetitions = detect_text_repetitions(text, debug=debug)
66
+ total_repetitions = count_excessive_whitespaces + count_text_repetitions
67
+
68
+ result = (count_excessive_whitespaces, count_text_repetitions, total_repetitions)
69
+
70
+ if debug:
71
+ print(result)
72
+ return result
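+ # In this version excessive whitespace is stripped first and the repetition pattern runs on
+ # the cleaned text, so the two counts do not overlap.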
73
+
74
+
75
+ def detect_scores(text, debug=False):
76
+ newline_score, repetition_score, total_repetitions = detect_repetitions(
77
+ text, debug=debug
78
+ )
79
+ return pd.Series([newline_score, repetition_score, total_repetitions])
80
+
81
+
82
+ def load_with_newline_and_repetition_scores(result_file, force_recalculate=False):
83
+ print(f"loading result file: {result_file}")
84
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
85
+
86
+ if (
87
+ force_recalculate
88
+ or "newline_score" not in df.columns
89
+ or "repetition_score" not in df.columns
90
+ or "total_repetitions" not in df.columns
91
+ ):
92
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
93
+ "answer"
94
+ ].apply(detect_scores)
95
+ df.to_csv(result_file, index=False)
96
+
97
+ return df
98
+
99
+
100
+ def replace_last(source_string, old_string, new_string):
101
+ head, _sep, tail = source_string.rpartition(old_string)
102
+ return head + new_string + tail
103
+
104
+
105
+ def load_for_repetition_penalty(
106
+ csv_result_file, repetition_penalty, force_recalculate=False
107
+ ):
108
+ result_file = replace_last(
109
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
110
+ )
111
+ return load_with_newline_and_repetition_scores(
112
+ result_file, force_recalculate=force_recalculate
113
+ )
114
+
115
+
116
+ def calc_adjusted_performance(f, r):
117
+ return f / math.log10(10 + r)
118
+
119
+
120
+ def calculate_adjusted_performance(row):
121
+ r = row["total_repetitions"]
122
+ adjusted_precision = calc_adjusted_performance(row["precision"], r)
123
+ adjusted_recall = calc_adjusted_performance(row["recall"], r)
124
+ return pd.Series([adjusted_precision, adjusted_recall])
125
+
126
+
127
+ def load_performance_df(csv_result_file, repetition_penalty):
128
+ result_file = replace_last(
129
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}-t2_evaluated.json"
130
+ )
131
+ result_file = result_file.replace("/results/", "/eval/")
132
+ print(f"loading json file: {result_file}")
133
+ df = pd.read_json(result_file)
134
+
135
+ return df
136
+
137
+
138
+ def calculate_performance_score_v1(
139
+ csv_result_file, repetition_penalty, force_recalculate=False
140
+ ):
141
+ result_file = replace_last(
142
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
143
+ )
144
+ print(f"loading result file: {result_file}")
145
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
146
+
147
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
148
+ df.drop(
149
+ columns=[
150
+ "precision",
151
+ "recall",
152
+ "f1",
153
+ "f2",
154
+ "entities_in_answer",
155
+ "entities_in_question",
156
+ ],
157
+ errors="ignore",
158
+ inplace=True,
159
+ )
160
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
161
+ filtered_df = perf_df[perf_df["id"].isin(df["id"])]
162
+ perf_df = filtered_df.reset_index(drop=True)
163
+ print(f"perf_df len: {len(perf_df)}")
164
+ # print(perf_df.head())
165
+
166
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
167
+
168
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
169
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
170
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
171
+
172
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
173
+ calculate_adjusted_performance, axis=1
174
+ )
175
+
176
+ df.to_csv(result_file, index=False)
177
+ print(f"performance scores saved to result file: {result_file}")
178
+
179
+ print(f"df len: {len(df)}")
180
+
181
+ return df
182
+
183
+
184
+ ref_df = pd.read_csv(
185
+ "./data/results/gpt-3.5-turbo_non_rag.csv", comment="#", on_bad_lines="warn"
186
+ )
187
+
188
+
189
+ def calculate_performance_score(
190
+ csv_result_file, repetition_penalty, force_recalculate=False
191
+ ):
192
+ result_file = replace_last(
193
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
194
+ )
195
+
196
+ re_creating = False
197
+ if os.path.exists(result_file):
198
+ print(f"loading result file: {result_file}")
199
+ df = pd.read_csv(result_file, comment="#", on_bad_lines="warn")
200
+ else:
201
+ print(f"re-creating result file: {result_file}")
202
+ df = pd.DataFrame()
203
+ force_recalculate = True
204
+
205
+ if force_recalculate or "f2" in df.columns or "f1" not in df.columns:
206
+ df.drop(
207
+ columns=[
208
+ "precision",
209
+ "recall",
210
+ "f1",
211
+ "f2",
212
+ "entities_in_answer",
213
+ "entities_in_question",
214
+ "word_count",
215
+ ],
216
+ errors="ignore",
217
+ inplace=True,
218
+ )
219
+ perf_df = load_performance_df(csv_result_file, repetition_penalty)
220
+ filtered_df = perf_df[perf_df["id"].isin(ref_df["id"])]
221
+ perf_df = filtered_df.reset_index(drop=True)
222
+ print(f"perf_df len: {len(perf_df)}")
223
+
224
+ if len(perf_df) != len(ref_df):
225
+ print(f"error: len(perf_df) != {len(ref_df)}")
226
+ missing_ids = [
227
+ id for id in ref_df["id"].unique() if id not in perf_df["id"].unique()
228
+ ]
229
+ print(f"missing_ids: {missing_ids}")
230
+
231
+ # print(perf_df.head())
232
+
233
+ df["id"] = perf_df["id"]
234
+ df["question"] = perf_df["question"]
235
+ df["answer"] = perf_df["pred_answer"]
236
+ df["word_count"] = df["answer"].apply(
237
+ lambda x: len(nltk.word_tokenize(x)) if isinstance(x, str) else 0
238
+ )
239
+ df["ground_truth"] = perf_df["ground_truth"]
240
+ df[["newline_score", "repetition_score", "total_repetitions"]] = df[
241
+ "answer"
242
+ ].apply(detect_scores)
243
+
244
+ df["eval_gemini_1.0_pro"] = perf_df["eval_gemini_1.0_pro"]
245
+ df["precision"] = perf_df["score"].apply(lambda x: x[0])
246
+ df["recall"] = perf_df["score"].apply(lambda x: x[1])
247
+ df["f1"] = perf_df["score"].apply(lambda x: x[2])
248
+
249
+ df[["adjusted_precision", "adjusted_recall"]] = df.apply(
250
+ calculate_adjusted_performance, axis=1
251
+ )
252
+
253
+ df.to_csv(result_file, index=False)
254
+ print(f"performance scores saved to result file: {result_file}")
255
+
256
+ print(f"df len: {len(df)}")
257
+
258
+ return df
259
+
260
+
261
+ def adjust_perf_scores_with_repetition_penalty(result, precision, recall):
262
+ newline_score = [
263
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
264
+ ]
265
+
266
+ repetition_score = [
267
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
268
+ ]
269
+
270
+ precision = [
271
+ f / math.log10(10 + n + r)
272
+ for f, n, r in zip(precision, newline_score, repetition_score)
273
+ ]
274
+ recall = [
275
+ f / math.log10(10 + n + r)
276
+ for f, n, r in zip(recall, newline_score, repetition_score)
277
+ ]
278
+
279
+ return precision, recall
280
+
281
+
282
+ def plot_performance_scores(
283
+ result,
284
+ models=None,
285
+ title="Performance",
286
+ ):
287
+ if models is None:
288
+ models = result.keys()
289
+ for model in models:
290
+ print(f"model: {model}")
291
+ df = result[model]["df_overall"]
292
+
293
+ # Calculate the statistics
294
+ precision = [
295
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
296
+ ]
297
+ recall = [
298
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
299
+ ]
300
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
301
+ best_f1 = max(f1)
302
+ best_f1_index = f1.index(best_f1)
303
+
304
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
305
+ result[model], precision, recall
306
+ )
307
+ afrp = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
308
+
309
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
310
+ best_afrp = max(afrp)
311
+ best_afrp_index = afrp.index(best_afrp)
312
+
313
+ adjusted_precision = [
314
+ df["adjusted_precision"].mean()
315
+ for df in result[model]["df_list_repetition_penalty"]
316
+ ]
317
+ adjusted_recall = [
318
+ df["adjusted_recall"].mean()
319
+ for df in result[model]["df_list_repetition_penalty"]
320
+ ]
321
+ afrp2 = [
322
+ 2 * (p * r) / (p + r) for p, r in zip(adjusted_precision, adjusted_recall)
323
+ ]
324
+ best_afrp2 = max(afrp2)
325
+ best_afrp2_index = afrp2.index(best_afrp2)
326
+
327
+ repetition_penalties = list(df["repetition_penalty"])
328
+
329
+ # line plot for precision, recall, f1
330
+ plt.figure(figsize=(10, 6))
331
+
332
+ plt.axvspan(
333
+ repetition_penalties[best_f1_index] - 0.01,
334
+ repetition_penalties[best_f1_index] + 0.01,
335
+ alpha=0.5,
336
+ edgecolor="none",
337
+ facecolor="blue",
338
+ )
339
+
340
+ # plt.axvspan(
341
+ # repetition_penalties[best_afrp2_index] - 0.01,
342
+ # repetition_penalties[best_afrp2_index] + 0.01,
343
+ # alpha=0.5,
344
+ # edgecolor="none",
345
+ # facecolor="green",
346
+ # )
347
+
348
+ plt.axvspan(
349
+ repetition_penalties[best_afrp_index] - 0.01,
350
+ repetition_penalties[best_afrp_index] + 0.01,
351
+ alpha=0.5,
352
+ edgecolor="none",
353
+ facecolor="orange",
354
+ )
355
+
356
+ plt.plot(repetition_penalties, f1, label="F1", marker="D", color="blue")
357
+ # plt.plot(
358
+ # repetition_penalties,
359
+ # afrp2,
360
+ # label="Per-question RAP - F1",
361
+ # marker="s",
362
+ # color="green",
363
+ # )
364
+ plt.plot(
365
+ repetition_penalties,
366
+ afrp,
367
+ label="RAP - F1",
368
+ marker="o",
369
+ color="orange",
370
+ )
371
+ plt.xlabel("Repetition Penalties")
372
+ plt.ylabel("Score")
373
+ # plt.xlim(0.99, 1.31)
374
+ # y in percentage
375
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
376
+ plt.title(f"{model} {title}")
377
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
378
+
379
+ plt.show()
380
+
381
+
382
+ def plot_best_afrp(
383
+ result,
384
+ models=None,
385
+ title="Models with Best RAP - F1",
386
+ ref_result=None,
387
+ ):
388
+ # Initialize lists to store the statistics
389
+ model_names = []
390
+ best_f1 = []
391
+ best_afrp = []
392
+ best_repetition_penalty = []
393
+ best_mtr = []
394
+
395
+ if models is None:
396
+ models = result.keys()
397
+ for model in models:
398
+ print(f"model: {model}")
399
+ df = result[model]["df_overall"]
400
+
401
+ # Calculate the statistics
402
+ precision = [
403
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
404
+ ]
405
+ recall = [
406
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
407
+ ]
408
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
409
+ f1 = [2 * (p * r) / (p + r) for p, r in zip(precision, recall)]
410
+
411
+ newline_score = [
412
+ df["newline_score"].mean()
413
+ for df in result[model]["df_list_repetition_penalty"]
414
+ ]
415
+ # print(f"newline_score: {newline_score}")
416
+
417
+ repetition_score = [
418
+ df["repetition_score"].mean()
419
+ for df in result[model]["df_list_repetition_penalty"]
420
+ ]
421
+ # print(f"repetition_score: {repetition_score}")
422
+
423
+ afrp = [
424
+ f / math.log10(10 + n + r)
425
+ for f, n, r in zip(f1, newline_score, repetition_score)
426
+ ]
427
+
428
+ best_afrp.append(max(afrp))
429
+ best_afrp_index = afrp.index(best_afrp[-1])
430
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
431
+
432
+ best_f1.append(f1[best_afrp_index])
433
+ best_mtr.append(
434
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
435
+ )
436
+
437
+ # print(
438
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
439
+ # )
440
+
441
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
442
+
443
+ model_names.append(
444
+ f"{model} (RP={best_repetition_penalty[-1]})"
445
+ ) # Add the model name to the list
446
+
447
+ if ref_result is not None:
448
+ print("ref_result:", ref_result)
449
+ for model in ref_result.keys():
450
+ model_names.append(model)
451
+ df = pd.read_csv(ref_result[model])
452
+ # df = df[df["id"].isin(wikidata_df["id"])]
453
+
454
+ p = df["precision"].mean()
455
+ r = df["recall"].mean()
456
+
457
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
458
+ best_f1.append(f1)
459
+ best_afrp.append(f1)
460
+ best_mtr.append(0)
461
+
462
+ print("model_names:", model_names)
463
+ # print("best_f1:", best_f1)
464
+ # print("best_afrp:", best_afrp)
465
+
466
+ # Create a DataFrame with the statistics
467
+ data = pd.DataFrame(
468
+ {
469
+ "Model": model_names,
470
+ "RAP - F1": best_afrp,
471
+ "F1": best_f1,
472
+ }
473
+ )
474
+
475
+ # Melt the DataFrame to a long format
476
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
477
+
478
+ # Pivot the DataFrame to a wide format
479
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
480
+
481
+ # make sure the columns are following the order of the models
482
+ data_pivoted = data_pivoted[model_names]
483
+
484
+ # make sure three groups in the order of precision, recall, f1
485
+ data_pivoted = data_pivoted.reindex(["RAP - F1", "F1"])
486
+
487
+ # Plot the statistics
488
+ plt.figure(figsize=(15, 6))
489
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
490
+ plt.title(title)
491
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
492
+
493
+ # Set the rotation of the x-axis labels to 0 degrees
494
+ plt.xticks(rotation=0)
495
+
496
+ # Format the y-axis to display as percentage
497
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
498
+
499
+ # get the max value of the y-axis
500
+ a1 = max(best_afrp)
501
+ a2 = max(best_f1)
502
+
503
+ max_value = max([a1, a2]) * 1.12
504
+ print("max_value:", max_value)
505
+
506
+ # Set the y-axis limit to the max value plus ~12% headroom
507
+ ax.set_ylim(0, max_value)
508
+
509
+ # Add the values above each bar
510
+ for p in ax.patches:
511
+ ax.annotate(
512
+ f"{p.get_height() * 100:.1f}",
513
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
514
+ ha="center",
515
+ va="bottom",
516
+ xytext=(0, 10),
517
+ textcoords="offset points",
518
+ rotation=90,
519
+ )
520
+
521
+ plt.show()
522
+ return data_pivoted, best_mtr
523
+
524
+
525
+ def plot_best_performance(
526
+ result,
527
+ models=None,
528
+ title="Models with Best F1 Score",
529
+ adjusted_f1=False,
530
+ ref_result=None,
531
+ ):
532
+ # Initialize lists to store the statistics
533
+ model_names = []
534
+ best_precision = []
535
+ best_recall = []
536
+ best_f1 = []
537
+ best_repetition_penalty = []
538
+ best_mtr = []
539
+
540
+ if models is None:
541
+ models = result.keys()
542
+ for model in models:
543
+ print(f"model: {model}")
544
+ df = result[model]["df_overall"]
545
+
546
+ # Calculate the statistics
547
+ precision = [
548
+ df["precision"].mean() for df in result[model]["df_list_repetition_penalty"]
549
+ ]
550
+ recall = [
551
+ df["recall"].mean() for df in result[model]["df_list_repetition_penalty"]
552
+ ]
553
+ newline_score = [
554
+ df["newline_score"].mean()
555
+ for df in result[model]["df_list_repetition_penalty"]
556
+ ]
557
+
558
+ repetition_score = [
559
+ df["repetition_score"].mean()
560
+ for df in result[model]["df_list_repetition_penalty"]
561
+ ]
562
+
563
+ if adjusted_f1:
564
+ precision, recall = adjust_perf_scores_with_repetition_penalty(
565
+ result[model], precision, recall
566
+ )
567
+
568
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
569
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(precision, recall)]
570
+
571
+ best_f1.append(max(f1))
572
+ best_f1_index = f1.index(best_f1[-1])
573
+ best_repetition_penalty.append(df["repetition_penalty"][best_f1_index])
574
+
575
+ best_precision.append(precision[best_f1_index])
576
+ best_recall.append(recall[best_f1_index])
577
+ best_mtr.append(newline_score[best_f1_index] + repetition_score[best_f1_index])
578
+
579
+ print(
580
+ f"best repetition penalty: {best_repetition_penalty[-1]}, best f1: {best_f1[-1]}, precision: {best_precision[-1]}, recall: {best_recall[-1]}"
581
+ )
582
+
583
+ df = result[model]["df_list_repetition_penalty"][best_f1_index]
584
+
585
+ model_names.append(
586
+ f"{model} (RP={best_repetition_penalty[-1]})"
587
+ ) # Add the model name to the list
588
+
589
+ # print sum for columns: newline_score, repetition_score
590
+ print(
591
+ f"newline_score: {df['newline_score'].sum()}, repetition_score: {df['repetition_score'].sum()}"
592
+ )
593
+
594
+ if ref_result is not None:
595
+ print("ref_result:", ref_result)
596
+ for model in ref_result.keys():
597
+ model_names.append(model)
598
+ df = pd.read_csv(ref_result[model])
599
+ # df = df[df["id"].isin(wikidata_df["id"])]
600
+
601
+ best_precision.append(df["precision"].mean())
602
+ best_recall.append(df["recall"].mean())
603
+ f1 = (
604
+ 2
605
+ * (best_precision[-1] * best_recall[-1])
606
+ / (best_precision[-1] + best_recall[-1])
607
+ )
608
+ # best_f1.append(df["f1"].mean())
609
+ best_f1.append(f1)
610
+ best_mtr.append(0)
611
+
612
+ # Create a DataFrame with the statistics
613
+ data = (
614
+ pd.DataFrame(
615
+ {
616
+ "Model": model_names,
617
+ "Adjusted Precision with RP": best_precision,
618
+ "Adjusted Recall with RP": best_recall,
619
+ "Adjusted F1 with RP": best_f1,
620
+ }
621
+ )
622
+ if adjusted_f1
623
+ else pd.DataFrame(
624
+ {
625
+ "Model": model_names,
626
+ "Precision": best_precision,
627
+ "Recall": best_recall,
628
+ "F1": best_f1,
629
+ }
630
+ )
631
+ )
632
+ columns = list(data.columns)
633
+
634
+ # Melt the DataFrame to a long format
635
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
636
+
637
+ # Pivot the DataFrame to a wide format
638
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
639
+
640
+ # make sure the columns are following the order of the models
641
+ data_pivoted = data_pivoted[model_names]
642
+
643
+ # keep the metric rows in the order: precision, recall, F1
644
+ data_pivoted = data_pivoted.reindex(columns[1:])
645
+
646
+ # Plot the statistics
647
+ plt.figure(figsize=(10, 6))
648
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
649
+ plt.title(title)
650
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
651
+
652
+ # Set the rotation of the x-axis labels to 0 degrees
653
+ plt.xticks(rotation=0)
654
+
655
+ # Format the y-axis to display as percentage
656
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
657
+
658
+ # get the max value of the y-axis
659
+ a1 = max(best_precision)
660
+ a2 = max(best_recall)
661
+ a3 = max(best_f1)
662
+
663
+ max_value = max([a1, a2, a3]) * 1.12
664
+ print("max_value:", max_value)
665
+
666
+ # Set the y-axis limit with some headroom above the tallest bar
667
+ ax.set_ylim(0, max_value)
668
+
669
+ # Add the values above each bar
670
+ for p in ax.patches:
671
+ ax.annotate(
672
+ f"{p.get_height() * 100:.1f}",
673
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
674
+ ha="center",
675
+ va="bottom",
676
+ xytext=(0, 10),
677
+ textcoords="offset points",
678
+ rotation=90,
679
+ )
680
+
681
+ plt.show()
682
+ return data_pivoted, best_mtr
683
+
684
+
685
+ def plot_best_performance_ms_macro(
686
+ result,
687
+ models=None,
688
+ title="Models with Best RAP - Performance",
689
+ ref_result=None,
690
+ skip_generic_prompt=False,
691
+ include_adjusted_performance=True,
692
+ ):
693
+ # Initialize lists to store the statistics
694
+ model_names = []
695
+ best_f1 = []
696
+ best_afrp = []
697
+ best_repetition_penalty = []
698
+ best_bleu1 = []
699
+ best_rougeL = []
700
+ best_mtr = []
701
+
702
+ if models is None:
703
+ models = result.keys()
704
+ for model in models:
705
+ if skip_generic_prompt and "generic prompt" in model:
706
+ continue
707
+ print(f"model: {model}")
708
+ df = result[model]["df_overall"]
709
+
710
+ # Calculate the statistics
711
+ bleu1 = list(df["bleu1"])
712
+ rougeL = list(df["rougeL"])
713
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
714
+
715
+ newline_score = [
716
+ df["newline_score"].mean()
717
+ for df in result[model]["df_list_repetition_penalty"]
718
+ ]
719
+ # print(f"newline_score: {newline_score}")
720
+
721
+ repetition_score = [
722
+ df["repetition_score"].mean()
723
+ for df in result[model]["df_list_repetition_penalty"]
724
+ ]
725
+ # print(f"repetition_score: {repetition_score}")
726
+
727
+ afrp = [
728
+ f / math.log10(10 + n + r)
729
+ for f, n, r in zip(f1, newline_score, repetition_score)
730
+ ]
731
+
732
+ best_afrp.append(max(afrp if include_adjusted_performance else f1))
733
+ best_afrp_index = (
734
+ afrp.index(best_afrp[-1])
735
+ if include_adjusted_performance
736
+ else f1.index(best_afrp[-1])
737
+ )
738
+ best_repetition_penalty.append(df["repetition_penalty"][best_afrp_index])
739
+
740
+ best_f1.append(f1[best_afrp_index])
741
+ best_bleu1.append(bleu1[best_afrp_index])
742
+ best_rougeL.append(rougeL[best_afrp_index])
743
+ best_mtr.append(
744
+ newline_score[best_afrp_index] + repetition_score[best_afrp_index]
745
+ )
746
+
747
+ # print(
748
+ # f"best repetition penalty: {best_repetition_penalty[-1]}, best afrp: {best_afrp[-1]}, f1: {best_f1[-1]}"
749
+ # )
750
+
751
+ df = result[model]["df_list_repetition_penalty"][best_afrp_index]
752
+
753
+ model_names.append(
754
+ f"{model} (RP={best_repetition_penalty[-1]})"
755
+ ) # Add the model name to the list
756
+
757
+ if ref_result is not None:
758
+ print("ref_result:", ref_result)
759
+ for model in ref_result.keys():
760
+ model_names.append(model)
761
+ df = pd.read_csv(ref_result[model], comment="#", on_bad_lines="warn")
762
+ # df = df[df["id"].isin(wikidata_df["id"])]
763
+
764
+ p = df["bleu1"][0]
765
+ best_bleu1.append(p)
766
+
767
+ r = df["rougeL"][0]
768
+ best_rougeL.append(r)
769
+
770
+ f1 = 2 * p * r / (p + r) if p + r > 0 else 0
771
+ best_f1.append(f1)
772
+ best_afrp.append(f1)
773
+ best_mtr.append(0)
774
+
775
+ # print("model_names:", model_names)
776
+ # print("best_f1:", best_f1)
777
+ # print("best_afrp:", best_afrp)
778
+
779
+ # Create a DataFrame with the statistics
780
+ data = (
781
+ pd.DataFrame(
782
+ {
783
+ "Model": model_names,
784
+ "RAP - Perf Score": best_afrp,
785
+ "Overall Perf Score": best_f1,
786
+ }
787
+ )
788
+ if include_adjusted_performance
789
+ else pd.DataFrame(
790
+ {
791
+ "Model": model_names,
792
+ "Bleu-1": best_bleu1,
793
+ "Rouge-L": best_rougeL,
794
+ "Overall Perf Score": best_f1,
795
+ }
796
+ )
797
+ )
798
+
799
+ # Melt the DataFrame to a long format
800
+ data_melted = data.melt(id_vars="Model", var_name="Metric", value_name="Score")
801
+
802
+ # Pivot the DataFrame to a wide format
803
+ data_pivoted = data_melted.pivot(index="Metric", columns="Model", values="Score")
804
+
805
+ # make sure the columns are following the order of the models
806
+ data_pivoted = data_pivoted[model_names]
807
+
808
+ columns = list(data.columns)
809
+ data_pivoted = data_pivoted.reindex(columns[1:])
810
+
811
+ # Plot the statistics
812
+ plt.figure(figsize=(10, 6))
813
+ ax = data_pivoted.plot(kind="bar", ax=plt.gca(), width=0.9)
814
+ plt.title(title)
815
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
816
+
817
+ # Set the rotation of the x-axis labels to 0 degrees
818
+ plt.xticks(rotation=0)
819
+
820
+ # Format the y-axis to display as percentage
821
+ ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
822
+
823
+ # get the max value of the y-axis
824
+ a1 = max(best_afrp)
825
+ a2 = max(best_f1)
826
+ a3 = max(best_bleu1)
827
+ a4 = max(best_rougeL)
828
+
829
+ max_value = (
830
+ max([a1, a2] if include_adjusted_performance else [a1, a2, a3, a4]) * 1.12
831
+ )
832
+ print("max_value:", max_value)
833
+
834
+ # Set the y-axis limit with some headroom above the tallest bar
835
+ ax.set_ylim(0, max_value)
836
+
837
+ # Add the values above each bar
838
+ for p in ax.patches:
839
+ ax.annotate(
840
+ f"{p.get_height() * 100:.1f}",
841
+ (p.get_x() + p.get_width() / 2.0, p.get_height()),
842
+ ha="center",
843
+ va="bottom",
844
+ xytext=(0, 10),
845
+ textcoords="offset points",
846
+ rotation=90,
847
+ )
848
+
849
+ plt.show()
850
+ return data_pivoted, best_mtr
851
+
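# --- Editor's note: illustrative sketch, not part of this commit ---
# The "Overall Perf Score" used above is the harmonic mean of Bleu-1 and
# Rouge-L, guarded against the degenerate all-zero case; the values below
# are hypothetical.
bleu1_example, rougeL_example = 0.42, 0.38
overall_perf_example = (
    2 * bleu1_example * rougeL_example / (bleu1_example + rougeL_example)
    if bleu1_example + rougeL_example > 0
    else 0.0
)
print(f"Overall Perf Score: {overall_perf_example:.4f}")  # ~0.3990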
852
+
853
+ all_open_source_models = [
854
+ "gemma-1.1-2b-it",
855
+ "Phi-3-mini-128k-instruct",
856
+ "gemma-1.1-7b-it",
857
+ "Llama-2-7b-chat-hf",
858
+ "Mistral-7B-Instruct-v0.2",
859
+ "Meta-Llama-3-8B-Instruct",
860
+ "Llama-2-13b-chat-hf",
861
+ "Llama-2-70b-chat-hf",
862
+ "Meta-Llama-3-70B-Instruct",
863
+ ]
864
+
865
+
866
+ non_rag_csv_result_files = [
867
+ "./data/results/gemma-1.1-2b-it_wd_non_rag.csv", # gemma-1.1-2b-it
868
+ "./data/results/Phi-3-mini-128k-instruct_wd_non_rag_batch_16.csv", # Phi-3-mini-128k-instruct(batch size:16)
869
+ "./data/results/gemma-1.1-7b-it_wd_non_rag.csv", # gemma-1.1-7b-it
870
+ "./data/results/Tune_2024-04-09_09-19-22.csv", # Llama-2-7b-chat-hf
871
+ "./data/results/Tune_2024-04-16_12-24-27.csv.csv", # Mistral-7B-Instruct-v0.2
872
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_non_rag.csv", # Meta-Llama-3-8B-Instruct
873
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_1_non_rag.csv", # Meta-Llama-3-8B-Instruct
874
+ "./data/results/Tune_2024-04-10_16-53-38.csv", # Llama-2-13b-chat-hf
875
+ "./data/results/Llama-2-70b-chat-hf_wd_non_rag.csv", # Llama-2-70b-chat-hf
876
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_non_rag.csv", # Meta-Llama-3-70B-Instruct
877
+ ]
878
+
879
+ rag_csv_result_files = [
880
+ "./data/results/gemma-1.1-2b-it_wd.csv", # gemma-1.1-2b-it
881
+ "./data/results/gemma-1.1-2b-it_wd_true.csv", # gemma-1.1-2b-it(true)
882
+ "./data/results/Phi-3-mini-128k-instruct_wd_rag_batch_4.csv", # Phi-3-mini-128k-instruct(batch size:16)
883
+ "./data/results/Phi-3-mini-128k-instruct_wd_true.csv", # Phi-3-mini-128k-instruct(batch size:16)
884
+ "./data/results/gemma-1.1-7b-it_wd.csv", # gemma-1.1-7b-it
885
+ "./data/results/gemma-1.1-7b-it_wd_true.csv", # gemma-1.1-7b-it(true)
886
+ "./data/results/Tune_2024-03-20_15-35-37.csv", # Llama-2-7b-chat-hf
887
+ "./data/results/Llama-2-7b-chat-hf_wd_true.csv", # Llama-2-7b-chat-hf(true)
888
+ "./data/results/Tune_2024-03-29_11-28-20.csv", # Mistral-7B-Instruct-v0.2
889
+ "./data/results/Mistral-7B-Instruct-v0.2_wd_true.csv", # Mistral-7B-Instruct-v0.2(true)
890
+ "./data/results/Meta-Llama-3-8B-Instruct_wd.csv", # Meta-Llama-3-8b-instruct
891
+ "./data/results/Meta-Llama-3-8B-Instruct_wd_true.csv", # Meta-Llama-3-8b-instruct(true)
892
+ "./data/results/Tune_2024-03-25_23-32-57.csv", # Llama-2-13b-chat-hf
893
+ "./data/results/Llama-2-13b-chat-hf_wd_true.csv", # Llama-2-13b-chat-hf(true)
894
+ "./data/results/Llama-2-70b-chat-hf_wd.csv", # Llama-2-70b-chat-hf
895
+ "./data/results/Llama-2-70b-chat-hf_wd_true.csv", # Llama-2-70b-chat-hf
896
+ "./data/results/Meta-Llama-3-70B-Instruct_wd.csv", # Meta-Llama-3-70B-Instruct
897
+ "./data/results/Meta-Llama-3-70B-Instruct_wd_true.csv", # Meta-Llama-3-70B-Instruct(true)
898
+ ]
899
+
900
+ df_ms_macro = pd.read_json("./data/datasets/ms_macro.json")
901
+
902
+
903
+ def load_for_repetition_penalty_ms_macro(
904
+ csv_result_file, repetition_penalty, force_recalculate=False
905
+ ):
906
+ result_file = replace_last(
907
+ csv_result_file, ".csv", f"_RP_{repetition_penalty:.3f}.csv"
908
+ )
909
+ df = load_with_newline_and_repetition_scores(
910
+ result_file, force_recalculate=force_recalculate
911
+ )
912
+
913
+ if len(df) != len(df_ms_macro):
914
+ print(f"error: len(df) != {len(df_ms_macro)}")
915
+ missing_ids = [
916
+ id for id in df_ms_macro["id"].unique() if id not in df["id"].unique()
917
+ ]
918
+ print(f"missing_ids: {missing_ids}")
919
+
920
+ if df["ground_truth"][0] != str(df_ms_macro["wellFormedAnswers"][0]):
921
+ df["ground_truth"] = df_ms_macro["wellFormedAnswers"]
922
+ print("ground_truth updated for:", result_file)
923
+ df.to_csv(result_file, index=False)
924
+ return df
925
+
926
+
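# --- Editor's note: illustrative sketch, not part of this commit ---
# load_for_repetition_penalty_ms_macro reads one result file per repetition
# penalty by rewriting the trailing ".csv". The helper below only approximates
# what replace_last (defined earlier in this module) is assumed to do.
def _replace_last_sketch(text, old, new):
    head, sep, tail = text.rpartition(old)
    return head + new + tail if sep else text

print(_replace_last_sketch("data/results/gemma-1.1-2b-it_mm_true.csv", ".csv", f"_RP_{1.1:.3f}.csv"))
# -> data/results/gemma-1.1-2b-it_mm_true_RP_1.100.csv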
927
+ # MS MARCO
928
+ def plot_performance_scores_ms_macro(
929
+ result,
930
+ models=None,
931
+ title="Performance",
932
+ ):
933
+ if models is None:
934
+ models = result.keys()
935
+ for model in models:
936
+ print(f"model: {model}")
937
+ df = result[model]["df_overall"]
938
+ # print(result[model]["df_list_repetition_penalty"][0].describe())
939
+
940
+ # Calculate the statistics
941
+ bleu1 = list(df["bleu1"])
942
+ rougeL = list(df["rougeL"])
943
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
944
+ best_f1 = max(f1)
945
+ best_f1_index = f1.index(best_f1)
946
+
947
+ bleu1, rougeL = adjust_perf_scores_with_repetition_penalty(
948
+ result[model], bleu1, rougeL
949
+ )
950
+ afrp = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(bleu1, rougeL)]
951
+
952
+ # f1 = [df["f1"].mean() for df in result[model]["df_list_repetition_penalty"]]
953
+ best_afrp = max(afrp)
954
+ best_afrp_index = afrp.index(best_afrp)
955
+
956
+ repetition_penalties = list(df["repetition_penalty"])
957
+
958
+ # line plot for precision, recall, f1
959
+ plt.figure(figsize=(10, 6))
960
+
961
+ plt.axvspan(
962
+ repetition_penalties[best_f1_index] - 0.01,
963
+ repetition_penalties[best_f1_index] + 0.01,
964
+ alpha=0.5,
965
+ edgecolor="none",
966
+ facecolor="blue",
967
+ )
968
+
969
+ plt.axvspan(
970
+ repetition_penalties[best_afrp_index] - 0.01,
971
+ repetition_penalties[best_afrp_index] + 0.01,
972
+ alpha=0.5,
973
+ edgecolor="none",
974
+ facecolor="orange",
975
+ )
976
+
977
+ plt.plot(
978
+ repetition_penalties,
979
+ f1,
980
+ label="Overall Perf Score",
981
+ marker="D",
982
+ color="blue",
983
+ )
984
+ plt.plot(
985
+ repetition_penalties,
986
+ afrp,
987
+ label="RAP - Perf Score",
988
+ marker="o",
989
+ color="orange",
990
+ )
991
+
992
+ plt.xlabel("Repetition Penalties")
993
+ plt.ylabel("Score")
994
+ # plt.xlim(0.99, 1.31)
995
+ # y in percentage
996
+ plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
997
+ plt.title(f"{model} {title}")
998
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
999
+
1000
+ plt.show()
1001
+
1002
+
1003
+ def plot_repetition_factors(result, groups):
1004
+ for group in groups:
1005
+ # Plot the statistics
1006
+ plt.figure(figsize=(10, 6))
1007
+
1008
+ max_value = 0
1009
+ for model in result.keys():
1010
+ if group not in model.lower():
1011
+ continue
1012
+ print(f"model: {model}")
1013
+ df = result[model]["df_overall"]
1014
+ repetition_penalties = [
1015
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1016
+ ]
1017
+
1018
+ mean_score = [
1019
+ # math.log10(10 + df["total_repetitions"].mean())
1020
+ df["total_repetitions"].mean()
1021
+ for df in result[model]["df_list_repetition_penalty"]
1022
+ ]
1023
+
1024
+ sns.lineplot(x=repetition_penalties, y=mean_score, label=model)
1025
+
1026
+ new_max = max(mean_score)
1027
+ if new_max > max_value:
1028
+ max_value = new_max
1029
+
1030
+ max_value = max_value * 1.05
1031
+ # if max_value < 1.5:
1032
+ # max_value = 1.5
1033
+ # set ylimit
1034
+ plt.ylim(0, max_value)
1035
+
1036
+ # show grid
1037
+ plt.grid(True)
1038
+ plt.xlabel("Repetition Penalties")
1039
+ plt.ylabel("Mean Total Repetitions")
1040
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1041
+ plt.legend()
1042
+
1043
+ plt.show()
1044
+
1045
+
1046
+ def plot_repetition_factors_by_group(result, group_filter=None):
1047
+ markers = ["D", "o", "s", "x"]
1048
+ colors = ["blue", "orange", "green", "red"]
1049
+
1050
+ # Plot the statistics
1051
+ plt.figure(figsize=(10, 6))
1052
+ index = 0
1053
+ max_value = 0
1054
+
1055
+ for model in result.keys():
1056
+ if group_filter is not None and group_filter not in model:
1057
+ continue
1058
+
1059
+ print(f"model: {model}")
1060
+
1061
+ df = result[model]["df_overall"]
1062
+ repetition_penalties = [
1063
+ repetition_penalty for repetition_penalty in df["repetition_penalty"]
1064
+ ]
1065
+
1066
+ # Calculate the statistics
1067
+ mean_score = [
1068
+ # math.log10(10 + df["total_repetitions"].mean())
1069
+ df["total_repetitions"].mean()
1070
+ for df in result[model]["df_list_repetition_penalty"]
1071
+ ]
1072
+ if len(mean_score) != len(repetition_penalties):
1073
+ print(
1074
+ f"model: {model} has different length of repetition penalties and mean score"
1075
+ )
1076
+ print("repetition_panelties:", len(repetition_panelties))
1077
+ print("mean_score:", len(mean_score))
1078
+ continue
1079
+
1080
+ new_max = max(mean_score)
1081
+ if new_max > max_value:
1082
+ max_value = new_max
1083
+
1084
+ sns.lineplot(
1085
+ x=repetition_penalties,
1086
+ y=mean_score,
1087
+ label=model,
1088
+ marker=markers[index],
1089
+ color=colors[index],
1090
+ )
1091
+
1092
+ index += 1
1093
+
1094
+ max_value = max_value * 1.05
1095
+ # if max_value < 1.5:
1096
+ # max_value = 1.5
1097
+ # set ylimit
1098
+ plt.ylim(0, max_value)
1099
+ max_value = 0
1100
+
1101
+ plt.xlabel("Repetition Penalties")
1102
+ plt.ylabel("Mean Total Repetitions")
1103
+ plt.title("Mean Total Repetitions vs Repetition Penalties")
1104
+ plt.legend(bbox_to_anchor=(1.0, 0.5), loc="center left")
1105
+
1106
+ plt.show()
1107
+
1108
+
1109
+ ms_marco_csv_result_files = [
1110
+ "data/results/gemma-1.1-2b-it_mm_true_false.csv",
1111
+ "data/results/gemma-1.1-2b-it_mm_true.csv",
1112
+ "data/results/gemma-1.1-2b-it_mm_true_false_non_rag.csv",
1113
+ "data/results/Phi-3-mini-128k-instruct_mm_false.csv",
1114
+ "data/results/Phi-3-mini-128k-instruct_mm_true.csv",
1115
+ "data/results/Phi-3-mini-128k-instruct_mm_non_rag.csv",
1116
+ "data/results/gemma-1.1-7b-it_mm_false.csv",
1117
+ "data/results/gemma-1.1-7b-it_mm_true.csv",
1118
+ "data/results/gemma-1.1-7b-it_mm_non_rag.csv",
1119
+ "data/results/Llama-2-7b-chat-hf_mm_true_false.csv",
1120
+ "data/results/Llama-2-7b-chat-hf_mm_true.csv",
1121
+ "data/results/Llama-2-7b-chat-hf_mm_true_false_non_rag.csv",
1122
+ "data/results/Mistral-7B-Instruct-v0.2_mm_false.csv",
1123
+ "data/results/Mistral-7B-Instruct-v0.2_mm_true.csv",
1124
+ "data/results/Mistral-7B-Instruct-v0.2_mm_non_rag.csv",
1125
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false.csv",
1126
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true.csv",
1127
+ "data/results/Meta-Llama-3-8B-Instruct_mm_true_false_non_rag.csv",
1128
+ "data/results/Llama-2-13b-chat-hf_mm_false.csv",
1129
+ "data/results/Llama-2-13b-chat-hf_mm_true.csv",
1130
+ "data/results/Llama-2-13b-chat-hf_mm_non_rag.csv",
1131
+ "data/results/Llama-2-70b-chat-hf_mm_false.csv",
1132
+ "data/results/Llama-2-70b-chat-hf_mm_true.csv",
1133
+ "data/results/Llama-2-70b-chat-hf_mm_non_rag.csv",
1134
+ "data/results/Meta-Llama-3-70B-Instruct_mm_false.csv",
1135
+ "data/results/Meta-Llama-3-70B-Instruct_mm_true.csv",
1136
+ "data/results/Meta-Llama-3-70B-Instruct_mm_non_rag.csv",
1137
+ ]
1138
+
1139
+ webqsp_csv_result_files = []
1140
+ webqsp_model_result_counts = {}
1141
+
1142
+
1143
+ def find_model_name(file_path):
1144
+ df = pd.read_csv(file_path, comment="#", on_bad_lines="warn")
1145
+ return df["model"][0]
1146
+
1147
+
1148
+ def add_file(file):
1149
+ model_name = find_model_name(file)
1150
+ if "(generic prompt)" not in model_name:
1151
+ webqsp_csv_result_files.append(file)
1152
+ if model_name not in webqsp_model_result_counts:
1153
+ webqsp_model_result_counts[model_name] = 1
1154
+ else:
1155
+ webqsp_model_result_counts[model_name] += 1
1156
+
1157
+
1158
+ last_model_name = None
1159
+ non_rag_index = 0
1160
+
1161
+ for csv_result_file in rag_csv_result_files:
1162
+ try:
1163
+ model_name = find_model_name(csv_result_file)
1164
+ # print(f"processing model: {model_name} - {csv_result_file}")
1165
+
1166
+ if last_model_name != model_name and last_model_name is not None:
1167
+ while non_rag_index < len(non_rag_csv_result_files):
1168
+ # print(f"processing non-rag file - {file}")
1169
+ file = non_rag_csv_result_files[non_rag_index]
1170
+ non_model_name = find_model_name(file)
1171
+ if non_model_name.startswith(last_model_name):
1172
+ add_file(file)
1173
+ non_rag_index += 1
1174
+ else:
1175
+ break
1176
+
1177
+ add_file(csv_result_file)
1178
+ last_model_name = model_name
1179
+ except FileNotFoundError as e:
1180
+ print("\terror processing file: ", csv_result_file, e)
1181
+ continue
1182
+
1183
+ for file in non_rag_csv_result_files[non_rag_index:]:
1184
+ add_file(file)
1185
+
1186
+
1187
+ def calc_rap_scores(result, precision="precision", recall="recall"):
1188
+ newline_score = [
1189
+ df["newline_score"].mean() for df in result["df_list_repetition_penalty"]
1190
+ ]
1191
+
1192
+ repetition_score = [
1193
+ df["repetition_score"].mean() for df in result["df_list_repetition_penalty"]
1194
+ ]
1195
+
1196
+ if precision in result["df_list_repetition_penalty"][0].columns:
1197
+ precision = [
1198
+ df[precision].mean() for df in result["df_list_repetition_penalty"]
1199
+ ]
1200
+ recall = [df[recall].mean() for df in result["df_list_repetition_penalty"]]
1201
+ else:
1202
+ precision = result["df_overall"][precision]
1203
+ recall = result["df_overall"][recall]
1204
+
1205
+ f1 = [2 * (p * r) / (p + r) if p + r > 0 else 0 for p, r in zip(precision, recall)]
1206
+
1207
+ rap = [
1208
+ f / math.log10(10 + n + r)
1209
+ for f, n, r in zip(f1, newline_score, repetition_score)
1210
+ ]
1211
+
1212
+ return newline_score, repetition_score, f1, rap
1213
+
1214
+
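# --- Editor's note: illustrative sketch, not part of this commit ---
# RAP divides the performance score by log10(10 + mean repeated characters):
# a clean answer (0 repetitions) keeps its score since log10(10) == 1, while
# heavy repetition shrinks it. The numbers below are hypothetical.
import math

f1_example = 0.60        # hypothetical mean F1 at one repetition penalty
newline_chars = 15.0     # hypothetical mean newline_score
repeated_chars = 35.0    # hypothetical mean repetition_score
rap_example = f1_example / math.log10(10 + newline_chars + repeated_chars)
print(f"RAP = {rap_example:.4f}")  # 0.60 / log10(60) ≈ 0.3374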
1215
+ def load_webqsp_result(csv_result_files, force_recalculate=False):
1216
+ model_name_exts = {
1217
+ "true": "(RAG - Chat Template)",
1218
+ "wd": "(RAG - Generic Prompt)",
1219
+ "rag": "(Non-RAG)",
1220
+ }
1221
+
1222
+ result = {}
1223
+ for i, csv_result_file in enumerate(csv_result_files):
1224
+ try:
1225
+ df = pd.read_csv(csv_result_file)
1226
+ parts = re.split(r"[_\.]", csv_result_file)
1227
+ if parts[-2] in model_name_exts.keys():
1228
+ key = parts[-2]
1229
+ elif csv_result_file in non_rag_csv_result_files:
1230
+ key = "rag"
1231
+ else:
1232
+ key = "wd"
1233
+ model_name = f'{df["model"][0]}{model_name_exts[key]}'
1234
+ dfs = [
1235
+ calculate_performance_score(
1236
+ csv_result_file,
1237
+ repetition_penalty,
1238
+ force_recalculate=force_recalculate,
1239
+ )
1240
+ for repetition_penalty in df["repetition_penalty"]
1241
+ ]
1242
+
1243
+ answer_lens = []
1244
+ for df_rpp in dfs:
1245
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1246
+ lambda x: len(x) if isinstance(x, str) else 0
1247
+ )
1248
+ answer_lens.append(df_rpp["answer_len"].mean())
1249
+
1250
+ result[model_name] = {
1251
+ "df_overall": df,
1252
+ "df_list_repetition_penalty": dfs,
1253
+ "file": csv_result_file,
1254
+ }
1255
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1256
+ result[model_name]
1257
+ )
1258
+ df["newline_score"] = newline_score
1259
+ df["repetition_score"] = repetition_score
1260
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1261
+ df["answer_len"] = answer_lens
1262
+ df["perf"] = perf
1263
+ df["rap"] = rap
1264
+ except Exception as e:
1265
+ print(f"Error: {e}")
1266
+
1267
+ return result
1268
+
1269
+
1270
+ def load_ms_marco_result(csv_result_files, force_recalculate=False):
1271
+ model_name_exts = {
1272
+ "true": "(RAG - Chat Template)",
1273
+ "false": "(RAG - Generic Prompt)",
1274
+ "rag": "(Non-RAG)",
1275
+ }
1276
+
1277
+ result = {}
1278
+ for csv_result_file in csv_result_files:
1279
+ try:
1280
+ df = pd.read_csv(csv_result_file)
1281
+
1282
+ parts = re.split(r"[_\.]", csv_result_file)
1283
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1284
+
1285
+ print(f"\tmodel_name: {model_name}")
1286
+ dfs = [
1287
+ load_for_repetition_penalty_ms_macro(
1288
+ csv_result_file,
1289
+ repetition_penalty,
1290
+ force_recalculate=force_recalculate,
1291
+ )
1292
+ for repetition_penalty in df["repetition_penalty"]
1293
+ ]
1294
+
1295
+ answer_lens = []
1296
+ for df_rpp in dfs:
1297
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1298
+ lambda x: len(x) if isinstance(x, str) else 0
1299
+ )
1300
+ answer_lens.append(df_rpp["answer_len"].mean())
1301
+
1302
+ result[model_name] = {
1303
+ "df_overall": df,
1304
+ "df_list_repetition_penalty": dfs,
1305
+ "file": csv_result_file,
1306
+ }
1307
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1308
+ result[model_name],
1309
+ precision="bleu1",
1310
+ recall="rougeL",
1311
+ )
1312
+ df["newline_score"] = newline_score
1313
+ df["repetition_score"] = repetition_score
1314
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1315
+ df["answer_len"] = answer_lens
1316
+ df["perf"] = perf
1317
+ df["rap"] = rap
1318
+ except Exception as e:
1319
+ print(f"Error: {e}")
1320
+
1321
+ return result
1322
+
1323
+
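# --- Editor's note: illustrative sketch, not part of this commit ---
# The RAG/Non-RAG label comes from the token just before ".csv" in the result
# file name: "true" -> chat template, "false" -> generic prompt, and "rag"
# (the tail of "_non_rag") -> Non-RAG.
import re

parts_example = re.split(r"[_\.]", "data/results/gemma-1.1-2b-it_mm_true.csv")
print(parts_example[-2])  # -> "true", mapped to "(RAG - Chat Template)"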
1324
+ def load_ms_marco_result_v2(csv_result_files, force_recalculate=False):
1325
+ model_name_exts = {
1326
+ "true": "(RAG - Chat Template)",
1327
+ "false": "(RAG - Generic Prompt)",
1328
+ "rag": "(Non-RAG)",
1329
+ }
1330
+
1331
+ result = {}
1332
+ for csv_result_file in csv_result_files:
1333
+ try:
1334
+ df = pd.read_csv(csv_result_file)
1335
+
1336
+ parts = re.split(r"[_\.]", csv_result_file)
1337
+ model_name = f'{df["model"][0]}{model_name_exts[parts[-2]]}'
1338
+
1339
+ print(f"\tmodel_name: {model_name}")
1340
+ dfs = [
1341
+ load_for_repetition_penalty_ms_macro(
1342
+ csv_result_file,
1343
+ repetition_penalty,
1344
+ force_recalculate=force_recalculate,
1345
+ )
1346
+ for repetition_penalty in df["repetition_penalty"]
1347
+ ]
1348
+
1349
+ answer_lens = []
1350
+ for df_rpp in dfs:
1351
+ df_rpp["answer_len"] = df_rpp["answer"].apply(
1352
+ lambda x: len(x) if isinstance(x, str) else 0
1353
+ )
1354
+ answer_lens.append(df_rpp["answer_len"].mean())
1355
+ df["answer_len"] = answer_lens
1356
+
1357
+ meteor_scores = []
1358
+ for df_rpp in dfs:
1359
+ meteor_score = meteor.compute(
1360
+ predictions=df_rpp["answer"], references=df_rpp["ground_truth"]
1361
+ )["meteor"]
1362
+ meteor_scores.append(meteor_score)
1363
+ df["meteor_scores"] = meteor_scores
1364
+
1365
+ result[model_name] = {
1366
+ "df_overall": df,
1367
+ "df_list_repetition_penalty": dfs,
1368
+ "file": csv_result_file,
1369
+ }
1370
+ newline_score, repetition_score, perf, rap = calc_rap_scores(
1371
+ result[model_name],
1372
+ precision="meteor_scores",
1373
+ recall="meteor_scores",
1374
+ )
1375
+ df["newline_score"] = newline_score
1376
+ df["repetition_score"] = repetition_score
1377
+ df["total_repetitions"] = df["newline_score"] + df["repetition_score"]
1378
+ df["perf"] = perf
1379
+ df["rap"] = rap
1380
+ except Exception as e:
1381
+ print(f"Error: {e}")
1382
+
1383
+ return result
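# --- Editor's note: illustrative sketch, not part of this commit ---
# load_ms_marco_result_v2 scores every per-repetition-penalty dataframe with
# METEOR from the Hugging Face `evaluate` library; the module-level `meteor`
# object used above is assumed to come from evaluate.load("meteor").
import evaluate

meteor_metric = evaluate.load("meteor")
meteor_value = meteor_metric.compute(
    predictions=["Paris is the capital of France."],
    references=["The capital of France is Paris."],
)["meteor"]
print(f"METEOR: {meteor_value:.4f}")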
notebooks/00_Repetition_Algorithms_Comparison.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/04_RAPGeT_v2.ipynb ADDED
The diff for this file is too large to render. See raw diff