dh-mc committed on
Commit
7dab5c1
·
1 Parent(s): 0f5efb0

completed Qwen2-72B results

Browse files
llm_toolkit/translation_utils.py CHANGED
@@ -181,6 +181,24 @@ def count_entries_with_max_tokens(entries, max_tokens):
181
  return count
182
 
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  def get_metrics(df, max_output_tokens=2048):
185
  metrics_df = pd.DataFrame(df.columns.T)[2:]
186
  metrics_df.rename(columns={0: "model"}, inplace=True)
@@ -199,9 +217,18 @@ def get_metrics(df, max_output_tokens=2048):
199
  ews_score = []
200
  repetition_score = []
201
  total_repetitions = []
202
- num_entries_with_max_output_tokens = []
 
203
 
204
- for col in df.columns[2:]:
 
 
 
 
 
 
 
 
205
  metrics = calc_metrics(df["english"], df[col], debug=True)
206
  print(f"{col}: {metrics}")
207
 
@@ -209,8 +236,8 @@ def get_metrics(df, max_output_tokens=2048):
209
  bleu_1.append(metrics["bleu_scores"]["bleu"])
210
  rouge_l.append(metrics["rouge_scores"]["rougeL"])
211
 
212
- df[["ews_score", "repetition_score", "total_repetitions"]] = df[col].apply(
213
- detect_scores
214
  )
215
  ews_score.append(df["ews_score"].mean())
216
  repetition_score.append(df["repetition_score"].mean())
@@ -223,10 +250,10 @@ def get_metrics(df, max_output_tokens=2048):
223
  lambda x: len(tokenizers[model](x)["input_ids"])
224
  )
225
 
226
- new_col = f"output_tokens-{model}"
227
  df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
228
 
229
- num_entries_with_max_output_tokens.append(
230
  count_entries_with_max_tokens(df[new_col], max_output_tokens)
231
  )
232
 
@@ -236,14 +263,12 @@ def get_metrics(df, max_output_tokens=2048):
236
  metrics_df["ews_score"] = ews_score
237
  metrics_df["repetition_score"] = repetition_score
238
  metrics_df["total_repetitions"] = total_repetitions
239
- metrics_df["num_entries_with_max_output_tokens"] = (
240
- num_entries_with_max_output_tokens
241
- )
242
-
243
  metrics_df["rap"] = metrics_df.apply(
244
  lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
245
  )
246
 
 
 
247
  return metrics_df
248
 
249
 
 
181
  return count
182
 
183
 
184
def detect_repetition_scores(row, col, debug=False):
    """Return the repetition scores of ``row[col]`` in excess of the ground truth.

    Runs ``detect_repetitions`` on the text stored in ``row[col]`` and
    subtracts the baseline values already present on the same row under
    ``ground_truth_ews_score``, ``ground_truth_repetition_score`` and
    ``ground_truth_total_repetitions``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) holding the text to
            score under ``col`` plus the three ``ground_truth_*`` values.
        col: Key of the text in ``row`` to be scored.
        debug: Forwarded unchanged to ``detect_repetitions``.

    Returns:
        A ``pd.Series`` of three values, in the order returned by
        ``detect_repetitions``, each reduced by its ground-truth counterpart.
        Any difference that is not strictly positive is replaced by ``0``.
    """
    ews, repetition, total = detect_repetitions(row[col], debug=debug)

    # Only repetition beyond what the reference text already has is counted.
    excess = (
        ews - row["ground_truth_ews_score"],
        repetition - row["ground_truth_repetition_score"],
        total - row["ground_truth_total_repetitions"],
    )

    # Keep the explicit "> 0" comparison (rather than max()) so that values
    # that do not compare greater than zero always collapse to 0.
    return pd.Series([value if value > 0 else 0 for value in excess])
200
+
201
+
202
  def get_metrics(df, max_output_tokens=2048):
203
  metrics_df = pd.DataFrame(df.columns.T)[2:]
204
  metrics_df.rename(columns={0: "model"}, inplace=True)
 
217
  ews_score = []
218
  repetition_score = []
219
  total_repetitions = []
220
+ num_max_output_tokens = []
221
+ columns = df.columns[2:]
222
 
223
+ df[
224
+ [
225
+ "ground_truth_ews_score",
226
+ "ground_truth_repetition_score",
227
+ "ground_truth_total_repetitions",
228
+ ]
229
+ ] = df["english"].apply(detect_scores)
230
+
231
+ for col in columns:
232
  metrics = calc_metrics(df["english"], df[col], debug=True)
233
  print(f"{col}: {metrics}")
234
 
 
236
  bleu_1.append(metrics["bleu_scores"]["bleu"])
237
  rouge_l.append(metrics["rouge_scores"]["rougeL"])
238
 
239
+ df[["ews_score", "repetition_score", "total_repetitions"]] = df.apply(
240
+ lambda x: detect_repetition_scores(x, col), axis=1
241
  )
242
  ews_score.append(df["ews_score"].mean())
243
  repetition_score.append(df["repetition_score"].mean())
 
250
  lambda x: len(tokenizers[model](x)["input_ids"])
251
  )
252
 
253
+ new_col = f"output_tokens-{col}"
254
  df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
255
 
256
+ num_max_output_tokens.append(
257
  count_entries_with_max_tokens(df[new_col], max_output_tokens)
258
  )
259
 
 
263
  metrics_df["ews_score"] = ews_score
264
  metrics_df["repetition_score"] = repetition_score
265
  metrics_df["total_repetitions"] = total_repetitions
 
 
 
 
266
  metrics_df["rap"] = metrics_df.apply(
267
  lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
268
  )
269
 
270
+ metrics_df["num_max_output_tokens"] = num_max_output_tokens
271
+
272
  return metrics_df
273
 
274
 
notebooks/00_Data Analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
results/mac-results.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed53200b9b2d65ee34d37affd3deb1057b14d3fc8c509d6889d65175fc31a1c6
3
- size 12273645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae923aacf82b91408bbdead4a5add27bf3ca463af2a4ea18824647efa4c6df47
3
+ size 12132300
results/mac-results_metrics.csv ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,rpp,meteor,bleu_1,rouge_l,ews_score,repetition_score,total_repetitions,rap,num_max_output_tokens
2
+ 01-ai/Yi-1.5-9B-Chat,1.00,0.3463725436435439,0.09312113035602035,0.33279900782244626,0.0,0.35127978817299205,0.35127978817299205,0.34125573890735983,2
3
+ 01-ai/Yi-1.5-9B-Chat,1.02,0.3471185374158656,0.09126513887574451,0.3329223332114953,0.0,0.264783759929391,0.264783759929391,0.3432230230787291,4
4
+ 01-ai/Yi-1.5-9B-Chat,1.04,0.3471882673119874,0.09019886552461354,0.3321870466419108,0.0,0.37775816416593117,0.37775816416593117,0.3416859125059273,8
5
+ 01-ai/Yi-1.5-9B-Chat,1.06,0.3475947948648639,0.09004996084071014,0.33143152244770613,0.0,0.46866725507502205,0.46866725507502205,0.34081549880066125,9
6
+ 01-ai/Yi-1.5-9B-Chat,1.08,0.34751102711658816,0.09004837948083254,0.33169125948525535,0.0,0.3115622241835834,0.3115622241835834,0.3429415142848335,4
7
+ 01-ai/Yi-1.5-9B-Chat,1.10,0.3483026982644252,0.08832819752923171,0.3313149728213199,0.0,0.265666372462489,0.265666372462489,0.3443811795403635,6
8
+ 01-ai/Yi-1.5-9B-Chat,1.12,0.34415914233475586,0.08286056438796492,0.3267174273303975,0.0,0.4315975286849073,0.4315975286849073,0.3379573469517685,11
9
+ 01-ai/Yi-1.5-9B-Chat,1.14,0.3423084527203913,0.07890758839182645,0.3253386227189031,0.06090026478375993,0.30185348631950576,0.3627537511032657,0.33709189557055336,16
10
+ 01-ai/Yi-1.5-9B-Chat,1.16,0.34089943422352015,0.07400825605871394,0.3224006234779672,0.02294792586054722,0.2833186231244484,0.3062665489849956,0.33649094177763084,22
11
+ 01-ai/Yi-1.5-9B-Chat,1.18,0.3365273541015462,0.06786555450924157,0.3143386280928703,0.0,0.30979699911738745,0.30979699911738745,0.33212663825955735,26
12
+ 01-ai/Yi-1.5-9B-Chat,1.20,0.3355307477803475,0.06314678954328107,0.31123860048630714,0.0,0.22241835834068843,0.22241835834068843,0.33235553904085485,36
13
+ Qwen/Qwen2-72B-Instruct,1.00,0.3928168861285181,0.12345162681603773,0.38390142579747066,0.0,0.17563989408649602,0.17563989408649602,0.389868803763904,0
14
+ Qwen/Qwen2-72B-Instruct,1.02,0.3936651928828143,0.12446659906815814,0.3839114141179235,0.0,0.147396293027361,0.147396293027361,0.39117939588436124,0
15
+ Qwen/Qwen2-72B-Instruct,1.04,0.39263683565035906,0.12496255366843562,0.38459581775094975,0.0,0.15798764342453664,0.15798764342453664,0.38998196316138,0
16
+ Qwen/Qwen2-72B-Instruct,1.06,0.39300072547277504,0.12419625082296233,0.3842780423867672,0.0,0.15798764342453664,0.15798764342453664,0.3903433924885219,0
17
+ Qwen/Qwen2-72B-Instruct,1.08,0.39260920223934465,0.12159917926987915,0.38447418699052216,0.0,0.15798764342453664,0.15798764342453664,0.3899545165977931,0
18
+ Qwen/Qwen2-72B-Instruct,1.10,0.3908754537278073,0.12158997920112467,0.382023720181386,0.0,0.1650485436893204,0.1650485436893204,0.38811615859747245,0
19
+ Qwen/Qwen2-72B-Instruct,1.12,0.3901405252446773,0.11689725142339027,0.3820602123452673,0.0,0.19505736981465135,0.19505736981465135,0.38689460563454847,1
20
+ Qwen/Qwen2-72B-Instruct,1.14,0.38779002122304185,0.1157566203933617,0.3781701938094464,0.0,0.1703442188879082,0.1703442188879082,0.38496604428211745,0
21
+ Qwen/Qwen2-72B-Instruct,1.16,0.3867065727555076,0.11470275348285906,0.37688474492298285,0.0,0.15975286849073256,0.15975286849073256,0.3840630117198038,1
22
+ Qwen/Qwen2-72B-Instruct,1.18,0.3836165246347915,0.10917281839005354,0.3747478193448276,0.0,0.14916151809355693,0.14916151809355693,0.3811655674825621,3
23
+ Qwen/Qwen2-72B-Instruct,1.20,0.37971211487547984,0.10498466771923821,0.37197960114090106,0.0,0.293909973521624,0.293909973521624,0.37499454632380763,6
24
+ Qwen/Qwen2-72B-Instruct,1.22,0.3774442827665068,0.10330617330273109,0.3699012784671952,0.06001765225066196,0.14563106796116504,0.205648720211827,0.37413668409339734,4
25
+ Qwen/Qwen2-72B-Instruct,1.24,0.37236689663431144,0.0975677163987442,0.3660038409634332,0.0,0.14210061782877317,0.14210061782877317,0.37009896173470336,7
26
+ Qwen/Qwen2-72B-Instruct,1.26,0.36709984654252126,0.09446497792017897,0.36287607522182885,0.00529567519858782,0.18623124448367168,0.1915269196822595,0.3640999339946495,9
27
+ Qwen/Qwen2-72B-Instruct,1.28,0.363672126987302,0.08776847871058446,0.35837939967274784,0.0,0.1262135922330097,0.1262135922330097,0.36170190642085254,11
28
+ Qwen/Qwen2-72B-Instruct,1.30,0.36072372289732685,0.07886593555129406,0.35434208671286505,0.3000882612533098,0.22065313327449249,0.5207413945278023,0.3529426277694367,27
29
+ Qwen/Qwen2-7B-Instruct,1.00,0.3788852766816091,0.11779757565648481,0.36942355750319633,0.0,0.0997352162400706,0.0997352162400706,0.37725929362438315,0
30
+ Qwen/Qwen2-7B-Instruct,1.02,0.37747961314136774,0.11676496093416945,0.3685378471451668,0.0,0.0997352162400706,0.0997352162400706,0.37585966247769537,0
31
+ Qwen/Qwen2-7B-Instruct,1.04,0.37726610964345525,0.1156236824251826,0.3673369615606667,0.0,0.09090909090909091,0.09090909090909091,0.37578914794963475,0
32
+ Qwen/Qwen2-7B-Instruct,1.06,0.37811123052366835,0.11425460970355292,0.3674510486875733,0.0,0.09002647837599294,0.09002647837599294,0.37664521213491803,0
33
+ Qwen/Qwen2-7B-Instruct,1.08,0.37410786991499057,0.11100236068028714,0.3629286781065867,0.0,0.0820829655781112,0.0820829655781112,0.37278438583748985,0
34
+ Qwen/Qwen2-7B-Instruct,1.10,0.3726776797510745,0.10913952265827548,0.35984560125194764,0.0,0.09090909090909091,0.09090909090909091,0.37121868133307606,0
35
+ Qwen/Qwen2-7B-Instruct,1.12,0.37111350769392976,0.10686948327720774,0.3573554465690655,0.0,0.09090909090909091,0.09090909090909091,0.36966063286390244,0
36
+ Qwen/Qwen2-7B-Instruct,1.14,0.36899468281467784,0.10532730577953538,0.3565920877170038,0.0,0.09090909090909091,0.09090909090909091,0.3675501029867792,0
37
+ Qwen/Qwen2-7B-Instruct,1.16,0.36740747078226954,0.10220163431022722,0.3538496075663,0.0,0.0997352162400706,0.0997352162400706,0.36583074463491855,0
38
+ Qwen/Qwen2-7B-Instruct,1.18,0.36291646275476663,0.09906644982386541,0.3499896954704415,0.0,0.0997352162400706,0.0997352162400706,0.36135900973153,0
39
+ Qwen/Qwen2-7B-Instruct,1.20,0.3602807930821352,0.09532425997250199,0.34611214566424575,0.0,0.11032656663724624,0.11032656663724624,0.3585721310217351,0
40
+ Qwen/Qwen2-7B-Instruct,1.22,0.35715128912133703,0.0920447399321579,0.3441700414615685,0.0,0.08561341571050309,0.08561341571050309,0.3558338791174631,0
41
+ Qwen/Qwen2-7B-Instruct,1.24,0.352755026120472,0.08591470945904531,0.3394417789236487,0.0,0.08561341571050309,0.08561341571050309,0.3514538324401377,0
42
+ Qwen/Qwen2-7B-Instruct,1.26,0.3483233677173315,0.07972359456247886,0.33446894013409734,0.0,0.08561341571050309,0.08561341571050309,0.3470385209221742,0
43
+ Qwen/Qwen2-7B-Instruct,1.28,0.34450122231539704,0.07518096876457613,0.33039776348046035,0.0,0.09179170344218888,0.09179170344218888,0.34313954918633316,1
44
+ Qwen/Qwen2-7B-Instruct,1.30,0.3401098279932269,0.07026740554261787,0.3254126623075102,0.0,0.09002647837599294,0.09002647837599294,0.3387911491977248,3
45
+ shenzhi-wang/Llama3.1-70B-Chinese-Chat,1.00,0.3815423445635067,0.11524878188694271,0.36996911815693134,0.0,0.1968225948808473,0.1968225948808473,0.37833975022913946,0
46
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.00,0.3579680086793429,0.10207096308148353,0.3459640175229714,0.0,0.19240953221535745,0.19240953221535745,0.3550294775004645,0
47
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.02,0.3572226770743513,0.10061303169730976,0.3448975271352682,0.0,0.1615180935569285,0.1615180935569285,0.3547540871288482,0
48
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.04,0.35670586983276636,0.10074138007196803,0.3449144632118436,0.0,0.1615180935569285,0.1615180935569285,0.3542408512875192,0
49
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.06,0.35549318326656437,0.0998891248706679,0.34322366122312886,0.0,0.15975286849073256,0.15975286849073256,0.35306299977874456,0
50
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.08,0.3549933805160392,0.09858894278315135,0.3430005301668489,0.0,0.1526919682259488,0.1526919682259488,0.35267236961581055,0
51
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.10,0.3534792705039357,0.09604337437044752,0.3415984757584558,0.0,0.12886142983230361,0.12886142983230361,0.35152457252197683,0
52
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.12,0.35134989369086755,0.09466593964355864,0.3402757462396958,0.0,0.13592233009708737,0.13592233009708737,0.3493018413002178,0
53
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.14,0.3523467471502627,0.09455136235619709,0.34011481374007896,0.0,0.15710503089143865,0.15710503089143865,0.34997741984243846,0
54
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.16,0.35026382260485167,0.09291738095604976,0.33774387946289774,0.0,0.17299205648720212,0.17299205648720212,0.34767410066010357,0
55
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.18,0.3489231946755186,0.09267866809703615,0.33706232254061985,0.0,0.18711385701676964,0.18711385701676964,0.3461363951386201,0
56
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.20,0.3481931091877492,0.09121903225057944,0.33581387922683614,0.0,0.2003530450132392,0.2003530450132392,0.34521897747455255,0
57
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.22,0.34604714296451533,0.08945165053230478,0.33477451607196596,0.0,0.18711385701676964,0.18711385701676964,0.34328331404034373,1
58
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.24,0.3441084154272239,0.0880200303756021,0.33240744238972986,0.0,0.20123565754633715,0.20123565754633715,0.34115646351180073,1
59
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.26,0.3434534163683513,0.08571979267389605,0.33010684388262634,0.0,0.17828773168578993,0.17828773168578993,0.34083758204748993,0
60
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.28,0.34008394315191964,0.08346595677194628,0.3274872658281857,0.0,0.19858781994704325,0.19858781994704325,0.33720420576281357,2
61
+ shenzhi-wang/Llama3.1-8B-Chinese-Chat,1.30,0.3385373237572206,0.08244181010811574,0.32639975491275247,0.0,0.18005295675198588,0.18005295675198588,0.33593382411969774,0
62
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.00,0.3256642047768536,0.08331314362646546,0.3158704402692302,0.0,0.16857899382171226,0.16857899382171226,0.3233168382824646,0
63
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.02,0.3261638331201866,0.08437219278343962,0.3157031443159532,0.0,0.17210944395410416,0.17210944395410416,0.3237644072634129,0
64
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.04,0.3261267542205407,0.0841026780937562,0.31557218541916426,0.0,0.9267431597528685,0.9267431597528685,0.3140391563308914,1
65
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.06,0.32610191030444663,0.08440911364941035,0.315363371759167,0.0,0.7917034421888791,0.7917034421888791,0.3156568252183852,1
66
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.08,0.32519072627069395,0.08573531403311445,0.3153935939954504,0.0,0.7917034421888791,0.7917034421888791,0.31477482652351235,1
67
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.10,0.32510929376904546,0.08572184129459336,0.31430790316573815,0.0,0.2921447484554281,0.2921447484554281,0.3210937323218641,0
68
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.12,0.325321692973156,0.08501006133800607,0.3134505738999941,0.0,0.294792586054722,0.294792586054722,0.3212680607332789,0
69
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.14,0.3224620858016468,0.08389328832417228,0.3130539552486071,0.0,0.12268314210061783,0.12268314210061783,0.3207634371908326,0
70
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.16,0.32354623636120206,0.08389983318570625,0.3135227768775155,0.0,0.12268314210061783,0.12268314210061783,0.32184187671977993,0
71
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.18,0.3227464993995023,0.08237511984991769,0.31232054358058636,0.0,0.10326566637246248,0.10326566637246248,0.3213128743065604,0
72
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.20,0.3213479416591043,0.08021470447158471,0.31080818824701156,0.0,0.08473080317740513,0.08473080317740513,0.3201747222670157,0
73
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.22,0.31939727082775615,0.08027275774782588,0.308833370619165,0.0,0.10061782877316858,0.10061782877316858,0.3180145628216866,0
74
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.24,0.3188662188138966,0.07877965659256216,0.30709225454106126,0.0,0.0820829655781112,0.0820829655781112,0.31773816351917916,0
75
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.26,0.31805084189335,0.07777595035895293,0.30653839142082673,0.0,0.07325684024713151,0.07325684024713151,0.31704583595098484,0
76
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.28,0.31564132115319793,0.07471248687074669,0.30482678714954914,0.0,0.05736981465136805,0.05736981465136805,0.3148590790692337,0
77
+ shenzhi-wang/Mistral-7B-v0.3-Chinese-Chat,1.30,0.31448483374273595,0.07484673889486904,0.3033930752633869,0.0,0.06796116504854369,0.06796116504854369,0.3135624797736094,0