Attila1011 committed (verified)
Commit 1aa6f21 · 1 Parent(s): a468ff2

Upload folder using huggingface_hub

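For context, the commit message refers to the huggingface_hub upload flow. A minimal sketch of how a checkpoint folder like the one added here can be pushed with HfApi.upload_folder (the repo id is a placeholder, not a value recorded in this commit):

# Minimal sketch, assuming the folder was pushed with huggingface_hub's upload_folder.
# The repo_id is a placeholder; authentication is assumed via `huggingface-cli login` or HF_TOKEN.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="checkpoints-v2.1/checkpoint-12288",
    path_in_repo="checkpoints-v2.1/checkpoint-12288",
    repo_id="Attila1011/example-repo",  # hypothetical repo id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)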
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 checkpoints/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
 checkpoints-v2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
+ checkpoints-v2.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
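The new .gitattributes rule tells Git LFS to store the added eval_state.json as a pointer stub rather than a full blob, so the hunks below contain only version / oid / size lines while the payloads live in LFS storage. A minimal sketch for inspecting one of these pointer stubs in a local checkout (path taken from the diff; if the file has already been smudged by `git lfs pull`, the real content is printed instead):

# Minimal sketch: print the LFS pointer stub for the newly tracked eval_state.json.
from pathlib import Path

pointer = Path("checkpoints-v2.1/checkpoint-12288/eval_state.json")
print(pointer.read_text())
# Expected pointer shape (see the hunk below):
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:...
#   size 44108941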
checkpoints-v2.1/checkpoint-12288/eval_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05b3bca630c7d08caf62d59aa41ee15d1aad0e5b971befe95d42644b1526bea9
+ size 44108941
checkpoints-v2.1/checkpoint-12288/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f679c8e0b39bf014779cac799fd1c51930d771c35519b8824a50844c0bdcb78
+ size 37402680
checkpoints-v2.1/checkpoint-12288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6adc558ecff7a93c46301e5c792542680f0ae1cf6478199b736271df1782ce20
+ size 512267
checkpoints-v2.1/checkpoint-12288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f36e6c28b77555b6da6de84681647b558ac8ebc553a1b458e45112e416a213c
+ size 14645
checkpoints-v2.1/checkpoint-12288/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:904fc891c4da67fa7d4b7db7eba41d622dec830fc89cf51c20e856ab78c9fc76
+ size 1383
checkpoints-v2.1/checkpoint-12288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87b2b9d7efb9af569cd6999393703c76b185862a0221b931604de9e5bf2b79a6
+ size 1465
checkpoints-v2.1/checkpoint-12288/trainer_state.json ADDED
@@ -0,0 +1,790 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.5675488430095608,
+ "eval_steps": 1024,
+ "global_step": 12288,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.011823934229365849,
+ "grad_norm": 1.1338618993759155,
+ "learning_rate": 1.9615384615384617e-05,
+ "loss": 10.4459,
+ "step": 256
+ },
+ {
+ "epoch": 0.023647868458731697,
+ "grad_norm": 1.0851895809173584,
+ "learning_rate": 3.930769230769231e-05,
+ "loss": 7.9458,
+ "step": 512
+ },
+ {
+ "epoch": 0.03547180268809755,
+ "grad_norm": 0.9216171503067017,
+ "learning_rate": 4.999617095521894e-05,
+ "loss": 5.6401,
+ "step": 768
+ },
+ {
+ "epoch": 0.047295736917463395,
+ "grad_norm": 0.5787180066108704,
+ "learning_rate": 4.9961092368776736e-05,
+ "loss": 3.8256,
+ "step": 1024
+ },
+ {
+ "epoch": 0.047295736917463395,
+ "eval_acr_loss": 0.9939580324305791,
+ "eval_across_var": 0.003025606172010473,
+ "eval_bleu": 0.5515564027779047,
+ "eval_ce_loss": 2.3591716888288383,
+ "eval_cos_loss": 0.9272929947125857,
+ "eval_cov": 0.0706253400131992,
+ "eval_cov_loss": 0.00802828647306861,
+ "eval_global_var": 0.2767718589469178,
+ "eval_loss": 2.832596059803549,
+ "eval_mse_loss": 1.9087850182023767,
+ "eval_per_var": 0.2680246013484589,
+ "eval_within_var": 0.27379363795665845,
+ "step": 1024
+ },
+ {
+ "epoch": 0.047295736917463395,
+ "eval_acr_loss": 0.9939580324305791,
+ "eval_across_var": 0.003025606172010473,
+ "eval_bleu": 0.5515564027779047,
+ "eval_ce_loss": 2.3591716888288383,
+ "eval_cos_loss": 0.9272929947125857,
+ "eval_cov": 0.0706253400131992,
+ "eval_cov_loss": 0.00802828647306861,
+ "eval_global_var": 0.2767718589469178,
+ "eval_loss": 2.832596059803549,
+ "eval_mse_loss": 1.9087850182023767,
+ "eval_per_var": 0.2680246013484589,
+ "eval_runtime": 159.2542,
+ "eval_samples_per_second": 175.776,
+ "eval_steps_per_second": 2.75,
+ "eval_within_var": 0.27379363795665845,
+ "step": 1024
+ },
+ {
+ "epoch": 0.05911967114682925,
+ "grad_norm": 0.38924261927604675,
+ "learning_rate": 4.988941132556799e-05,
+ "loss": 2.7681,
+ "step": 1280
+ },
+ {
+ "epoch": 0.0709436053761951,
+ "grad_norm": 0.3134535551071167,
+ "learning_rate": 4.9781232937269974e-05,
+ "loss": 2.1522,
+ "step": 1536
+ },
+ {
+ "epoch": 0.08276753960556095,
+ "grad_norm": 0.2510242760181427,
+ "learning_rate": 4.963671583455164e-05,
+ "loss": 1.7487,
+ "step": 1792
+ },
+ {
+ "epoch": 0.09459147383492679,
+ "grad_norm": 0.22504042088985443,
+ "learning_rate": 4.945607193446079e-05,
+ "loss": 1.4694,
+ "step": 2048
+ },
+ {
+ "epoch": 0.09459147383492679,
+ "eval_acr_loss": 0.9901781947645423,
+ "eval_across_var": 0.004923156680556261,
+ "eval_bleu": 0.8030880203505176,
+ "eval_ce_loss": 0.700762519825539,
+ "eval_cos_loss": 0.7673689819634233,
+ "eval_cov": 0.07080983897866723,
+ "eval_cov_loss": 0.008125514746749917,
+ "eval_global_var": 0.4061385202625571,
+ "eval_loss": 1.1126885499856243,
+ "eval_mse_loss": 1.6458245439616512,
+ "eval_per_var": 0.3933824513056507,
+ "eval_within_var": 0.4013502570592105,
+ "step": 2048
+ },
+ {
+ "epoch": 0.09459147383492679,
+ "eval_acr_loss": 0.9901781947645423,
+ "eval_across_var": 0.004923156680556261,
+ "eval_bleu": 0.8030880203505176,
+ "eval_ce_loss": 0.700762519825539,
+ "eval_cos_loss": 0.7673689819634233,
+ "eval_cov": 0.07080983897866723,
+ "eval_cov_loss": 0.008125514746749917,
+ "eval_global_var": 0.4061385202625571,
+ "eval_loss": 1.1126885499856243,
+ "eval_mse_loss": 1.6458245439616512,
+ "eval_per_var": 0.3933824513056507,
+ "eval_runtime": 155.1424,
+ "eval_samples_per_second": 180.434,
+ "eval_steps_per_second": 2.823,
+ "eval_within_var": 0.4013502570592105,
+ "step": 2048
+ },
+ {
+ "epoch": 0.10641540806429264,
+ "grad_norm": 0.18675386905670166,
+ "learning_rate": 4.923956612967301e-05,
+ "loss": 1.2664,
+ "step": 2304
+ },
+ {
+ "epoch": 0.1182393422936585,
+ "grad_norm": 0.18214967846870422,
+ "learning_rate": 4.898751590005826e-05,
+ "loss": 1.1058,
+ "step": 2560
+ },
+ {
+ "epoch": 0.13006327652302435,
+ "grad_norm": 0.15246237814426422,
+ "learning_rate": 4.870029084713462e-05,
+ "loss": 0.981,
+ "step": 2816
+ },
+ {
+ "epoch": 0.1418872107523902,
+ "grad_norm": 0.1368647962808609,
+ "learning_rate": 4.837831215209188e-05,
+ "loss": 0.8816,
+ "step": 3072
+ },
+ {
+ "epoch": 0.1418872107523902,
+ "eval_acr_loss": 0.9858010461613468,
+ "eval_across_var": 0.007125071276481089,
+ "eval_bleu": 0.8956566730262217,
+ "eval_ce_loss": 0.32717657133460587,
+ "eval_cos_loss": 0.6143105866974348,
+ "eval_cov": 0.06988288496182934,
+ "eval_cov_loss": 0.007913092302252032,
+ "eval_global_var": 0.5146751926369864,
+ "eval_loss": 0.6788079935938256,
+ "eval_mse_loss": 1.3672495941593223,
+ "eval_per_var": 0.4983278039383562,
+ "eval_within_var": 0.5077701699516001,
+ "step": 3072
+ },
+ {
+ "epoch": 0.1418872107523902,
+ "eval_acr_loss": 0.9858010461613468,
+ "eval_across_var": 0.007125071276481089,
+ "eval_bleu": 0.8956566730262217,
+ "eval_ce_loss": 0.32717657133460587,
+ "eval_cos_loss": 0.6143105866974348,
+ "eval_cov": 0.06988288496182934,
+ "eval_cov_loss": 0.007913092302252032,
+ "eval_global_var": 0.5146751926369864,
+ "eval_loss": 0.6788079935938256,
+ "eval_mse_loss": 1.3672495941593223,
+ "eval_per_var": 0.4983278039383562,
+ "eval_runtime": 156.011,
+ "eval_samples_per_second": 179.43,
+ "eval_steps_per_second": 2.807,
+ "eval_within_var": 0.5077701699516001,
+ "step": 3072
+ },
+ {
+ "epoch": 0.15371114498175603,
+ "grad_norm": 0.13020840287208557,
+ "learning_rate": 4.802205195817963e-05,
+ "loss": 0.8019,
+ "step": 3328
+ },
+ {
+ "epoch": 0.1655350792111219,
+ "grad_norm": 0.12300444394350052,
+ "learning_rate": 4.763203267836576e-05,
+ "loss": 0.7339,
+ "step": 3584
+ },
+ {
+ "epoch": 0.17735901344048774,
+ "grad_norm": 0.10956571996212006,
+ "learning_rate": 4.720882622928019e-05,
+ "loss": 0.6774,
+ "step": 3840
+ },
+ {
+ "epoch": 0.18918294766985358,
+ "grad_norm": 0.11182258278131485,
+ "learning_rate": 4.675305319256765e-05,
+ "loss": 0.6307,
+ "step": 4096
+ },
+ {
+ "epoch": 0.18918294766985358,
+ "eval_acr_loss": 0.9774447257660296,
+ "eval_across_var": 0.011342588644134536,
+ "eval_bleu": 0.9352497637682005,
+ "eval_ce_loss": 0.18979091641225226,
+ "eval_cos_loss": 0.49574217472446563,
+ "eval_cov": 0.06955429843571632,
+ "eval_cov_loss": 0.007831381105490403,
+ "eval_global_var": 0.6115789544092466,
+ "eval_loss": 0.49388179734145127,
+ "eval_mse_loss": 1.144643609110079,
+ "eval_per_var": 0.592599529109589,
+ "eval_within_var": 0.600530758568141,
+ "step": 4096
+ },
+ {
+ "epoch": 0.18918294766985358,
+ "eval_acr_loss": 0.9774447257660296,
+ "eval_across_var": 0.011342588644134536,
+ "eval_bleu": 0.9352497637682005,
+ "eval_ce_loss": 0.18979091641225226,
+ "eval_cos_loss": 0.49574217472446563,
+ "eval_cov": 0.06955429843571632,
+ "eval_cov_loss": 0.007831381105490403,
+ "eval_global_var": 0.6115789544092466,
+ "eval_loss": 0.49388179734145127,
+ "eval_mse_loss": 1.144643609110079,
+ "eval_per_var": 0.592599529109589,
+ "eval_runtime": 155.2844,
+ "eval_samples_per_second": 180.269,
+ "eval_steps_per_second": 2.821,
+ "eval_within_var": 0.600530758568141,
+ "step": 4096
+ },
+ {
+ "epoch": 0.20100688189921945,
+ "grad_norm": 0.1080719456076622,
+ "learning_rate": 4.6265381904878854e-05,
+ "loss": 0.588,
+ "step": 4352
+ },
+ {
+ "epoch": 0.2128308161285853,
+ "grad_norm": 0.10819243639707565,
+ "learning_rate": 4.57465274778347e-05,
+ "loss": 0.5554,
+ "step": 4608
+ },
+ {
+ "epoch": 0.22465475035795113,
+ "grad_norm": 0.1115206629037857,
+ "learning_rate": 4.519725074940068e-05,
+ "loss": 0.5198,
+ "step": 4864
+ },
+ {
+ "epoch": 0.236478684587317,
+ "grad_norm": 0.1552964597940445,
+ "learning_rate": 4.461835716820895e-05,
+ "loss": 0.473,
+ "step": 5120
+ },
+ {
+ "epoch": 0.236478684587317,
+ "eval_acr_loss": 0.11126784305088222,
+ "eval_across_var": 0.680014660641483,
+ "eval_bleu": 0.9523306149700821,
+ "eval_ce_loss": 0.12964293614165967,
+ "eval_cos_loss": 0.4240787292588247,
+ "eval_cov": 0.07667060747538527,
+ "eval_cov_loss": 0.010102802944969232,
+ "eval_global_var": 1.6775805329623288,
+ "eval_loss": 0.31928212995126365,
+ "eval_mse_loss": 1.0156728980475909,
+ "eval_per_var": 1.6460250784817352,
+ "eval_within_var": 1.0011268127454471,
+ "step": 5120
+ },
+ {
+ "epoch": 0.236478684587317,
+ "eval_acr_loss": 0.11126784305088222,
+ "eval_across_var": 0.680014660641483,
+ "eval_bleu": 0.9523306149700821,
+ "eval_ce_loss": 0.12964293614165967,
+ "eval_cos_loss": 0.4240787292588247,
+ "eval_cov": 0.07667060747538527,
+ "eval_cov_loss": 0.010102802944969232,
+ "eval_global_var": 1.6775805329623288,
+ "eval_loss": 0.31928212995126365,
+ "eval_mse_loss": 1.0156728980475909,
+ "eval_per_var": 1.6460250784817352,
+ "eval_runtime": 154.2182,
+ "eval_samples_per_second": 181.516,
+ "eval_steps_per_second": 2.84,
+ "eval_within_var": 1.0011268127454471,
+ "step": 5120
+ },
+ {
+ "epoch": 0.24830261881668284,
+ "grad_norm": 0.11613737791776657,
+ "learning_rate": 4.401069561246422e-05,
+ "loss": 0.3958,
+ "step": 5376
+ },
+ {
+ "epoch": 0.2601265530460487,
+ "grad_norm": 0.11101594567298889,
+ "learning_rate": 4.337515714516545e-05,
+ "loss": 0.3648,
+ "step": 5632
+ },
+ {
+ "epoch": 0.27195048727541454,
+ "grad_norm": 0.14844343066215515,
+ "learning_rate": 4.2712673707468434e-05,
+ "loss": 0.3464,
+ "step": 5888
+ },
+ {
+ "epoch": 0.2837744215047804,
+ "grad_norm": 0.10389428585767746,
+ "learning_rate": 4.202421675210565e-05,
+ "loss": 0.3281,
+ "step": 6144
+ },
+ {
+ "epoch": 0.2837744215047804,
+ "eval_acr_loss": 0.015462962337612025,
+ "eval_across_var": 0.9652468334866441,
+ "eval_bleu": 0.9665450842704094,
+ "eval_ce_loss": 0.08923274265882904,
+ "eval_cos_loss": 0.35504293285276245,
+ "eval_cov": 0.06710940844392123,
+ "eval_cov_loss": 0.007327720047650884,
+ "eval_global_var": 2.2822310216894977,
+ "eval_loss": 0.24187510450408883,
+ "eval_mse_loss": 0.8843358904803724,
+ "eval_per_var": 2.3488914454908674,
+ "eval_within_var": 1.3261088448572376,
+ "step": 6144
+ },
+ {
+ "epoch": 0.2837744215047804,
+ "eval_acr_loss": 0.015462962337612025,
+ "eval_across_var": 0.9652468334866441,
+ "eval_bleu": 0.9665450842704094,
+ "eval_ce_loss": 0.08923274265882904,
+ "eval_cos_loss": 0.35504293285276245,
+ "eval_cov": 0.06710940844392123,
+ "eval_cov_loss": 0.007327720047650884,
+ "eval_global_var": 2.2822310216894977,
+ "eval_loss": 0.24187510450408883,
+ "eval_mse_loss": 0.8843358904803724,
+ "eval_per_var": 2.3488914454908674,
+ "eval_runtime": 155.0579,
+ "eval_samples_per_second": 180.533,
+ "eval_steps_per_second": 2.825,
+ "eval_within_var": 1.3261088448572376,
+ "step": 6144
+ },
+ {
+ "epoch": 0.2955983557341462,
+ "grad_norm": 0.129238098859787,
+ "learning_rate": 4.131079581886694e-05,
+ "loss": 0.3099,
+ "step": 6400
+ },
+ {
+ "epoch": 0.30742228996351206,
+ "grad_norm": 0.1061507984995842,
+ "learning_rate": 4.057345705423016e-05,
+ "loss": 0.2963,
+ "step": 6656
+ },
+ {
+ "epoch": 0.3192462241928779,
+ "grad_norm": 0.10803277790546417,
+ "learning_rate": 3.981328167731251e-05,
+ "loss": 0.2854,
+ "step": 6912
+ },
+ {
+ "epoch": 0.3310701584222438,
+ "grad_norm": 0.10297808796167374,
+ "learning_rate": 3.9031384394391954e-05,
+ "loss": 0.2709,
+ "step": 7168
+ },
+ {
+ "epoch": 0.3310701584222438,
+ "eval_acr_loss": 0.014716924630300589,
+ "eval_across_var": 0.9652062489834006,
+ "eval_bleu": 0.9750850142716162,
+ "eval_ce_loss": 0.0659905160653945,
+ "eval_cos_loss": 0.31133458848413265,
+ "eval_cov": 0.06627595805686358,
+ "eval_cov_loss": 0.007171058822125537,
+ "eval_global_var": 2.3926940639269407,
+ "eval_loss": 0.20149783922793113,
+ "eval_mse_loss": 0.8055339337211765,
+ "eval_per_var": 2.513992936643836,
+ "eval_within_var": 1.4371973874906427,
+ "step": 7168
+ },
+ {
+ "epoch": 0.3310701584222438,
+ "eval_acr_loss": 0.014716924630300589,
+ "eval_across_var": 0.9652062489834006,
+ "eval_bleu": 0.9750850142716162,
+ "eval_ce_loss": 0.0659905160653945,
+ "eval_cos_loss": 0.31133458848413265,
+ "eval_cov": 0.06627595805686358,
+ "eval_cov_loss": 0.007171058822125537,
+ "eval_global_var": 2.3926940639269407,
+ "eval_loss": 0.20149783922793113,
+ "eval_mse_loss": 0.8055339337211765,
+ "eval_per_var": 2.513992936643836,
+ "eval_runtime": 152.9262,
+ "eval_samples_per_second": 183.049,
+ "eval_steps_per_second": 2.864,
+ "eval_within_var": 1.4371973874906427,
+ "step": 7168
+ },
+ {
+ "epoch": 0.34289409265160964,
+ "grad_norm": 0.08976765722036362,
+ "learning_rate": 3.822891176432382e-05,
+ "loss": 0.2629,
+ "step": 7424
+ },
+ {
+ "epoch": 0.3547180268809755,
+ "grad_norm": 0.25151142477989197,
+ "learning_rate": 3.7407040517249335e-05,
+ "loss": 0.2533,
+ "step": 7680
+ },
+ {
+ "epoch": 0.3665419611103413,
+ "grad_norm": 0.09347163140773773,
+ "learning_rate": 3.6566975829061614e-05,
+ "loss": 0.2437,
+ "step": 7936
+ },
+ {
+ "epoch": 0.37836589533970716,
+ "grad_norm": 0.15004394948482513,
+ "learning_rate": 3.5709949554159355e-05,
+ "loss": 0.2348,
+ "step": 8192
+ },
+ {
+ "epoch": 0.37836589533970716,
+ "eval_acr_loss": 0.014583685287022457,
+ "eval_across_var": 0.9538344581649728,
+ "eval_bleu": 0.980561100967318,
+ "eval_ce_loss": 0.05129089445454073,
+ "eval_cos_loss": 0.28137742255104187,
+ "eval_cov": 0.06562282614511987,
+ "eval_cov_loss": 0.007066378377369482,
+ "eval_global_var": 2.479759738869863,
+ "eval_loss": 0.17527394500225102,
+ "eval_mse_loss": 0.7551626324925793,
+ "eval_per_var": 2.6612933433219177,
+ "eval_within_var": 1.53611661745533,
+ "step": 8192
+ },
+ {
+ "epoch": 0.37836589533970716,
+ "eval_acr_loss": 0.014583685287022457,
+ "eval_across_var": 0.9538344581649728,
+ "eval_bleu": 0.980561100967318,
+ "eval_ce_loss": 0.05129089445454073,
+ "eval_cos_loss": 0.28137742255104187,
+ "eval_cov": 0.06562282614511987,
+ "eval_cov_loss": 0.007066378377369482,
+ "eval_global_var": 2.479759738869863,
+ "eval_loss": 0.17527394500225102,
+ "eval_mse_loss": 0.7551626324925793,
+ "eval_per_var": 2.6612933433219177,
+ "eval_runtime": 151.3473,
+ "eval_samples_per_second": 184.959,
+ "eval_steps_per_second": 2.894,
+ "eval_within_var": 1.53611661745533,
+ "step": 8192
+ },
+ {
+ "epoch": 0.390189829569073,
+ "grad_norm": 0.09314695745706558,
+ "learning_rate": 3.483721841907964e-05,
+ "loss": 0.2288,
+ "step": 8448
+ },
+ {
+ "epoch": 0.4020137637984389,
+ "grad_norm": 0.09459128230810165,
+ "learning_rate": 3.395006217965885e-05,
+ "loss": 0.2225,
+ "step": 8704
+ },
+ {
+ "epoch": 0.41383769802780473,
+ "grad_norm": 0.1038607731461525,
+ "learning_rate": 3.3049781744423665e-05,
+ "loss": 0.215,
+ "step": 8960
+ },
+ {
+ "epoch": 0.4256616322571706,
+ "grad_norm": 0.06964848935604095,
+ "learning_rate": 3.213769726696439e-05,
+ "loss": 0.2103,
+ "step": 9216
+ },
+ {
+ "epoch": 0.4256616322571706,
+ "eval_acr_loss": 0.01336662589730249,
+ "eval_across_var": 0.987674045780478,
+ "eval_bleu": 0.9840502134478613,
+ "eval_ce_loss": 0.0413797390200708,
+ "eval_cos_loss": 0.2597663049080056,
+ "eval_cov": 0.0652730584688927,
+ "eval_cov_loss": 0.007007736591494655,
+ "eval_global_var": 2.6066415168378994,
+ "eval_loss": 0.15707423451216254,
+ "eval_mse_loss": 0.7215510081482804,
+ "eval_per_var": 2.776688249143836,
+ "eval_within_var": 1.6285416399507218,
+ "step": 9216
+ },
+ {
+ "epoch": 0.4256616322571706,
+ "eval_acr_loss": 0.01336662589730249,
+ "eval_across_var": 0.987674045780478,
+ "eval_bleu": 0.9840502134478613,
+ "eval_ce_loss": 0.0413797390200708,
+ "eval_cos_loss": 0.2597663049080056,
+ "eval_cov": 0.0652730584688927,
+ "eval_cov_loss": 0.007007736591494655,
+ "eval_global_var": 2.6066415168378994,
+ "eval_loss": 0.15707423451216254,
+ "eval_mse_loss": 0.7215510081482804,
+ "eval_per_var": 2.776688249143836,
+ "eval_runtime": 151.5839,
+ "eval_samples_per_second": 184.67,
+ "eval_steps_per_second": 2.889,
+ "eval_within_var": 1.6285416399507218,
+ "step": 9216
+ },
+ {
+ "epoch": 0.4374855664865364,
+ "grad_norm": 0.07443105429410934,
+ "learning_rate": 3.121514621008757e-05,
+ "loss": 0.2053,
+ "step": 9472
+ },
+ {
+ "epoch": 0.44930950071590225,
+ "grad_norm": 0.08320944011211395,
+ "learning_rate": 3.0283481384586697e-05,
+ "loss": 0.2017,
+ "step": 9728
+ },
+ {
+ "epoch": 0.4611334349452681,
+ "grad_norm": 0.1169746071100235,
+ "learning_rate": 2.9344068965507027e-05,
+ "loss": 0.1966,
+ "step": 9984
+ },
+ {
+ "epoch": 0.472957369174634,
+ "grad_norm": 0.08953411877155304,
+ "learning_rate": 2.840199155190943e-05,
+ "loss": 0.1938,
+ "step": 10240
+ },
+ {
+ "epoch": 0.472957369174634,
+ "eval_acr_loss": 0.012974254141045301,
+ "eval_across_var": 0.9815338524781405,
+ "eval_bleu": 0.9866441773938769,
+ "eval_ce_loss": 0.034374653984745755,
+ "eval_cos_loss": 0.24375769958648508,
+ "eval_cov": 0.06485466107930223,
+ "eval_cov_loss": 0.006919686747374668,
+ "eval_global_var": 2.6645574700342465,
+ "eval_loss": 0.1440824061359989,
+ "eval_mse_loss": 0.6987405730981261,
+ "eval_per_var": 2.866460652111872,
+ "eval_within_var": 1.6929740287941886,
+ "step": 10240
+ },
+ {
+ "epoch": 0.472957369174634,
+ "eval_acr_loss": 0.012974254141045301,
+ "eval_across_var": 0.9815338524781405,
+ "eval_bleu": 0.9866441773938769,
+ "eval_ce_loss": 0.034374653984745755,
+ "eval_cos_loss": 0.24375769958648508,
+ "eval_cov": 0.06485466107930223,
+ "eval_cov_loss": 0.006919686747374668,
+ "eval_global_var": 2.6645574700342465,
+ "eval_loss": 0.1440824061359989,
+ "eval_mse_loss": 0.6987405730981261,
+ "eval_per_var": 2.866460652111872,
+ "eval_runtime": 151.0941,
+ "eval_samples_per_second": 185.269,
+ "eval_steps_per_second": 2.899,
+ "eval_within_var": 1.6929740287941886,
+ "step": 10240
+ },
+ {
+ "epoch": 0.48478130340399983,
+ "grad_norm": 0.08886408805847168,
+ "learning_rate": 2.745124265175868e-05,
+ "loss": 0.1892,
+ "step": 10496
+ },
+ {
+ "epoch": 0.49660523763336567,
+ "grad_norm": 0.11508477479219437,
+ "learning_rate": 2.6496899297412598e-05,
+ "loss": 0.1853,
+ "step": 10752
+ },
+ {
+ "epoch": 0.5084291718627315,
+ "grad_norm": 0.0753609761595726,
+ "learning_rate": 2.554036091926675e-05,
+ "loss": 0.1839,
+ "step": 11008
+ },
+ {
+ "epoch": 0.5202531060920974,
+ "grad_norm": 0.09823817759752274,
+ "learning_rate": 2.4583030166456618e-05,
+ "loss": 0.18,
+ "step": 11264
+ },
+ {
+ "epoch": 0.5202531060920974,
+ "eval_acr_loss": 0.013261525430046982,
+ "eval_across_var": 1.0205278622505327,
+ "eval_bleu": 0.9883516965111658,
+ "eval_ce_loss": 0.029720396500880316,
+ "eval_cos_loss": 0.23187202939840212,
+ "eval_cov": 0.06469566310377425,
+ "eval_cov_loss": 0.006903172251619569,
+ "eval_global_var": 2.7817003781392695,
+ "eval_loss": 0.13512653335248498,
+ "eval_mse_loss": 0.6834642570040542,
+ "eval_per_var": 3.0010434503424657,
+ "eval_within_var": 1.7715753001165173,
+ "step": 11264
+ },
+ {
+ "epoch": 0.5202531060920974,
+ "eval_acr_loss": 0.013261525430046982,
+ "eval_across_var": 1.0205278622505327,
+ "eval_bleu": 0.9883516965111658,
+ "eval_ce_loss": 0.029720396500880316,
+ "eval_cos_loss": 0.23187202939840212,
+ "eval_cov": 0.06469566310377425,
+ "eval_cov_loss": 0.006903172251619569,
+ "eval_global_var": 2.7817003781392695,
+ "eval_loss": 0.13512653335248498,
+ "eval_mse_loss": 0.6834642570040542,
+ "eval_per_var": 3.0010434503424657,
+ "eval_runtime": 151.0663,
+ "eval_samples_per_second": 185.303,
+ "eval_steps_per_second": 2.899,
+ "eval_within_var": 1.7715753001165173,
+ "step": 11264
+ },
+ {
+ "epoch": 0.5320770403214632,
+ "grad_norm": 0.09726043790578842,
+ "learning_rate": 2.3626310850040373e-05,
+ "loss": 0.1772,
+ "step": 11520
+ },
+ {
+ "epoch": 0.5439009745508291,
+ "grad_norm": 0.0948578342795372,
+ "learning_rate": 2.2671605884477816e-05,
+ "loss": 0.1754,
+ "step": 11776
+ },
+ {
+ "epoch": 0.5557249087801949,
+ "grad_norm": 0.08570394665002823,
+ "learning_rate": 2.1720315230424133e-05,
+ "loss": 0.1733,
+ "step": 12032
+ },
+ {
+ "epoch": 0.5675488430095608,
+ "grad_norm": 0.10034994781017303,
+ "learning_rate": 2.0777519879097458e-05,
+ "loss": 0.1722,
+ "step": 12288
+ },
+ {
+ "epoch": 0.5675488430095608,
+ "eval_acr_loss": 0.01237716493560104,
+ "eval_across_var": 0.997331017772901,
+ "eval_bleu": 0.9896437649984566,
+ "eval_ce_loss": 0.026328031107630222,
+ "eval_cos_loss": 0.22285830515296493,
+ "eval_cov": 0.064281376529502,
+ "eval_cov_loss": 0.006827230591828761,
+ "eval_global_var": 2.807202482876712,
+ "eval_loss": 0.12841411385702217,
+ "eval_mse_loss": 0.6729901853489549,
+ "eval_per_var": 3.002407962328767,
+ "eval_within_var": 1.8194298801356799,
+ "step": 12288
+ },
+ {
+ "epoch": 0.5675488430095608,
+ "eval_acr_loss": 0.01237716493560104,
+ "eval_across_var": 0.997331017772901,
+ "eval_bleu": 0.9896437649984566,
+ "eval_ce_loss": 0.026328031107630222,
+ "eval_cos_loss": 0.22285830515296493,
+ "eval_cov": 0.064281376529502,
+ "eval_cov_loss": 0.006827230591828761,
+ "eval_global_var": 2.807202482876712,
+ "eval_loss": 0.12841411385702217,
+ "eval_mse_loss": 0.6729901853489549,
+ "eval_per_var": 3.002407962328767,
+ "eval_runtime": 149.5452,
+ "eval_samples_per_second": 187.188,
+ "eval_steps_per_second": 2.929,
+ "eval_within_var": 1.8194298801356799,
+ "step": 12288
+ }
+ ],
+ "logging_steps": 256,
+ "max_steps": 21651,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 1024,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 64,
+ "trial_name": null,
+ "trial_params": null
+ }
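The trainer_state.json added above records the training log up to global step 12288. A minimal sketch for extracting the eval_loss trajectory from it (the path assumes the repo layout shown in this commit, checked out locally):

# Minimal sketch: list the eval_loss values logged in this checkpoint's trainer_state.json.
import json

with open("checkpoints-v2.1/checkpoint-12288/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training entries (loss, learning_rate) and eval entries (eval_*).
evals = [(entry["step"], entry["eval_loss"]) for entry in state["log_history"] if "eval_loss" in entry]
for step, loss in evals:
    print(f"step {step:>6}: eval_loss {loss:.4f}")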
checkpoints-v2.1/checkpoint-12288/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c810769b954d6649ec9ac0f62a330cbf05d281180d4c04842e778e387b6a864
+ size 5777