agentlans commited on
Commit
ec60add
1 Parent(s): 3b42509

Upload 8 files

Browse files
README.md CHANGED
@@ -4,19 +4,19 @@ base_model: agentlans/multilingual-e5-small-aligned
4
  tags:
5
  - generated_from_trainer
6
  model-index:
7
- - name: multilingual-e5-small-aligned-sentiment
8
  results: []
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
  should probably proofread and complete it, then remove this comment. -->
13
 
14
- # multilingual-e5-small-aligned-sentiment
15
 
16
  This model is a fine-tuned version of [agentlans/multilingual-e5-small-aligned](https://huggingface.co/agentlans/multilingual-e5-small-aligned) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 0.2188
19
- - Mse: 0.2188
20
 
21
  ## Model description
22
 
@@ -36,7 +36,7 @@ More information needed
36
 
37
  The following hyperparameters were used during training:
38
  - learning_rate: 5e-05
39
- - train_batch_size: 64
40
  - eval_batch_size: 8
41
  - seed: 42
42
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
@@ -45,11 +45,11 @@ The following hyperparameters were used during training:
45
 
46
  ### Training results
47
 
48
- | Training Loss | Epoch | Step | Validation Loss | Mse |
49
- |:-------------:|:-----:|:-----:|:---------------:|:------:|
50
- | 0.2635 | 1.0 | 13548 | 0.2526 | 0.2526 |
51
- | 0.1944 | 2.0 | 27096 | 0.2277 | 0.2277 |
52
- | 0.1489 | 3.0 | 40644 | 0.2188 | 0.2188 |
53
 
54
 
55
  ### Framework versions
 
4
  tags:
5
  - generated_from_trainer
6
  model-index:
7
+ - name: multilingual-e5-small-aligned-transformed-sentiment
8
  results: []
9
  ---
10
 
11
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
12
  should probably proofread and complete it, then remove this comment. -->
13
 
14
+ # multilingual-e5-small-aligned-transformed-sentiment
15
 
16
  This model is a fine-tuned version of [agentlans/multilingual-e5-small-aligned](https://huggingface.co/agentlans/multilingual-e5-small-aligned) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
+ - Loss: 0.2082
19
+ - Mse: 0.2082
20
 
21
  ## Model description
22
 
 
36
 
37
  The following hyperparameters were used during training:
38
  - learning_rate: 5e-05
39
+ - train_batch_size: 32
40
  - eval_batch_size: 8
41
  - seed: 42
42
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 
45
 
46
  ### Training results
47
 
48
+ | Training Loss | Epoch | Step | Validation Loss | Mse |
49
+ |:-------------:|:-----:|:------:|:---------------:|:------:|
50
+ | 0.1898 | 1.0 | 54191 | 0.2322 | 0.2322 |
51
+ | 0.1186 | 2.0 | 108382 | 0.2139 | 0.2139 |
52
+ | 0.0861 | 3.0 | 162573 | 0.2082 | 0.2082 |
53
 
54
 
55
  ### Framework versions
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.21882201731204987,
4
- "eval_mse": 0.21882200171115507,
5
- "eval_runtime": 50.6658,
6
- "eval_samples": 96338,
7
- "eval_samples_per_second": 1901.439,
8
- "eval_steps_per_second": 237.695,
9
- "total_flos": 4.283504864539085e+16,
10
- "train_loss": 0.21721052708159796,
11
- "train_runtime": 3074.1411,
12
- "train_samples": 867042,
13
- "train_samples_per_second": 846.131,
14
- "train_steps_per_second": 13.221
15
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.20824576914310455,
4
+ "eval_mse": 0.20824573578672098,
5
+ "eval_runtime": 118.1347,
6
+ "eval_samples": 192676,
7
+ "eval_samples_per_second": 1630.985,
8
+ "eval_steps_per_second": 203.877,
9
+ "total_flos": 8.56700972907817e+16,
10
+ "train_loss": 0.16141534491764947,
11
+ "train_runtime": 8977.4486,
12
+ "train_samples": 1734084,
13
+ "train_samples_per_second": 579.48,
14
+ "train_steps_per_second": 18.109
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.21882201731204987,
4
- "eval_mse": 0.21882200171115507,
5
- "eval_runtime": 50.6658,
6
- "eval_samples": 96338,
7
- "eval_samples_per_second": 1901.439,
8
- "eval_steps_per_second": 237.695
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.20824576914310455,
4
+ "eval_mse": 0.20824573578672098,
5
+ "eval_runtime": 118.1347,
6
+ "eval_samples": 192676,
7
+ "eval_samples_per_second": 1630.985,
8
+ "eval_steps_per_second": 203.877
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5333c9df0d67cb5904f8530167efd77528f3f24bd07b181bbd3c10ec9946baeb
3
  size 470640124
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1625359b708464b43c87eea957f8f6c642c0ed136ac047d2d480e5e37858bab4
3
  size 470640124
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 4.283504864539085e+16,
4
- "train_loss": 0.21721052708159796,
5
- "train_runtime": 3074.1411,
6
- "train_samples": 867042,
7
- "train_samples_per_second": 846.131,
8
- "train_steps_per_second": 13.221
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 8.56700972907817e+16,
4
+ "train_loss": 0.16141534491764947,
5
+ "train_runtime": 8977.4486,
6
+ "train_samples": 1734084,
7
+ "train_samples_per_second": 579.48,
8
+ "train_steps_per_second": 18.109
9
  }
trainer_state.json CHANGED
@@ -1,619 +1,2327 @@
1
  {
2
- "best_metric": 0.21882201731204987,
3
- "best_model_checkpoint": "multilingual-e5-small-aligned-sentiment/checkpoint-40644",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 40644,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03690581635665781,
13
- "grad_norm": 6.3122382164001465,
14
- "learning_rate": 4.938490306072237e-05,
15
- "loss": 0.4391,
16
  "step": 500
17
  },
18
  {
19
- "epoch": 0.07381163271331562,
20
- "grad_norm": 2.3589205741882324,
21
- "learning_rate": 4.876980612144474e-05,
22
- "loss": 0.3671,
23
  "step": 1000
24
  },
25
  {
26
- "epoch": 0.11071744906997343,
27
- "grad_norm": 5.466468811035156,
28
- "learning_rate": 4.815470918216711e-05,
29
- "loss": 0.3484,
30
  "step": 1500
31
  },
32
  {
33
- "epoch": 0.14762326542663123,
34
- "grad_norm": 3.5365800857543945,
35
- "learning_rate": 4.7539612242889484e-05,
36
- "loss": 0.3363,
37
  "step": 2000
38
  },
39
  {
40
- "epoch": 0.18452908178328906,
41
- "grad_norm": 4.123874187469482,
42
- "learning_rate": 4.692451530361185e-05,
43
- "loss": 0.339,
44
  "step": 2500
45
  },
46
  {
47
- "epoch": 0.22143489813994685,
48
- "grad_norm": 3.2261433601379395,
49
- "learning_rate": 4.6309418364334224e-05,
50
- "loss": 0.3275,
51
  "step": 3000
52
  },
53
  {
54
- "epoch": 0.2583407144966047,
55
- "grad_norm": 4.198851108551025,
56
- "learning_rate": 4.5694321425056594e-05,
57
- "loss": 0.3209,
58
  "step": 3500
59
  },
60
  {
61
- "epoch": 0.29524653085326247,
62
- "grad_norm": 4.112880706787109,
63
- "learning_rate": 4.507922448577896e-05,
64
- "loss": 0.3161,
65
  "step": 4000
66
  },
67
  {
68
- "epoch": 0.33215234720992026,
69
- "grad_norm": 2.5054564476013184,
70
- "learning_rate": 4.4464127546501335e-05,
71
- "loss": 0.3075,
72
  "step": 4500
73
  },
74
  {
75
- "epoch": 0.3690581635665781,
76
- "grad_norm": 2.706176280975342,
77
- "learning_rate": 4.38490306072237e-05,
78
- "loss": 0.3003,
79
  "step": 5000
80
  },
81
  {
82
- "epoch": 0.4059639799232359,
83
- "grad_norm": 4.4395432472229,
84
- "learning_rate": 4.323393366794607e-05,
85
- "loss": 0.3049,
86
  "step": 5500
87
  },
88
  {
89
- "epoch": 0.4428697962798937,
90
- "grad_norm": 2.091357946395874,
91
- "learning_rate": 4.261883672866844e-05,
92
- "loss": 0.3025,
93
  "step": 6000
94
  },
95
  {
96
- "epoch": 0.4797756126365515,
97
- "grad_norm": 3.8511946201324463,
98
- "learning_rate": 4.200373978939081e-05,
99
- "loss": 0.2954,
100
  "step": 6500
101
  },
102
  {
103
- "epoch": 0.5166814289932093,
104
- "grad_norm": 5.480827331542969,
105
- "learning_rate": 4.138864285011318e-05,
106
- "loss": 0.2906,
107
  "step": 7000
108
  },
109
  {
110
- "epoch": 0.5535872453498671,
111
- "grad_norm": 2.709707260131836,
112
- "learning_rate": 4.077354591083555e-05,
113
- "loss": 0.2802,
114
  "step": 7500
115
  },
116
  {
117
- "epoch": 0.5904930617065249,
118
- "grad_norm": 2.0577099323272705,
119
- "learning_rate": 4.015844897155792e-05,
120
- "loss": 0.283,
121
  "step": 8000
122
  },
123
  {
124
- "epoch": 0.6273988780631827,
125
- "grad_norm": 2.7647361755371094,
126
- "learning_rate": 3.954335203228029e-05,
127
- "loss": 0.2812,
128
  "step": 8500
129
  },
130
  {
131
- "epoch": 0.6643046944198405,
132
- "grad_norm": 2.9902350902557373,
133
- "learning_rate": 3.892825509300266e-05,
134
- "loss": 0.2762,
135
  "step": 9000
136
  },
137
  {
138
- "epoch": 0.7012105107764984,
139
- "grad_norm": 2.2046756744384766,
140
- "learning_rate": 3.8313158153725024e-05,
141
- "loss": 0.2736,
142
  "step": 9500
143
  },
144
  {
145
- "epoch": 0.7381163271331562,
146
- "grad_norm": 3.422405958175659,
147
- "learning_rate": 3.76980612144474e-05,
148
- "loss": 0.2758,
149
  "step": 10000
150
  },
151
  {
152
- "epoch": 0.775022143489814,
153
- "grad_norm": 5.36775016784668,
154
- "learning_rate": 3.708296427516977e-05,
155
- "loss": 0.2695,
156
  "step": 10500
157
  },
158
  {
159
- "epoch": 0.8119279598464718,
160
- "grad_norm": 2.30532169342041,
161
- "learning_rate": 3.6467867335892135e-05,
162
- "loss": 0.2685,
163
  "step": 11000
164
  },
165
  {
166
- "epoch": 0.8488337762031296,
167
- "grad_norm": 3.8830413818359375,
168
- "learning_rate": 3.585277039661451e-05,
169
- "loss": 0.2715,
170
  "step": 11500
171
  },
172
  {
173
- "epoch": 0.8857395925597874,
174
- "grad_norm": 2.014604091644287,
175
- "learning_rate": 3.5237673457336876e-05,
176
- "loss": 0.2608,
177
  "step": 12000
178
  },
179
  {
180
- "epoch": 0.9226454089164452,
181
- "grad_norm": 2.6041476726531982,
182
- "learning_rate": 3.4622576518059246e-05,
183
- "loss": 0.2632,
184
  "step": 12500
185
  },
186
  {
187
- "epoch": 0.959551225273103,
188
- "grad_norm": 3.3516054153442383,
189
- "learning_rate": 3.400747957878162e-05,
190
- "loss": 0.2621,
191
  "step": 13000
192
  },
193
  {
194
- "epoch": 0.9964570416297609,
195
- "grad_norm": 2.724219799041748,
196
- "learning_rate": 3.3392382639503986e-05,
197
- "loss": 0.2635,
198
  "step": 13500
199
  },
200
  {
201
- "epoch": 1.0,
202
- "eval_loss": 0.2525743246078491,
203
- "eval_mse": 0.2525743009952109,
204
- "eval_runtime": 50.6282,
205
- "eval_samples_per_second": 1902.852,
206
- "eval_steps_per_second": 237.871,
207
- "step": 13548
208
- },
209
- {
210
- "epoch": 1.0333628579864187,
211
- "grad_norm": 2.4586572647094727,
212
- "learning_rate": 3.277728570022636e-05,
213
- "loss": 0.2128,
214
  "step": 14000
215
  },
216
  {
217
- "epoch": 1.0702686743430765,
218
- "grad_norm": 2.403661012649536,
219
- "learning_rate": 3.216218876094873e-05,
220
- "loss": 0.2062,
221
  "step": 14500
222
  },
223
  {
224
- "epoch": 1.1071744906997343,
225
- "grad_norm": 1.8662785291671753,
226
- "learning_rate": 3.15470918216711e-05,
227
- "loss": 0.2088,
228
  "step": 15000
229
  },
230
  {
231
- "epoch": 1.144080307056392,
232
- "grad_norm": 4.150296688079834,
233
- "learning_rate": 3.093199488239347e-05,
234
- "loss": 0.203,
235
  "step": 15500
236
  },
237
  {
238
- "epoch": 1.1809861234130499,
239
- "grad_norm": 2.604682207107544,
240
- "learning_rate": 3.0316897943115834e-05,
241
- "loss": 0.2002,
242
  "step": 16000
243
  },
244
  {
245
- "epoch": 1.2178919397697077,
246
- "grad_norm": 2.8031857013702393,
247
- "learning_rate": 2.9701801003838208e-05,
248
- "loss": 0.2056,
249
  "step": 16500
250
  },
251
  {
252
- "epoch": 1.2547977561263655,
253
- "grad_norm": 4.056972503662109,
254
- "learning_rate": 2.9086704064560578e-05,
255
- "loss": 0.2067,
256
  "step": 17000
257
  },
258
  {
259
- "epoch": 1.2917035724830233,
260
- "grad_norm": 2.9248251914978027,
261
- "learning_rate": 2.8471607125282945e-05,
262
- "loss": 0.2049,
263
  "step": 17500
264
  },
265
  {
266
- "epoch": 1.328609388839681,
267
- "grad_norm": 1.4066252708435059,
268
- "learning_rate": 2.7856510186005312e-05,
269
- "loss": 0.201,
270
  "step": 18000
271
  },
272
  {
273
- "epoch": 1.3655152051963388,
274
- "grad_norm": 1.4685883522033691,
275
- "learning_rate": 2.7241413246727686e-05,
276
- "loss": 0.1983,
277
  "step": 18500
278
  },
279
  {
280
- "epoch": 1.4024210215529966,
281
- "grad_norm": 1.9358257055282593,
282
- "learning_rate": 2.6626316307450056e-05,
283
- "loss": 0.1983,
284
  "step": 19000
285
  },
286
  {
287
- "epoch": 1.4393268379096544,
288
- "grad_norm": 1.5204322338104248,
289
- "learning_rate": 2.6011219368172423e-05,
290
- "loss": 0.2032,
291
  "step": 19500
292
  },
293
  {
294
- "epoch": 1.4762326542663124,
295
- "grad_norm": 2.7880804538726807,
296
- "learning_rate": 2.5396122428894797e-05,
297
- "loss": 0.2045,
298
  "step": 20000
299
  },
300
  {
301
- "epoch": 1.51313847062297,
302
- "grad_norm": 2.1745991706848145,
303
- "learning_rate": 2.4781025489617167e-05,
304
- "loss": 0.1979,
305
  "step": 20500
306
  },
307
  {
308
- "epoch": 1.550044286979628,
309
- "grad_norm": 1.6532700061798096,
310
- "learning_rate": 2.4165928550339534e-05,
311
- "loss": 0.1979,
312
  "step": 21000
313
  },
314
  {
315
- "epoch": 1.5869501033362858,
316
- "grad_norm": 2.7065317630767822,
317
- "learning_rate": 2.3550831611061904e-05,
318
- "loss": 0.1958,
319
  "step": 21500
320
  },
321
  {
322
- "epoch": 1.6238559196929436,
323
- "grad_norm": 2.1913399696350098,
324
- "learning_rate": 2.2935734671784274e-05,
325
- "loss": 0.1958,
326
  "step": 22000
327
  },
328
  {
329
- "epoch": 1.6607617360496014,
330
- "grad_norm": 2.5118260383605957,
331
- "learning_rate": 2.2320637732506645e-05,
332
- "loss": 0.2016,
333
  "step": 22500
334
  },
335
  {
336
- "epoch": 1.6976675524062592,
337
- "grad_norm": 1.4727787971496582,
338
- "learning_rate": 2.1705540793229015e-05,
339
- "loss": 0.1965,
340
  "step": 23000
341
  },
342
  {
343
- "epoch": 1.734573368762917,
344
- "grad_norm": 2.5935685634613037,
345
- "learning_rate": 2.1090443853951382e-05,
346
- "loss": 0.1936,
347
  "step": 23500
348
  },
349
  {
350
- "epoch": 1.7714791851195748,
351
- "grad_norm": 1.701431155204773,
352
- "learning_rate": 2.0475346914673755e-05,
353
- "loss": 0.1982,
354
  "step": 24000
355
  },
356
  {
357
- "epoch": 1.8083850014762326,
358
- "grad_norm": 2.9000027179718018,
359
- "learning_rate": 1.9860249975396122e-05,
360
- "loss": 0.1955,
361
  "step": 24500
362
  },
363
  {
364
- "epoch": 1.8452908178328906,
365
- "grad_norm": 3.60319447517395,
366
- "learning_rate": 1.9245153036118493e-05,
367
- "loss": 0.1962,
368
  "step": 25000
369
  },
370
  {
371
- "epoch": 1.8821966341895484,
372
- "grad_norm": 2.8174662590026855,
373
- "learning_rate": 1.8630056096840863e-05,
374
- "loss": 0.1918,
375
  "step": 25500
376
  },
377
  {
378
- "epoch": 1.9191024505462062,
379
- "grad_norm": 3.1348931789398193,
380
- "learning_rate": 1.8014959157563233e-05,
381
- "loss": 0.1951,
382
  "step": 26000
383
  },
384
  {
385
- "epoch": 1.956008266902864,
386
- "grad_norm": 2.737175941467285,
387
- "learning_rate": 1.7399862218285603e-05,
388
- "loss": 0.1885,
389
  "step": 26500
390
  },
391
  {
392
- "epoch": 1.9929140832595218,
393
- "grad_norm": 3.2441189289093018,
394
- "learning_rate": 1.678476527900797e-05,
395
- "loss": 0.1944,
396
  "step": 27000
397
  },
398
  {
399
- "epoch": 2.0,
400
- "eval_loss": 0.2276565432548523,
401
- "eval_mse": 0.22765655234639334,
402
- "eval_runtime": 50.6433,
403
- "eval_samples_per_second": 1902.284,
404
- "eval_steps_per_second": 237.8,
405
- "step": 27096
406
- },
407
- {
408
- "epoch": 2.0298198996161796,
409
- "grad_norm": 2.415947198867798,
410
- "learning_rate": 1.6169668339730344e-05,
411
- "loss": 0.1606,
412
  "step": 27500
413
  },
414
  {
415
- "epoch": 2.0667257159728374,
416
- "grad_norm": 2.8117918968200684,
417
- "learning_rate": 1.555457140045271e-05,
418
- "loss": 0.1566,
419
  "step": 28000
420
  },
421
  {
422
- "epoch": 2.103631532329495,
423
- "grad_norm": 2.8294386863708496,
424
- "learning_rate": 1.4939474461175081e-05,
425
- "loss": 0.1549,
426
  "step": 28500
427
  },
428
  {
429
- "epoch": 2.140537348686153,
430
- "grad_norm": 2.073002576828003,
431
- "learning_rate": 1.4324377521897453e-05,
432
- "loss": 0.151,
433
  "step": 29000
434
  },
435
  {
436
- "epoch": 2.1774431650428108,
437
- "grad_norm": 2.204664707183838,
438
- "learning_rate": 1.3709280582619822e-05,
439
- "loss": 0.1558,
440
  "step": 29500
441
  },
442
  {
443
- "epoch": 2.2143489813994686,
444
- "grad_norm": 2.2928786277770996,
445
- "learning_rate": 1.3094183643342192e-05,
446
- "loss": 0.1531,
447
  "step": 30000
448
  },
449
  {
450
- "epoch": 2.2512547977561264,
451
- "grad_norm": 4.089919567108154,
452
- "learning_rate": 1.2479086704064562e-05,
453
- "loss": 0.1578,
454
  "step": 30500
455
  },
456
  {
457
- "epoch": 2.288160614112784,
458
- "grad_norm": 3.0547707080841064,
459
- "learning_rate": 1.186398976478693e-05,
460
- "loss": 0.1551,
461
  "step": 31000
462
  },
463
  {
464
- "epoch": 2.325066430469442,
465
- "grad_norm": 1.791717767715454,
466
- "learning_rate": 1.1248892825509301e-05,
467
- "loss": 0.1538,
468
  "step": 31500
469
  },
470
  {
471
- "epoch": 2.3619722468260997,
472
- "grad_norm": 1.498639702796936,
473
- "learning_rate": 1.0633795886231671e-05,
474
- "loss": 0.1533,
475
  "step": 32000
476
  },
477
  {
478
- "epoch": 2.3988780631827575,
479
- "grad_norm": 1.8389638662338257,
480
- "learning_rate": 1.001869894695404e-05,
481
- "loss": 0.1536,
482
  "step": 32500
483
  },
484
  {
485
- "epoch": 2.4357838795394153,
486
- "grad_norm": 2.8968076705932617,
487
- "learning_rate": 9.40360200767641e-06,
488
- "loss": 0.1518,
489
  "step": 33000
490
  },
491
  {
492
- "epoch": 2.472689695896073,
493
- "grad_norm": 1.8149243593215942,
494
- "learning_rate": 8.78850506839878e-06,
495
- "loss": 0.1508,
496
  "step": 33500
497
  },
498
  {
499
- "epoch": 2.509595512252731,
500
- "grad_norm": 2.4595253467559814,
501
- "learning_rate": 8.17340812912115e-06,
502
- "loss": 0.1509,
503
  "step": 34000
504
  },
505
  {
506
- "epoch": 2.5465013286093887,
507
- "grad_norm": 2.2790329456329346,
508
- "learning_rate": 7.55831118984352e-06,
509
- "loss": 0.1495,
510
  "step": 34500
511
  },
512
  {
513
- "epoch": 2.5834071449660465,
514
- "grad_norm": 2.1698362827301025,
515
- "learning_rate": 6.94321425056589e-06,
516
- "loss": 0.1507,
517
  "step": 35000
518
  },
519
  {
520
- "epoch": 2.6203129613227043,
521
- "grad_norm": 1.564191460609436,
522
- "learning_rate": 6.328117311288259e-06,
523
- "loss": 0.1512,
524
  "step": 35500
525
  },
526
  {
527
- "epoch": 2.657218777679362,
528
- "grad_norm": 1.279205322265625,
529
- "learning_rate": 5.713020372010629e-06,
530
- "loss": 0.1479,
531
  "step": 36000
532
  },
533
  {
534
- "epoch": 2.69412459403602,
535
- "grad_norm": 2.1811535358428955,
536
- "learning_rate": 5.097923432732999e-06,
537
- "loss": 0.1508,
538
  "step": 36500
539
  },
540
  {
541
- "epoch": 2.7310304103926777,
542
- "grad_norm": 2.391449451446533,
543
- "learning_rate": 4.482826493455368e-06,
544
- "loss": 0.1444,
545
  "step": 37000
546
  },
547
  {
548
- "epoch": 2.7679362267493355,
549
- "grad_norm": 1.848325490951538,
550
- "learning_rate": 3.8677295541777385e-06,
551
- "loss": 0.1487,
552
  "step": 37500
553
  },
554
  {
555
- "epoch": 2.8048420431059933,
556
- "grad_norm": 2.8446269035339355,
557
- "learning_rate": 3.2526326149001084e-06,
558
- "loss": 0.1497,
559
  "step": 38000
560
  },
561
  {
562
- "epoch": 2.841747859462651,
563
- "grad_norm": 2.272193670272827,
564
- "learning_rate": 2.6375356756224782e-06,
565
- "loss": 0.1506,
566
  "step": 38500
567
  },
568
  {
569
- "epoch": 2.878653675819309,
570
- "grad_norm": 2.2728445529937744,
571
- "learning_rate": 2.022438736344848e-06,
572
- "loss": 0.1497,
573
  "step": 39000
574
  },
575
  {
576
- "epoch": 2.9155594921759667,
577
- "grad_norm": 1.8776350021362305,
578
- "learning_rate": 1.4073417970672177e-06,
579
- "loss": 0.1465,
580
  "step": 39500
581
  },
582
  {
583
- "epoch": 2.952465308532625,
584
- "grad_norm": 1.9717949628829956,
585
- "learning_rate": 7.922448577895876e-07,
586
- "loss": 0.1493,
587
  "step": 40000
588
  },
589
  {
590
- "epoch": 2.9893711248892827,
591
- "grad_norm": 2.8036680221557617,
592
- "learning_rate": 1.771479185119575e-07,
593
- "loss": 0.1489,
594
  "step": 40500
595
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
  {
597
  "epoch": 3.0,
598
- "eval_loss": 0.21882201731204987,
599
- "eval_mse": 0.21882200171115507,
600
- "eval_runtime": 50.9689,
601
- "eval_samples_per_second": 1890.132,
602
- "eval_steps_per_second": 236.281,
603
- "step": 40644
604
  },
605
  {
606
  "epoch": 3.0,
607
- "step": 40644,
608
- "total_flos": 4.283504864539085e+16,
609
- "train_loss": 0.21721052708159796,
610
- "train_runtime": 3074.1411,
611
- "train_samples_per_second": 846.131,
612
- "train_steps_per_second": 13.221
613
  }
614
  ],
615
  "logging_steps": 500,
616
- "max_steps": 40644,
617
  "num_input_tokens_seen": 0,
618
  "num_train_epochs": 3,
619
  "save_steps": 500,
@@ -629,8 +2337,8 @@
629
  "attributes": {}
630
  }
631
  },
632
- "total_flos": 4.283504864539085e+16,
633
- "train_batch_size": 64,
634
  "trial_name": null,
635
  "trial_params": null
636
  }
 
1
  {
2
+ "best_metric": 0.20824576914310455,
3
+ "best_model_checkpoint": "multilingual-e5-small-aligned-transformed-sentiment/checkpoint-162573",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 162573,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.009226624347216328,
13
+ "grad_norm": 4.861542224884033,
14
+ "learning_rate": 4.98462229275464e-05,
15
+ "loss": 0.4705,
16
  "step": 500
17
  },
18
  {
19
+ "epoch": 0.018453248694432656,
20
+ "grad_norm": 4.163719177246094,
21
+ "learning_rate": 4.969244585509279e-05,
22
+ "loss": 0.3878,
23
  "step": 1000
24
  },
25
  {
26
+ "epoch": 0.027679873041648984,
27
+ "grad_norm": 5.356781482696533,
28
+ "learning_rate": 4.9538668782639185e-05,
29
+ "loss": 0.3727,
30
  "step": 1500
31
  },
32
  {
33
+ "epoch": 0.03690649738886531,
34
+ "grad_norm": 4.754086017608643,
35
+ "learning_rate": 4.9384891710185583e-05,
36
+ "loss": 0.3647,
37
  "step": 2000
38
  },
39
  {
40
+ "epoch": 0.04613312173608164,
41
+ "grad_norm": 4.055091857910156,
42
+ "learning_rate": 4.9231114637731975e-05,
43
+ "loss": 0.3508,
44
  "step": 2500
45
  },
46
  {
47
+ "epoch": 0.05535974608329797,
48
+ "grad_norm": 4.6239824295043945,
49
+ "learning_rate": 4.907733756527837e-05,
50
+ "loss": 0.3418,
51
  "step": 3000
52
  },
53
  {
54
+ "epoch": 0.0645863704305143,
55
+ "grad_norm": 3.2537200450897217,
56
+ "learning_rate": 4.8923560492824766e-05,
57
+ "loss": 0.3471,
58
  "step": 3500
59
  },
60
  {
61
+ "epoch": 0.07381299477773062,
62
+ "grad_norm": 8.821883201599121,
63
+ "learning_rate": 4.876978342037116e-05,
64
+ "loss": 0.3448,
65
  "step": 4000
66
  },
67
  {
68
+ "epoch": 0.08303961912494695,
69
+ "grad_norm": 4.157027244567871,
70
+ "learning_rate": 4.8616006347917556e-05,
71
+ "loss": 0.3304,
72
  "step": 4500
73
  },
74
  {
75
+ "epoch": 0.09226624347216328,
76
+ "grad_norm": 5.966025352478027,
77
+ "learning_rate": 4.846222927546395e-05,
78
+ "loss": 0.3288,
79
  "step": 5000
80
  },
81
  {
82
+ "epoch": 0.1014928678193796,
83
+ "grad_norm": 2.689772367477417,
84
+ "learning_rate": 4.830845220301034e-05,
85
+ "loss": 0.315,
86
  "step": 5500
87
  },
88
  {
89
+ "epoch": 0.11071949216659593,
90
+ "grad_norm": 4.417109966278076,
91
+ "learning_rate": 4.815467513055674e-05,
92
+ "loss": 0.3171,
93
  "step": 6000
94
  },
95
  {
96
+ "epoch": 0.11994611651381226,
97
+ "grad_norm": 2.441032886505127,
98
+ "learning_rate": 4.800089805810313e-05,
99
+ "loss": 0.3111,
100
  "step": 6500
101
  },
102
  {
103
+ "epoch": 0.1291727408610286,
104
+ "grad_norm": 5.741962432861328,
105
+ "learning_rate": 4.784712098564952e-05,
106
+ "loss": 0.3161,
107
  "step": 7000
108
  },
109
  {
110
+ "epoch": 0.13839936520824492,
111
+ "grad_norm": 3.6779587268829346,
112
+ "learning_rate": 4.769334391319592e-05,
113
+ "loss": 0.3172,
114
  "step": 7500
115
  },
116
  {
117
+ "epoch": 0.14762598955546125,
118
+ "grad_norm": 3.389577627182007,
119
+ "learning_rate": 4.753956684074232e-05,
120
+ "loss": 0.3089,
121
  "step": 8000
122
  },
123
  {
124
+ "epoch": 0.15685261390267757,
125
+ "grad_norm": 6.343785285949707,
126
+ "learning_rate": 4.7385789768288705e-05,
127
+ "loss": 0.3038,
128
  "step": 8500
129
  },
130
  {
131
+ "epoch": 0.1660792382498939,
132
+ "grad_norm": 3.811483383178711,
133
+ "learning_rate": 4.7232012695835104e-05,
134
+ "loss": 0.2977,
135
  "step": 9000
136
  },
137
  {
138
+ "epoch": 0.17530586259711023,
139
+ "grad_norm": 3.0193912982940674,
140
+ "learning_rate": 4.70782356233815e-05,
141
+ "loss": 0.302,
142
  "step": 9500
143
  },
144
  {
145
+ "epoch": 0.18453248694432656,
146
+ "grad_norm": 5.484386444091797,
147
+ "learning_rate": 4.6924458550927894e-05,
148
+ "loss": 0.3041,
149
  "step": 10000
150
  },
151
  {
152
+ "epoch": 0.19375911129154288,
153
+ "grad_norm": 4.629725933074951,
154
+ "learning_rate": 4.6770681478474286e-05,
155
+ "loss": 0.3036,
156
  "step": 10500
157
  },
158
  {
159
+ "epoch": 0.2029857356387592,
160
+ "grad_norm": 9.174530982971191,
161
+ "learning_rate": 4.6616904406020685e-05,
162
+ "loss": 0.2884,
163
  "step": 11000
164
  },
165
  {
166
+ "epoch": 0.21221235998597554,
167
+ "grad_norm": 11.994110107421875,
168
+ "learning_rate": 4.6463127333567077e-05,
169
+ "loss": 0.2913,
170
  "step": 11500
171
  },
172
  {
173
+ "epoch": 0.22143898433319187,
174
+ "grad_norm": 4.78723669052124,
175
+ "learning_rate": 4.630935026111347e-05,
176
+ "loss": 0.2903,
177
  "step": 12000
178
  },
179
  {
180
+ "epoch": 0.2306656086804082,
181
+ "grad_norm": 4.056216239929199,
182
+ "learning_rate": 4.615557318865987e-05,
183
+ "loss": 0.2885,
184
  "step": 12500
185
  },
186
  {
187
+ "epoch": 0.23989223302762452,
188
+ "grad_norm": 2.5596282482147217,
189
+ "learning_rate": 4.600179611620626e-05,
190
+ "loss": 0.2865,
191
  "step": 13000
192
  },
193
  {
194
+ "epoch": 0.24911885737484085,
195
+ "grad_norm": 3.354088544845581,
196
+ "learning_rate": 4.584801904375266e-05,
197
+ "loss": 0.2801,
198
  "step": 13500
199
  },
200
  {
201
+ "epoch": 0.2583454817220572,
202
+ "grad_norm": 2.7451272010803223,
203
+ "learning_rate": 4.569424197129905e-05,
204
+ "loss": 0.2804,
 
 
 
 
 
 
 
 
 
205
  "step": 14000
206
  },
207
  {
208
+ "epoch": 0.2675721060692735,
209
+ "grad_norm": 2.0492589473724365,
210
+ "learning_rate": 4.554046489884544e-05,
211
+ "loss": 0.2805,
212
  "step": 14500
213
  },
214
  {
215
+ "epoch": 0.27679873041648984,
216
+ "grad_norm": 2.7824437618255615,
217
+ "learning_rate": 4.538668782639184e-05,
218
+ "loss": 0.276,
219
  "step": 15000
220
  },
221
  {
222
+ "epoch": 0.28602535476370616,
223
+ "grad_norm": 2.753225326538086,
224
+ "learning_rate": 4.523291075393823e-05,
225
+ "loss": 0.2779,
226
  "step": 15500
227
  },
228
  {
229
+ "epoch": 0.2952519791109225,
230
+ "grad_norm": 4.156832218170166,
231
+ "learning_rate": 4.5079133681484624e-05,
232
+ "loss": 0.2776,
233
  "step": 16000
234
  },
235
  {
236
+ "epoch": 0.3044786034581388,
237
+ "grad_norm": 1.4764610528945923,
238
+ "learning_rate": 4.492535660903102e-05,
239
+ "loss": 0.2777,
240
  "step": 16500
241
  },
242
  {
243
+ "epoch": 0.31370522780535515,
244
+ "grad_norm": 2.732165813446045,
245
+ "learning_rate": 4.477157953657742e-05,
246
+ "loss": 0.2759,
247
  "step": 17000
248
  },
249
  {
250
+ "epoch": 0.3229318521525715,
251
+ "grad_norm": 3.756098508834839,
252
+ "learning_rate": 4.4617802464123806e-05,
253
+ "loss": 0.2707,
254
  "step": 17500
255
  },
256
  {
257
+ "epoch": 0.3321584764997878,
258
+ "grad_norm": 3.7828195095062256,
259
+ "learning_rate": 4.4464025391670205e-05,
260
+ "loss": 0.2692,
261
  "step": 18000
262
  },
263
  {
264
+ "epoch": 0.34138510084700413,
265
+ "grad_norm": 6.942204475402832,
266
+ "learning_rate": 4.4310248319216603e-05,
267
+ "loss": 0.2722,
268
  "step": 18500
269
  },
270
  {
271
+ "epoch": 0.35061172519422046,
272
+ "grad_norm": 2.0811824798583984,
273
+ "learning_rate": 4.4156471246762995e-05,
274
+ "loss": 0.2624,
275
  "step": 19000
276
  },
277
  {
278
+ "epoch": 0.3598383495414368,
279
+ "grad_norm": 2.2063019275665283,
280
+ "learning_rate": 4.400269417430939e-05,
281
+ "loss": 0.2567,
282
  "step": 19500
283
  },
284
  {
285
+ "epoch": 0.3690649738886531,
286
+ "grad_norm": 3.381683826446533,
287
+ "learning_rate": 4.3848917101855786e-05,
288
+ "loss": 0.2623,
289
  "step": 20000
290
  },
291
  {
292
+ "epoch": 0.37829159823586944,
293
+ "grad_norm": 2.4916694164276123,
294
+ "learning_rate": 4.369514002940218e-05,
295
+ "loss": 0.2665,
296
  "step": 20500
297
  },
298
  {
299
+ "epoch": 0.38751822258308577,
300
+ "grad_norm": 3.138047695159912,
301
+ "learning_rate": 4.354136295694857e-05,
302
+ "loss": 0.251,
303
  "step": 21000
304
  },
305
  {
306
+ "epoch": 0.3967448469303021,
307
+ "grad_norm": 4.300042152404785,
308
+ "learning_rate": 4.338758588449497e-05,
309
+ "loss": 0.2542,
310
  "step": 21500
311
  },
312
  {
313
+ "epoch": 0.4059714712775184,
314
+ "grad_norm": 4.118566513061523,
315
+ "learning_rate": 4.323380881204136e-05,
316
+ "loss": 0.2545,
317
  "step": 22000
318
  },
319
  {
320
+ "epoch": 0.41519809562473475,
321
+ "grad_norm": 3.6837940216064453,
322
+ "learning_rate": 4.308003173958776e-05,
323
+ "loss": 0.2506,
324
  "step": 22500
325
  },
326
  {
327
+ "epoch": 0.4244247199719511,
328
+ "grad_norm": 3.9393532276153564,
329
+ "learning_rate": 4.292625466713415e-05,
330
+ "loss": 0.248,
331
  "step": 23000
332
  },
333
  {
334
+ "epoch": 0.4336513443191674,
335
+ "grad_norm": 4.186630725860596,
336
+ "learning_rate": 4.277247759468054e-05,
337
+ "loss": 0.2574,
338
  "step": 23500
339
  },
340
  {
341
+ "epoch": 0.44287796866638374,
342
+ "grad_norm": 2.1121768951416016,
343
+ "learning_rate": 4.261870052222694e-05,
344
+ "loss": 0.2552,
345
  "step": 24000
346
  },
347
  {
348
+ "epoch": 0.45210459301360006,
349
+ "grad_norm": 3.940450429916382,
350
+ "learning_rate": 4.246492344977333e-05,
351
+ "loss": 0.2449,
352
  "step": 24500
353
  },
354
  {
355
+ "epoch": 0.4613312173608164,
356
+ "grad_norm": 3.8467142581939697,
357
+ "learning_rate": 4.2311146377319725e-05,
358
+ "loss": 0.2497,
359
  "step": 25000
360
  },
361
  {
362
+ "epoch": 0.4705578417080327,
363
+ "grad_norm": 4.122659683227539,
364
+ "learning_rate": 4.2157369304866124e-05,
365
+ "loss": 0.2502,
366
  "step": 25500
367
  },
368
  {
369
+ "epoch": 0.47978446605524905,
370
+ "grad_norm": 4.005275249481201,
371
+ "learning_rate": 4.200359223241252e-05,
372
+ "loss": 0.239,
373
  "step": 26000
374
  },
375
  {
376
+ "epoch": 0.4890110904024654,
377
+ "grad_norm": 3.944265365600586,
378
+ "learning_rate": 4.184981515995891e-05,
379
+ "loss": 0.2495,
380
  "step": 26500
381
  },
382
  {
383
+ "epoch": 0.4982377147496817,
384
+ "grad_norm": 5.242092609405518,
385
+ "learning_rate": 4.1696038087505306e-05,
386
+ "loss": 0.2504,
387
  "step": 27000
388
  },
389
  {
390
+ "epoch": 0.507464339096898,
391
+ "grad_norm": 3.0890393257141113,
392
+ "learning_rate": 4.1542261015051705e-05,
393
+ "loss": 0.2424,
 
 
 
 
 
 
 
 
 
394
  "step": 27500
395
  },
396
  {
397
+ "epoch": 0.5166909634441144,
398
+ "grad_norm": 2.5902299880981445,
399
+ "learning_rate": 4.1388483942598097e-05,
400
+ "loss": 0.2432,
401
  "step": 28000
402
  },
403
  {
404
+ "epoch": 0.5259175877913307,
405
+ "grad_norm": 2.1534225940704346,
406
+ "learning_rate": 4.123470687014449e-05,
407
+ "loss": 0.2423,
408
  "step": 28500
409
  },
410
  {
411
+ "epoch": 0.535144212138547,
412
+ "grad_norm": 4.093803405761719,
413
+ "learning_rate": 4.108092979769089e-05,
414
+ "loss": 0.2393,
415
  "step": 29000
416
  },
417
  {
418
+ "epoch": 0.5443708364857633,
419
+ "grad_norm": 5.0820722579956055,
420
+ "learning_rate": 4.092715272523728e-05,
421
+ "loss": 0.2355,
422
  "step": 29500
423
  },
424
  {
425
+ "epoch": 0.5535974608329797,
426
+ "grad_norm": 3.2006969451904297,
427
+ "learning_rate": 4.077337565278367e-05,
428
+ "loss": 0.2378,
429
  "step": 30000
430
  },
431
  {
432
+ "epoch": 0.562824085180196,
433
+ "grad_norm": 2.7393364906311035,
434
+ "learning_rate": 4.061959858033007e-05,
435
+ "loss": 0.2391,
436
  "step": 30500
437
  },
438
  {
439
+ "epoch": 0.5720507095274123,
440
+ "grad_norm": 5.7313361167907715,
441
+ "learning_rate": 4.046582150787646e-05,
442
+ "loss": 0.2378,
443
  "step": 31000
444
  },
445
  {
446
+ "epoch": 0.5812773338746287,
447
+ "grad_norm": 3.5704684257507324,
448
+ "learning_rate": 4.031204443542286e-05,
449
+ "loss": 0.2416,
450
  "step": 31500
451
  },
452
  {
453
+ "epoch": 0.590503958221845,
454
+ "grad_norm": 3.010260820388794,
455
+ "learning_rate": 4.015826736296925e-05,
456
+ "loss": 0.2315,
457
  "step": 32000
458
  },
459
  {
460
+ "epoch": 0.5997305825690613,
461
+ "grad_norm": 6.030303001403809,
462
+ "learning_rate": 4.0004490290515644e-05,
463
+ "loss": 0.2346,
464
  "step": 32500
465
  },
466
  {
467
+ "epoch": 0.6089572069162776,
468
+ "grad_norm": 2.6332879066467285,
469
+ "learning_rate": 3.985071321806204e-05,
470
+ "loss": 0.2387,
471
  "step": 33000
472
  },
473
  {
474
+ "epoch": 0.618183831263494,
475
+ "grad_norm": 3.79506254196167,
476
+ "learning_rate": 3.9696936145608434e-05,
477
+ "loss": 0.2314,
478
  "step": 33500
479
  },
480
  {
481
+ "epoch": 0.6274104556107103,
482
+ "grad_norm": 3.9026734828948975,
483
+ "learning_rate": 3.9543159073154826e-05,
484
+ "loss": 0.2265,
485
  "step": 34000
486
  },
487
  {
488
+ "epoch": 0.6366370799579266,
489
+ "grad_norm": 7.885356426239014,
490
+ "learning_rate": 3.9389382000701225e-05,
491
+ "loss": 0.2288,
492
  "step": 34500
493
  },
494
  {
495
+ "epoch": 0.645863704305143,
496
+ "grad_norm": 3.634693145751953,
497
+ "learning_rate": 3.9235604928247623e-05,
498
+ "loss": 0.2269,
499
  "step": 35000
500
  },
501
  {
502
+ "epoch": 0.6550903286523593,
503
+ "grad_norm": 4.571321487426758,
504
+ "learning_rate": 3.9081827855794015e-05,
505
+ "loss": 0.226,
506
  "step": 35500
507
  },
508
  {
509
+ "epoch": 0.6643169529995756,
510
+ "grad_norm": 4.4402337074279785,
511
+ "learning_rate": 3.892805078334041e-05,
512
+ "loss": 0.2227,
513
  "step": 36000
514
  },
515
  {
516
+ "epoch": 0.6735435773467919,
517
+ "grad_norm": 2.3273956775665283,
518
+ "learning_rate": 3.8774273710886806e-05,
519
+ "loss": 0.2328,
520
  "step": 36500
521
  },
522
  {
523
+ "epoch": 0.6827702016940083,
524
+ "grad_norm": 7.7202372550964355,
525
+ "learning_rate": 3.86204966384332e-05,
526
+ "loss": 0.2242,
527
  "step": 37000
528
  },
529
  {
530
+ "epoch": 0.6919968260412246,
531
+ "grad_norm": 3.037423849105835,
532
+ "learning_rate": 3.846671956597959e-05,
533
+ "loss": 0.2219,
534
  "step": 37500
535
  },
536
  {
537
+ "epoch": 0.7012234503884409,
538
+ "grad_norm": 3.3124380111694336,
539
+ "learning_rate": 3.831294249352599e-05,
540
+ "loss": 0.2217,
541
  "step": 38000
542
  },
543
  {
544
+ "epoch": 0.7104500747356572,
545
+ "grad_norm": 1.5552330017089844,
546
+ "learning_rate": 3.815916542107238e-05,
547
+ "loss": 0.2237,
548
  "step": 38500
549
  },
550
  {
551
+ "epoch": 0.7196766990828736,
552
+ "grad_norm": 3.6003737449645996,
553
+ "learning_rate": 3.800538834861878e-05,
554
+ "loss": 0.2212,
555
  "step": 39000
556
  },
557
  {
558
+ "epoch": 0.7289033234300899,
559
+ "grad_norm": 2.323984146118164,
560
+ "learning_rate": 3.785161127616517e-05,
561
+ "loss": 0.2217,
562
  "step": 39500
563
  },
564
  {
565
+ "epoch": 0.7381299477773062,
566
+ "grad_norm": 4.002011775970459,
567
+ "learning_rate": 3.769783420371156e-05,
568
+ "loss": 0.2178,
569
  "step": 40000
570
  },
571
  {
572
+ "epoch": 0.7473565721245226,
573
+ "grad_norm": 9.153217315673828,
574
+ "learning_rate": 3.754405713125796e-05,
575
+ "loss": 0.2156,
576
  "step": 40500
577
  },
578
+ {
579
+ "epoch": 0.7565831964717389,
580
+ "grad_norm": 4.3000712394714355,
581
+ "learning_rate": 3.739028005880435e-05,
582
+ "loss": 0.2202,
583
+ "step": 41000
584
+ },
585
+ {
586
+ "epoch": 0.7658098208189552,
587
+ "grad_norm": 5.20850944519043,
588
+ "learning_rate": 3.7236502986350745e-05,
589
+ "loss": 0.2156,
590
+ "step": 41500
591
+ },
592
+ {
593
+ "epoch": 0.7750364451661715,
594
+ "grad_norm": 3.736025810241699,
595
+ "learning_rate": 3.7082725913897144e-05,
596
+ "loss": 0.2106,
597
+ "step": 42000
598
+ },
599
+ {
600
+ "epoch": 0.7842630695133879,
601
+ "grad_norm": 4.413645267486572,
602
+ "learning_rate": 3.692894884144354e-05,
603
+ "loss": 0.2154,
604
+ "step": 42500
605
+ },
606
+ {
607
+ "epoch": 0.7934896938606042,
608
+ "grad_norm": 3.298003911972046,
609
+ "learning_rate": 3.677517176898993e-05,
610
+ "loss": 0.2106,
611
+ "step": 43000
612
+ },
613
+ {
614
+ "epoch": 0.8027163182078205,
615
+ "grad_norm": 2.9312047958374023,
616
+ "learning_rate": 3.6621394696536326e-05,
617
+ "loss": 0.2043,
618
+ "step": 43500
619
+ },
620
+ {
621
+ "epoch": 0.8119429425550369,
622
+ "grad_norm": 4.253361701965332,
623
+ "learning_rate": 3.6467617624082725e-05,
624
+ "loss": 0.2131,
625
+ "step": 44000
626
+ },
627
+ {
628
+ "epoch": 0.8211695669022532,
629
+ "grad_norm": 2.0434412956237793,
630
+ "learning_rate": 3.6313840551629117e-05,
631
+ "loss": 0.2144,
632
+ "step": 44500
633
+ },
634
+ {
635
+ "epoch": 0.8303961912494695,
636
+ "grad_norm": 3.0040202140808105,
637
+ "learning_rate": 3.616006347917551e-05,
638
+ "loss": 0.2124,
639
+ "step": 45000
640
+ },
641
+ {
642
+ "epoch": 0.8396228155966858,
643
+ "grad_norm": 3.3966643810272217,
644
+ "learning_rate": 3.600628640672191e-05,
645
+ "loss": 0.2077,
646
+ "step": 45500
647
+ },
648
+ {
649
+ "epoch": 0.8488494399439022,
650
+ "grad_norm": 2.4415907859802246,
651
+ "learning_rate": 3.58525093342683e-05,
652
+ "loss": 0.2049,
653
+ "step": 46000
654
+ },
655
+ {
656
+ "epoch": 0.8580760642911185,
657
+ "grad_norm": 3.1614882946014404,
658
+ "learning_rate": 3.569873226181469e-05,
659
+ "loss": 0.2073,
660
+ "step": 46500
661
+ },
662
+ {
663
+ "epoch": 0.8673026886383348,
664
+ "grad_norm": 4.641379356384277,
665
+ "learning_rate": 3.554495518936109e-05,
666
+ "loss": 0.2025,
667
+ "step": 47000
668
+ },
669
+ {
670
+ "epoch": 0.8765293129855511,
671
+ "grad_norm": 3.275320529937744,
672
+ "learning_rate": 3.539117811690748e-05,
673
+ "loss": 0.206,
674
+ "step": 47500
675
+ },
676
+ {
677
+ "epoch": 0.8857559373327675,
678
+ "grad_norm": 2.602555274963379,
679
+ "learning_rate": 3.523740104445388e-05,
680
+ "loss": 0.2053,
681
+ "step": 48000
682
+ },
683
+ {
684
+ "epoch": 0.8949825616799838,
685
+ "grad_norm": 3.3625969886779785,
686
+ "learning_rate": 3.508362397200027e-05,
687
+ "loss": 0.2031,
688
+ "step": 48500
689
+ },
690
+ {
691
+ "epoch": 0.9042091860272001,
692
+ "grad_norm": 2.0234267711639404,
693
+ "learning_rate": 3.4929846899546664e-05,
694
+ "loss": 0.1981,
695
+ "step": 49000
696
+ },
697
+ {
698
+ "epoch": 0.9134358103744165,
699
+ "grad_norm": 2.6035192012786865,
700
+ "learning_rate": 3.477606982709306e-05,
701
+ "loss": 0.2013,
702
+ "step": 49500
703
+ },
704
+ {
705
+ "epoch": 0.9226624347216328,
706
+ "grad_norm": 5.516040802001953,
707
+ "learning_rate": 3.4622292754639454e-05,
708
+ "loss": 0.2063,
709
+ "step": 50000
710
+ },
711
+ {
712
+ "epoch": 0.9318890590688491,
713
+ "grad_norm": 4.573687553405762,
714
+ "learning_rate": 3.4468515682185846e-05,
715
+ "loss": 0.2044,
716
+ "step": 50500
717
+ },
718
+ {
719
+ "epoch": 0.9411156834160654,
720
+ "grad_norm": 3.124086856842041,
721
+ "learning_rate": 3.4314738609732245e-05,
722
+ "loss": 0.1937,
723
+ "step": 51000
724
+ },
725
+ {
726
+ "epoch": 0.9503423077632818,
727
+ "grad_norm": 4.916173458099365,
728
+ "learning_rate": 3.4160961537278643e-05,
729
+ "loss": 0.1959,
730
+ "step": 51500
731
+ },
732
+ {
733
+ "epoch": 0.9595689321104981,
734
+ "grad_norm": 3.445047378540039,
735
+ "learning_rate": 3.400718446482503e-05,
736
+ "loss": 0.1999,
737
+ "step": 52000
738
+ },
739
+ {
740
+ "epoch": 0.9687955564577144,
741
+ "grad_norm": 2.2390198707580566,
742
+ "learning_rate": 3.385340739237143e-05,
743
+ "loss": 0.1887,
744
+ "step": 52500
745
+ },
746
+ {
747
+ "epoch": 0.9780221808049308,
748
+ "grad_norm": 6.404945373535156,
749
+ "learning_rate": 3.3699630319917826e-05,
750
+ "loss": 0.1963,
751
+ "step": 53000
752
+ },
753
+ {
754
+ "epoch": 0.9872488051521471,
755
+ "grad_norm": 3.268970251083374,
756
+ "learning_rate": 3.354585324746422e-05,
757
+ "loss": 0.1958,
758
+ "step": 53500
759
+ },
760
+ {
761
+ "epoch": 0.9964754294993634,
762
+ "grad_norm": 2.5354039669036865,
763
+ "learning_rate": 3.339207617501061e-05,
764
+ "loss": 0.1898,
765
+ "step": 54000
766
+ },
767
+ {
768
+ "epoch": 1.0,
769
+ "eval_loss": 0.23220877349376678,
770
+ "eval_mse": 0.2322087733155601,
771
+ "eval_runtime": 114.203,
772
+ "eval_samples_per_second": 1687.136,
773
+ "eval_steps_per_second": 210.896,
774
+ "step": 54191
775
+ },
776
+ {
777
+ "epoch": 1.0057020538465797,
778
+ "grad_norm": 2.1543655395507812,
779
+ "learning_rate": 3.323829910255701e-05,
780
+ "loss": 0.1712,
781
+ "step": 54500
782
+ },
783
+ {
784
+ "epoch": 1.014928678193796,
785
+ "grad_norm": 2.780333995819092,
786
+ "learning_rate": 3.30845220301034e-05,
787
+ "loss": 0.1574,
788
+ "step": 55000
789
+ },
790
+ {
791
+ "epoch": 1.0241553025410124,
792
+ "grad_norm": 5.817172527313232,
793
+ "learning_rate": 3.293074495764979e-05,
794
+ "loss": 0.1505,
795
+ "step": 55500
796
+ },
797
+ {
798
+ "epoch": 1.0333819268882287,
799
+ "grad_norm": 5.431843280792236,
800
+ "learning_rate": 3.277696788519619e-05,
801
+ "loss": 0.1551,
802
+ "step": 56000
803
+ },
804
+ {
805
+ "epoch": 1.042608551235445,
806
+ "grad_norm": 2.024513006210327,
807
+ "learning_rate": 3.262319081274258e-05,
808
+ "loss": 0.1541,
809
+ "step": 56500
810
+ },
811
+ {
812
+ "epoch": 1.0518351755826614,
813
+ "grad_norm": 5.155509948730469,
814
+ "learning_rate": 3.246941374028898e-05,
815
+ "loss": 0.1538,
816
+ "step": 57000
817
+ },
818
+ {
819
+ "epoch": 1.0610617999298777,
820
+ "grad_norm": 1.8281043767929077,
821
+ "learning_rate": 3.231563666783537e-05,
822
+ "loss": 0.1503,
823
+ "step": 57500
824
+ },
825
+ {
826
+ "epoch": 1.070288424277094,
827
+ "grad_norm": 3.030827283859253,
828
+ "learning_rate": 3.2161859595381765e-05,
829
+ "loss": 0.1535,
830
+ "step": 58000
831
+ },
832
+ {
833
+ "epoch": 1.0795150486243104,
834
+ "grad_norm": 3.2830984592437744,
835
+ "learning_rate": 3.2008082522928164e-05,
836
+ "loss": 0.1567,
837
+ "step": 58500
838
+ },
839
+ {
840
+ "epoch": 1.0887416729715267,
841
+ "grad_norm": 2.756232500076294,
842
+ "learning_rate": 3.1854305450474555e-05,
843
+ "loss": 0.1576,
844
+ "step": 59000
845
+ },
846
+ {
847
+ "epoch": 1.097968297318743,
848
+ "grad_norm": 2.0984957218170166,
849
+ "learning_rate": 3.170052837802095e-05,
850
+ "loss": 0.161,
851
+ "step": 59500
852
+ },
853
+ {
854
+ "epoch": 1.1071949216659593,
855
+ "grad_norm": 2.4525437355041504,
856
+ "learning_rate": 3.1546751305567346e-05,
857
+ "loss": 0.1542,
858
+ "step": 60000
859
+ },
860
+ {
861
+ "epoch": 1.1164215460131757,
862
+ "grad_norm": 2.31719970703125,
863
+ "learning_rate": 3.1392974233113745e-05,
864
+ "loss": 0.1528,
865
+ "step": 60500
866
+ },
867
+ {
868
+ "epoch": 1.125648170360392,
869
+ "grad_norm": 3.3912220001220703,
870
+ "learning_rate": 3.123919716066013e-05,
871
+ "loss": 0.1551,
872
+ "step": 61000
873
+ },
874
+ {
875
+ "epoch": 1.1348747947076083,
876
+ "grad_norm": 3.2458841800689697,
877
+ "learning_rate": 3.108542008820653e-05,
878
+ "loss": 0.1508,
879
+ "step": 61500
880
+ },
881
+ {
882
+ "epoch": 1.1441014190548247,
883
+ "grad_norm": 3.3046302795410156,
884
+ "learning_rate": 3.093164301575293e-05,
885
+ "loss": 0.1465,
886
+ "step": 62000
887
+ },
888
+ {
889
+ "epoch": 1.153328043402041,
890
+ "grad_norm": 4.0332183837890625,
891
+ "learning_rate": 3.077786594329932e-05,
892
+ "loss": 0.1535,
893
+ "step": 62500
894
+ },
895
+ {
896
+ "epoch": 1.1625546677492573,
897
+ "grad_norm": 2.0470728874206543,
898
+ "learning_rate": 3.062408887084571e-05,
899
+ "loss": 0.1501,
900
+ "step": 63000
901
+ },
902
+ {
903
+ "epoch": 1.1717812920964736,
904
+ "grad_norm": 4.00844669342041,
905
+ "learning_rate": 3.047031179839211e-05,
906
+ "loss": 0.1556,
907
+ "step": 63500
908
+ },
909
+ {
910
+ "epoch": 1.18100791644369,
911
+ "grad_norm": 2.260006904602051,
912
+ "learning_rate": 3.03165347259385e-05,
913
+ "loss": 0.1514,
914
+ "step": 64000
915
+ },
916
+ {
917
+ "epoch": 1.1902345407909063,
918
+ "grad_norm": 1.3348864316940308,
919
+ "learning_rate": 3.0162757653484897e-05,
920
+ "loss": 0.1436,
921
+ "step": 64500
922
+ },
923
+ {
924
+ "epoch": 1.1994611651381226,
925
+ "grad_norm": 5.925819396972656,
926
+ "learning_rate": 3.0008980581031292e-05,
927
+ "loss": 0.1521,
928
+ "step": 65000
929
+ },
930
+ {
931
+ "epoch": 1.208687789485339,
932
+ "grad_norm": 4.659446716308594,
933
+ "learning_rate": 2.9855203508577684e-05,
934
+ "loss": 0.1434,
935
+ "step": 65500
936
+ },
937
+ {
938
+ "epoch": 1.2179144138325553,
939
+ "grad_norm": 4.0146164894104,
940
+ "learning_rate": 2.970142643612408e-05,
941
+ "loss": 0.1503,
942
+ "step": 66000
943
+ },
944
+ {
945
+ "epoch": 1.2271410381797716,
946
+ "grad_norm": 1.715017557144165,
947
+ "learning_rate": 2.9547649363670478e-05,
948
+ "loss": 0.1499,
949
+ "step": 66500
950
+ },
951
+ {
952
+ "epoch": 1.236367662526988,
953
+ "grad_norm": 4.178813457489014,
954
+ "learning_rate": 2.9393872291216866e-05,
955
+ "loss": 0.1504,
956
+ "step": 67000
957
+ },
958
+ {
959
+ "epoch": 1.2455942868742043,
960
+ "grad_norm": 2.155510663986206,
961
+ "learning_rate": 2.9240095218763265e-05,
962
+ "loss": 0.1423,
963
+ "step": 67500
964
+ },
965
+ {
966
+ "epoch": 1.2548209112214206,
967
+ "grad_norm": 1.8401468992233276,
968
+ "learning_rate": 2.908631814630966e-05,
969
+ "loss": 0.1534,
970
+ "step": 68000
971
+ },
972
+ {
973
+ "epoch": 1.264047535568637,
974
+ "grad_norm": 3.5961029529571533,
975
+ "learning_rate": 2.8932541073856055e-05,
976
+ "loss": 0.1422,
977
+ "step": 68500
978
+ },
979
+ {
980
+ "epoch": 1.2732741599158532,
981
+ "grad_norm": 2.855060338973999,
982
+ "learning_rate": 2.8778764001402447e-05,
983
+ "loss": 0.1497,
984
+ "step": 69000
985
+ },
986
+ {
987
+ "epoch": 1.2825007842630696,
988
+ "grad_norm": 2.705552816390991,
989
+ "learning_rate": 2.8624986928948842e-05,
990
+ "loss": 0.1482,
991
+ "step": 69500
992
+ },
993
+ {
994
+ "epoch": 1.291727408610286,
995
+ "grad_norm": 3.748999834060669,
996
+ "learning_rate": 2.847120985649524e-05,
997
+ "loss": 0.1516,
998
+ "step": 70000
999
+ },
1000
+ {
1001
+ "epoch": 1.3009540329575022,
1002
+ "grad_norm": 2.6836044788360596,
1003
+ "learning_rate": 2.831743278404163e-05,
1004
+ "loss": 0.1476,
1005
+ "step": 70500
1006
+ },
1007
+ {
1008
+ "epoch": 1.3101806573047186,
1009
+ "grad_norm": 1.9708038568496704,
1010
+ "learning_rate": 2.8163655711588028e-05,
1011
+ "loss": 0.1469,
1012
+ "step": 71000
1013
+ },
1014
+ {
1015
+ "epoch": 1.3194072816519349,
1016
+ "grad_norm": 2.0082767009735107,
1017
+ "learning_rate": 2.8009878639134424e-05,
1018
+ "loss": 0.1473,
1019
+ "step": 71500
1020
+ },
1021
+ {
1022
+ "epoch": 1.3286339059991512,
1023
+ "grad_norm": 5.9193830490112305,
1024
+ "learning_rate": 2.7856101566680815e-05,
1025
+ "loss": 0.148,
1026
+ "step": 72000
1027
+ },
1028
+ {
1029
+ "epoch": 1.3378605303463675,
1030
+ "grad_norm": 2.226789951324463,
1031
+ "learning_rate": 2.770232449422721e-05,
1032
+ "loss": 0.1479,
1033
+ "step": 72500
1034
+ },
1035
+ {
1036
+ "epoch": 1.3470871546935839,
1037
+ "grad_norm": 2.320139169692993,
1038
+ "learning_rate": 2.7548547421773606e-05,
1039
+ "loss": 0.141,
1040
+ "step": 73000
1041
+ },
1042
+ {
1043
+ "epoch": 1.3563137790408002,
1044
+ "grad_norm": 1.762904405593872,
1045
+ "learning_rate": 2.7394770349319998e-05,
1046
+ "loss": 0.143,
1047
+ "step": 73500
1048
+ },
1049
+ {
1050
+ "epoch": 1.3655404033880165,
1051
+ "grad_norm": 1.4634217023849487,
1052
+ "learning_rate": 2.7240993276866393e-05,
1053
+ "loss": 0.1417,
1054
+ "step": 74000
1055
+ },
1056
+ {
1057
+ "epoch": 1.3747670277352328,
1058
+ "grad_norm": 1.4410927295684814,
1059
+ "learning_rate": 2.7087216204412792e-05,
1060
+ "loss": 0.1417,
1061
+ "step": 74500
1062
+ },
1063
+ {
1064
+ "epoch": 1.3839936520824492,
1065
+ "grad_norm": 2.7735280990600586,
1066
+ "learning_rate": 2.693343913195918e-05,
1067
+ "loss": 0.1439,
1068
+ "step": 75000
1069
+ },
1070
+ {
1071
+ "epoch": 1.3932202764296655,
1072
+ "grad_norm": 2.384705066680908,
1073
+ "learning_rate": 2.677966205950558e-05,
1074
+ "loss": 0.1437,
1075
+ "step": 75500
1076
+ },
1077
+ {
1078
+ "epoch": 1.4024469007768818,
1079
+ "grad_norm": 3.4809861183166504,
1080
+ "learning_rate": 2.6625884987051974e-05,
1081
+ "loss": 0.1408,
1082
+ "step": 76000
1083
+ },
1084
+ {
1085
+ "epoch": 1.4116735251240982,
1086
+ "grad_norm": 2.29471492767334,
1087
+ "learning_rate": 2.6472107914598366e-05,
1088
+ "loss": 0.1459,
1089
+ "step": 76500
1090
+ },
1091
+ {
1092
+ "epoch": 1.4209001494713145,
1093
+ "grad_norm": 3.0202510356903076,
1094
+ "learning_rate": 2.631833084214476e-05,
1095
+ "loss": 0.1402,
1096
+ "step": 77000
1097
+ },
1098
+ {
1099
+ "epoch": 1.4301267738185308,
1100
+ "grad_norm": 2.7061448097229004,
1101
+ "learning_rate": 2.6164553769691157e-05,
1102
+ "loss": 0.1408,
1103
+ "step": 77500
1104
+ },
1105
+ {
1106
+ "epoch": 1.4393533981657471,
1107
+ "grad_norm": 1.499624252319336,
1108
+ "learning_rate": 2.601077669723755e-05,
1109
+ "loss": 0.1443,
1110
+ "step": 78000
1111
+ },
1112
+ {
1113
+ "epoch": 1.4485800225129635,
1114
+ "grad_norm": 8.131513595581055,
1115
+ "learning_rate": 2.5856999624783944e-05,
1116
+ "loss": 0.1374,
1117
+ "step": 78500
1118
+ },
1119
+ {
1120
+ "epoch": 1.4578066468601798,
1121
+ "grad_norm": 1.652654767036438,
1122
+ "learning_rate": 2.5703222552330342e-05,
1123
+ "loss": 0.1401,
1124
+ "step": 79000
1125
+ },
1126
+ {
1127
+ "epoch": 1.4670332712073961,
1128
+ "grad_norm": 2.2545433044433594,
1129
+ "learning_rate": 2.554944547987673e-05,
1130
+ "loss": 0.1388,
1131
+ "step": 79500
1132
+ },
1133
+ {
1134
+ "epoch": 1.4762598955546125,
1135
+ "grad_norm": 2.1318209171295166,
1136
+ "learning_rate": 2.539566840742313e-05,
1137
+ "loss": 0.1434,
1138
+ "step": 80000
1139
+ },
1140
+ {
1141
+ "epoch": 1.4854865199018288,
1142
+ "grad_norm": 1.8352861404418945,
1143
+ "learning_rate": 2.5241891334969525e-05,
1144
+ "loss": 0.142,
1145
+ "step": 80500
1146
+ },
1147
+ {
1148
+ "epoch": 1.4947131442490451,
1149
+ "grad_norm": 2.1764025688171387,
1150
+ "learning_rate": 2.5088114262515917e-05,
1151
+ "loss": 0.1366,
1152
+ "step": 81000
1153
+ },
1154
+ {
1155
+ "epoch": 1.5039397685962612,
1156
+ "grad_norm": 2.425063371658325,
1157
+ "learning_rate": 2.4934337190062312e-05,
1158
+ "loss": 0.1435,
1159
+ "step": 81500
1160
+ },
1161
+ {
1162
+ "epoch": 1.5131663929434778,
1163
+ "grad_norm": 1.579362154006958,
1164
+ "learning_rate": 2.4780560117608707e-05,
1165
+ "loss": 0.1383,
1166
+ "step": 82000
1167
+ },
1168
+ {
1169
+ "epoch": 1.5223930172906939,
1170
+ "grad_norm": 1.9185165166854858,
1171
+ "learning_rate": 2.46267830451551e-05,
1172
+ "loss": 0.1357,
1173
+ "step": 82500
1174
+ },
1175
+ {
1176
+ "epoch": 1.5316196416379104,
1177
+ "grad_norm": 1.506785273551941,
1178
+ "learning_rate": 2.4473005972701498e-05,
1179
+ "loss": 0.1366,
1180
+ "step": 83000
1181
+ },
1182
+ {
1183
+ "epoch": 1.5408462659851265,
1184
+ "grad_norm": 2.999217987060547,
1185
+ "learning_rate": 2.431922890024789e-05,
1186
+ "loss": 0.1374,
1187
+ "step": 83500
1188
+ },
1189
+ {
1190
+ "epoch": 1.550072890332343,
1191
+ "grad_norm": 1.4639360904693604,
1192
+ "learning_rate": 2.4165451827794285e-05,
1193
+ "loss": 0.1339,
1194
+ "step": 84000
1195
+ },
1196
+ {
1197
+ "epoch": 1.5592995146795592,
1198
+ "grad_norm": 3.4754111766815186,
1199
+ "learning_rate": 2.401167475534068e-05,
1200
+ "loss": 0.1288,
1201
+ "step": 84500
1202
+ },
1203
+ {
1204
+ "epoch": 1.5685261390267757,
1205
+ "grad_norm": 2.0212953090667725,
1206
+ "learning_rate": 2.3857897682887072e-05,
1207
+ "loss": 0.1379,
1208
+ "step": 85000
1209
+ },
1210
+ {
1211
+ "epoch": 1.5777527633739918,
1212
+ "grad_norm": 14.00969409942627,
1213
+ "learning_rate": 2.3704120610433467e-05,
1214
+ "loss": 0.135,
1215
+ "step": 85500
1216
+ },
1217
+ {
1218
+ "epoch": 1.5869793877212084,
1219
+ "grad_norm": 2.084036111831665,
1220
+ "learning_rate": 2.3550343537979862e-05,
1221
+ "loss": 0.1406,
1222
+ "step": 86000
1223
+ },
1224
+ {
1225
+ "epoch": 1.5962060120684245,
1226
+ "grad_norm": 1.8672277927398682,
1227
+ "learning_rate": 2.3396566465526258e-05,
1228
+ "loss": 0.131,
1229
+ "step": 86500
1230
+ },
1231
+ {
1232
+ "epoch": 1.605432636415641,
1233
+ "grad_norm": 1.3933255672454834,
1234
+ "learning_rate": 2.324278939307265e-05,
1235
+ "loss": 0.1346,
1236
+ "step": 87000
1237
+ },
1238
+ {
1239
+ "epoch": 1.6146592607628572,
1240
+ "grad_norm": 4.199204921722412,
1241
+ "learning_rate": 2.3089012320619048e-05,
1242
+ "loss": 0.1345,
1243
+ "step": 87500
1244
+ },
1245
+ {
1246
+ "epoch": 1.6238858851100737,
1247
+ "grad_norm": 2.914705276489258,
1248
+ "learning_rate": 2.293523524816544e-05,
1249
+ "loss": 0.1331,
1250
+ "step": 88000
1251
+ },
1252
+ {
1253
+ "epoch": 1.6331125094572898,
1254
+ "grad_norm": 2.8266611099243164,
1255
+ "learning_rate": 2.2781458175711835e-05,
1256
+ "loss": 0.1331,
1257
+ "step": 88500
1258
+ },
1259
+ {
1260
+ "epoch": 1.6423391338045064,
1261
+ "grad_norm": 2.148892402648926,
1262
+ "learning_rate": 2.262768110325823e-05,
1263
+ "loss": 0.1353,
1264
+ "step": 89000
1265
+ },
1266
+ {
1267
+ "epoch": 1.6515657581517225,
1268
+ "grad_norm": 3.0781641006469727,
1269
+ "learning_rate": 2.2473904030804623e-05,
1270
+ "loss": 0.1312,
1271
+ "step": 89500
1272
+ },
1273
+ {
1274
+ "epoch": 1.660792382498939,
1275
+ "grad_norm": 1.3129165172576904,
1276
+ "learning_rate": 2.2320126958351018e-05,
1277
+ "loss": 0.1287,
1278
+ "step": 90000
1279
+ },
1280
+ {
1281
+ "epoch": 1.6700190068461551,
1282
+ "grad_norm": 2.6767327785491943,
1283
+ "learning_rate": 2.2166349885897413e-05,
1284
+ "loss": 0.1307,
1285
+ "step": 90500
1286
+ },
1287
+ {
1288
+ "epoch": 1.6792456311933717,
1289
+ "grad_norm": 2.783486843109131,
1290
+ "learning_rate": 2.201257281344381e-05,
1291
+ "loss": 0.1307,
1292
+ "step": 91000
1293
+ },
1294
+ {
1295
+ "epoch": 1.6884722555405878,
1296
+ "grad_norm": 4.483890056610107,
1297
+ "learning_rate": 2.18587957409902e-05,
1298
+ "loss": 0.1311,
1299
+ "step": 91500
1300
+ },
1301
+ {
1302
+ "epoch": 1.6976988798878043,
1303
+ "grad_norm": 2.766557216644287,
1304
+ "learning_rate": 2.17050186685366e-05,
1305
+ "loss": 0.1327,
1306
+ "step": 92000
1307
+ },
1308
+ {
1309
+ "epoch": 1.7069255042350204,
1310
+ "grad_norm": 3.863123893737793,
1311
+ "learning_rate": 2.155124159608299e-05,
1312
+ "loss": 0.1358,
1313
+ "step": 92500
1314
+ },
1315
+ {
1316
+ "epoch": 1.716152128582237,
1317
+ "grad_norm": 3.8993873596191406,
1318
+ "learning_rate": 2.1397464523629386e-05,
1319
+ "loss": 0.1306,
1320
+ "step": 93000
1321
+ },
1322
+ {
1323
+ "epoch": 1.725378752929453,
1324
+ "grad_norm": 3.616542100906372,
1325
+ "learning_rate": 2.124368745117578e-05,
1326
+ "loss": 0.1306,
1327
+ "step": 93500
1328
+ },
1329
+ {
1330
+ "epoch": 1.7346053772766696,
1331
+ "grad_norm": 2.784503698348999,
1332
+ "learning_rate": 2.1089910378722173e-05,
1333
+ "loss": 0.1316,
1334
+ "step": 94000
1335
+ },
1336
+ {
1337
+ "epoch": 1.7438320016238857,
1338
+ "grad_norm": 2.199709415435791,
1339
+ "learning_rate": 2.093613330626857e-05,
1340
+ "loss": 0.13,
1341
+ "step": 94500
1342
+ },
1343
+ {
1344
+ "epoch": 1.7530586259711023,
1345
+ "grad_norm": 1.9818087816238403,
1346
+ "learning_rate": 2.0782356233814964e-05,
1347
+ "loss": 0.1308,
1348
+ "step": 95000
1349
+ },
1350
+ {
1351
+ "epoch": 1.7622852503183184,
1352
+ "grad_norm": 1.3748022317886353,
1353
+ "learning_rate": 2.062857916136136e-05,
1354
+ "loss": 0.1279,
1355
+ "step": 95500
1356
+ },
1357
+ {
1358
+ "epoch": 1.771511874665535,
1359
+ "grad_norm": 2.4911797046661377,
1360
+ "learning_rate": 2.047480208890775e-05,
1361
+ "loss": 0.1287,
1362
+ "step": 96000
1363
+ },
1364
+ {
1365
+ "epoch": 1.780738499012751,
1366
+ "grad_norm": 2.5785412788391113,
1367
+ "learning_rate": 2.032102501645415e-05,
1368
+ "loss": 0.1293,
1369
+ "step": 96500
1370
+ },
1371
+ {
1372
+ "epoch": 1.7899651233599676,
1373
+ "grad_norm": 3.9389474391937256,
1374
+ "learning_rate": 2.016724794400054e-05,
1375
+ "loss": 0.1276,
1376
+ "step": 97000
1377
+ },
1378
+ {
1379
+ "epoch": 1.7991917477071837,
1380
+ "grad_norm": 3.8254497051239014,
1381
+ "learning_rate": 2.0013470871546937e-05,
1382
+ "loss": 0.1259,
1383
+ "step": 97500
1384
+ },
1385
+ {
1386
+ "epoch": 1.8084183720544003,
1387
+ "grad_norm": 2.5958099365234375,
1388
+ "learning_rate": 1.9859693799093332e-05,
1389
+ "loss": 0.1264,
1390
+ "step": 98000
1391
+ },
1392
+ {
1393
+ "epoch": 1.8176449964016164,
1394
+ "grad_norm": 5.190915107727051,
1395
+ "learning_rate": 1.9705916726639727e-05,
1396
+ "loss": 0.1288,
1397
+ "step": 98500
1398
+ },
1399
+ {
1400
+ "epoch": 1.826871620748833,
1401
+ "grad_norm": 1.9603300094604492,
1402
+ "learning_rate": 1.955213965418612e-05,
1403
+ "loss": 0.1271,
1404
+ "step": 99000
1405
+ },
1406
+ {
1407
+ "epoch": 1.836098245096049,
1408
+ "grad_norm": 2.722358226776123,
1409
+ "learning_rate": 1.9398362581732514e-05,
1410
+ "loss": 0.1278,
1411
+ "step": 99500
1412
+ },
1413
+ {
1414
+ "epoch": 1.8453248694432656,
1415
+ "grad_norm": 1.6586706638336182,
1416
+ "learning_rate": 1.924458550927891e-05,
1417
+ "loss": 0.1248,
1418
+ "step": 100000
1419
+ },
1420
+ {
1421
+ "epoch": 1.8545514937904817,
1422
+ "grad_norm": 2.985854148864746,
1423
+ "learning_rate": 1.90908084368253e-05,
1424
+ "loss": 0.1266,
1425
+ "step": 100500
1426
+ },
1427
+ {
1428
+ "epoch": 1.8637781181376982,
1429
+ "grad_norm": 2.7211902141571045,
1430
+ "learning_rate": 1.89370313643717e-05,
1431
+ "loss": 0.1267,
1432
+ "step": 101000
1433
+ },
1434
+ {
1435
+ "epoch": 1.8730047424849143,
1436
+ "grad_norm": 2.373112678527832,
1437
+ "learning_rate": 1.8783254291918092e-05,
1438
+ "loss": 0.1239,
1439
+ "step": 101500
1440
+ },
1441
+ {
1442
+ "epoch": 1.8822313668321309,
1443
+ "grad_norm": 2.1845340728759766,
1444
+ "learning_rate": 1.862947721946449e-05,
1445
+ "loss": 0.1259,
1446
+ "step": 102000
1447
+ },
1448
+ {
1449
+ "epoch": 1.891457991179347,
1450
+ "grad_norm": 2.6702089309692383,
1451
+ "learning_rate": 1.8475700147010882e-05,
1452
+ "loss": 0.1267,
1453
+ "step": 102500
1454
+ },
1455
+ {
1456
+ "epoch": 1.9006846155265635,
1457
+ "grad_norm": 1.2957886457443237,
1458
+ "learning_rate": 1.8321923074557278e-05,
1459
+ "loss": 0.1231,
1460
+ "step": 103000
1461
+ },
1462
+ {
1463
+ "epoch": 1.9099112398737796,
1464
+ "grad_norm": 2.2960615158081055,
1465
+ "learning_rate": 1.8168146002103673e-05,
1466
+ "loss": 0.1242,
1467
+ "step": 103500
1468
+ },
1469
+ {
1470
+ "epoch": 1.9191378642209962,
1471
+ "grad_norm": 1.4060367345809937,
1472
+ "learning_rate": 1.8014368929650065e-05,
1473
+ "loss": 0.1217,
1474
+ "step": 104000
1475
+ },
1476
+ {
1477
+ "epoch": 1.9283644885682123,
1478
+ "grad_norm": 1.8247722387313843,
1479
+ "learning_rate": 1.786059185719646e-05,
1480
+ "loss": 0.1247,
1481
+ "step": 104500
1482
+ },
1483
+ {
1484
+ "epoch": 1.9375911129154288,
1485
+ "grad_norm": 4.583653450012207,
1486
+ "learning_rate": 1.7706814784742855e-05,
1487
+ "loss": 0.1224,
1488
+ "step": 105000
1489
+ },
1490
+ {
1491
+ "epoch": 1.946817737262645,
1492
+ "grad_norm": 1.7650556564331055,
1493
+ "learning_rate": 1.755303771228925e-05,
1494
+ "loss": 0.1235,
1495
+ "step": 105500
1496
+ },
1497
+ {
1498
+ "epoch": 1.9560443616098615,
1499
+ "grad_norm": 2.088684320449829,
1500
+ "learning_rate": 1.7399260639835643e-05,
1501
+ "loss": 0.1203,
1502
+ "step": 106000
1503
+ },
1504
+ {
1505
+ "epoch": 1.9652709859570776,
1506
+ "grad_norm": 2.448063850402832,
1507
+ "learning_rate": 1.724548356738204e-05,
1508
+ "loss": 0.1209,
1509
+ "step": 106500
1510
+ },
1511
+ {
1512
+ "epoch": 1.9744976103042942,
1513
+ "grad_norm": 4.1177778244018555,
1514
+ "learning_rate": 1.7091706494928433e-05,
1515
+ "loss": 0.1188,
1516
+ "step": 107000
1517
+ },
1518
+ {
1519
+ "epoch": 1.9837242346515103,
1520
+ "grad_norm": 4.088508129119873,
1521
+ "learning_rate": 1.693792942247483e-05,
1522
+ "loss": 0.1206,
1523
+ "step": 107500
1524
+ },
1525
+ {
1526
+ "epoch": 1.9929508589987268,
1527
+ "grad_norm": 2.3093175888061523,
1528
+ "learning_rate": 1.6784152350021224e-05,
1529
+ "loss": 0.1186,
1530
+ "step": 108000
1531
+ },
1532
+ {
1533
+ "epoch": 2.0,
1534
+ "eval_loss": 0.2138589769601822,
1535
+ "eval_mse": 0.21385898989204177,
1536
+ "eval_runtime": 125.4291,
1537
+ "eval_samples_per_second": 1536.134,
1538
+ "eval_steps_per_second": 192.021,
1539
+ "step": 108382
1540
+ },
1541
+ {
1542
+ "epoch": 2.002177483345943,
1543
+ "grad_norm": 2.2715179920196533,
1544
+ "learning_rate": 1.6630375277567615e-05,
1545
+ "loss": 0.114,
1546
+ "step": 108500
1547
+ },
1548
+ {
1549
+ "epoch": 2.0114041076931595,
1550
+ "grad_norm": 2.236180543899536,
1551
+ "learning_rate": 1.647659820511401e-05,
1552
+ "loss": 0.0994,
1553
+ "step": 109000
1554
+ },
1555
+ {
1556
+ "epoch": 2.0206307320403756,
1557
+ "grad_norm": 2.335440158843994,
1558
+ "learning_rate": 1.6322821132660406e-05,
1559
+ "loss": 0.0975,
1560
+ "step": 109500
1561
+ },
1562
+ {
1563
+ "epoch": 2.029857356387592,
1564
+ "grad_norm": 4.5400519371032715,
1565
+ "learning_rate": 1.61690440602068e-05,
1566
+ "loss": 0.0972,
1567
+ "step": 110000
1568
+ },
1569
+ {
1570
+ "epoch": 2.0390839807348082,
1571
+ "grad_norm": 2.633301258087158,
1572
+ "learning_rate": 1.6015266987753193e-05,
1573
+ "loss": 0.0982,
1574
+ "step": 110500
1575
+ },
1576
+ {
1577
+ "epoch": 2.048310605082025,
1578
+ "grad_norm": 1.150661826133728,
1579
+ "learning_rate": 1.5861489915299592e-05,
1580
+ "loss": 0.0999,
1581
+ "step": 111000
1582
+ },
1583
+ {
1584
+ "epoch": 2.057537229429241,
1585
+ "grad_norm": 1.9149357080459595,
1586
+ "learning_rate": 1.5707712842845984e-05,
1587
+ "loss": 0.0979,
1588
+ "step": 111500
1589
+ },
1590
+ {
1591
+ "epoch": 2.0667638537764574,
1592
+ "grad_norm": 1.996846079826355,
1593
+ "learning_rate": 1.555393577039238e-05,
1594
+ "loss": 0.0996,
1595
+ "step": 112000
1596
+ },
1597
+ {
1598
+ "epoch": 2.0759904781236735,
1599
+ "grad_norm": 1.5708836317062378,
1600
+ "learning_rate": 1.5400158697938774e-05,
1601
+ "loss": 0.0969,
1602
+ "step": 112500
1603
+ },
1604
+ {
1605
+ "epoch": 2.08521710247089,
1606
+ "grad_norm": 1.5404409170150757,
1607
+ "learning_rate": 1.5246381625485168e-05,
1608
+ "loss": 0.0963,
1609
+ "step": 113000
1610
+ },
1611
+ {
1612
+ "epoch": 2.094443726818106,
1613
+ "grad_norm": 1.6409614086151123,
1614
+ "learning_rate": 1.5092604553031561e-05,
1615
+ "loss": 0.0977,
1616
+ "step": 113500
1617
+ },
1618
+ {
1619
+ "epoch": 2.1036703511653227,
1620
+ "grad_norm": 1.7960460186004639,
1621
+ "learning_rate": 1.4938827480577958e-05,
1622
+ "loss": 0.0964,
1623
+ "step": 114000
1624
+ },
1625
+ {
1626
+ "epoch": 2.112896975512539,
1627
+ "grad_norm": 1.685120701789856,
1628
+ "learning_rate": 1.4785050408124352e-05,
1629
+ "loss": 0.0989,
1630
+ "step": 114500
1631
+ },
1632
+ {
1633
+ "epoch": 2.1221235998597554,
1634
+ "grad_norm": 3.500861644744873,
1635
+ "learning_rate": 1.4631273335670745e-05,
1636
+ "loss": 0.0943,
1637
+ "step": 115000
1638
+ },
1639
+ {
1640
+ "epoch": 2.1313502242069715,
1641
+ "grad_norm": 2.3654606342315674,
1642
+ "learning_rate": 1.447749626321714e-05,
1643
+ "loss": 0.0963,
1644
+ "step": 115500
1645
+ },
1646
+ {
1647
+ "epoch": 2.140576848554188,
1648
+ "grad_norm": 3.000051975250244,
1649
+ "learning_rate": 1.4323719190763534e-05,
1650
+ "loss": 0.0922,
1651
+ "step": 116000
1652
+ },
1653
+ {
1654
+ "epoch": 2.149803472901404,
1655
+ "grad_norm": 2.384732961654663,
1656
+ "learning_rate": 1.4169942118309928e-05,
1657
+ "loss": 0.0971,
1658
+ "step": 116500
1659
+ },
1660
+ {
1661
+ "epoch": 2.1590300972486207,
1662
+ "grad_norm": 1.3965630531311035,
1663
+ "learning_rate": 1.4016165045856325e-05,
1664
+ "loss": 0.0971,
1665
+ "step": 117000
1666
+ },
1667
+ {
1668
+ "epoch": 2.168256721595837,
1669
+ "grad_norm": 1.745569109916687,
1670
+ "learning_rate": 1.3862387973402718e-05,
1671
+ "loss": 0.0977,
1672
+ "step": 117500
1673
+ },
1674
+ {
1675
+ "epoch": 2.1774833459430534,
1676
+ "grad_norm": 2.326707363128662,
1677
+ "learning_rate": 1.3708610900949112e-05,
1678
+ "loss": 0.0937,
1679
+ "step": 118000
1680
+ },
1681
+ {
1682
+ "epoch": 2.1867099702902695,
1683
+ "grad_norm": 1.6542750597000122,
1684
+ "learning_rate": 1.3554833828495509e-05,
1685
+ "loss": 0.0943,
1686
+ "step": 118500
1687
+ },
1688
+ {
1689
+ "epoch": 2.195936594637486,
1690
+ "grad_norm": 1.1322625875473022,
1691
+ "learning_rate": 1.3401056756041902e-05,
1692
+ "loss": 0.0937,
1693
+ "step": 119000
1694
+ },
1695
+ {
1696
+ "epoch": 2.205163218984702,
1697
+ "grad_norm": 1.815834641456604,
1698
+ "learning_rate": 1.3247279683588296e-05,
1699
+ "loss": 0.0938,
1700
+ "step": 119500
1701
+ },
1702
+ {
1703
+ "epoch": 2.2143898433319187,
1704
+ "grad_norm": 4.64595890045166,
1705
+ "learning_rate": 1.3093502611134691e-05,
1706
+ "loss": 0.0939,
1707
+ "step": 120000
1708
+ },
1709
+ {
1710
+ "epoch": 2.223616467679135,
1711
+ "grad_norm": 2.1671462059020996,
1712
+ "learning_rate": 1.2939725538681085e-05,
1713
+ "loss": 0.093,
1714
+ "step": 120500
1715
+ },
1716
+ {
1717
+ "epoch": 2.2328430920263513,
1718
+ "grad_norm": 1.636570692062378,
1719
+ "learning_rate": 1.2785948466227478e-05,
1720
+ "loss": 0.0928,
1721
+ "step": 121000
1722
+ },
1723
+ {
1724
+ "epoch": 2.2420697163735674,
1725
+ "grad_norm": 3.4394800662994385,
1726
+ "learning_rate": 1.2632171393773875e-05,
1727
+ "loss": 0.0936,
1728
+ "step": 121500
1729
+ },
1730
+ {
1731
+ "epoch": 2.251296340720784,
1732
+ "grad_norm": 2.013307571411133,
1733
+ "learning_rate": 1.2478394321320269e-05,
1734
+ "loss": 0.0954,
1735
+ "step": 122000
1736
+ },
1737
+ {
1738
+ "epoch": 2.260522965068,
1739
+ "grad_norm": 3.2544264793395996,
1740
+ "learning_rate": 1.2324617248866664e-05,
1741
+ "loss": 0.0987,
1742
+ "step": 122500
1743
+ },
1744
+ {
1745
+ "epoch": 2.2697495894152167,
1746
+ "grad_norm": 2.9892079830169678,
1747
+ "learning_rate": 1.2170840176413058e-05,
1748
+ "loss": 0.0931,
1749
+ "step": 123000
1750
+ },
1751
+ {
1752
+ "epoch": 2.2789762137624328,
1753
+ "grad_norm": 3.113938331604004,
1754
+ "learning_rate": 1.2017063103959453e-05,
1755
+ "loss": 0.0945,
1756
+ "step": 123500
1757
+ },
1758
+ {
1759
+ "epoch": 2.2882028381096493,
1760
+ "grad_norm": 1.7884827852249146,
1761
+ "learning_rate": 1.1863286031505848e-05,
1762
+ "loss": 0.0935,
1763
+ "step": 124000
1764
+ },
1765
+ {
1766
+ "epoch": 2.2974294624568654,
1767
+ "grad_norm": 2.059272527694702,
1768
+ "learning_rate": 1.1709508959052242e-05,
1769
+ "loss": 0.0962,
1770
+ "step": 124500
1771
+ },
1772
+ {
1773
+ "epoch": 2.306656086804082,
1774
+ "grad_norm": 1.7323048114776611,
1775
+ "learning_rate": 1.1555731886598637e-05,
1776
+ "loss": 0.0928,
1777
+ "step": 125000
1778
+ },
1779
+ {
1780
+ "epoch": 2.315882711151298,
1781
+ "grad_norm": 1.6812376976013184,
1782
+ "learning_rate": 1.140195481414503e-05,
1783
+ "loss": 0.0918,
1784
+ "step": 125500
1785
+ },
1786
+ {
1787
+ "epoch": 2.3251093354985146,
1788
+ "grad_norm": 1.550013780593872,
1789
+ "learning_rate": 1.1248177741691424e-05,
1790
+ "loss": 0.0944,
1791
+ "step": 126000
1792
+ },
1793
+ {
1794
+ "epoch": 2.3343359598457307,
1795
+ "grad_norm": 2.913409948348999,
1796
+ "learning_rate": 1.109440066923782e-05,
1797
+ "loss": 0.0957,
1798
+ "step": 126500
1799
+ },
1800
+ {
1801
+ "epoch": 2.3435625841929473,
1802
+ "grad_norm": 1.515856146812439,
1803
+ "learning_rate": 1.0940623596784215e-05,
1804
+ "loss": 0.0929,
1805
+ "step": 127000
1806
+ },
1807
+ {
1808
+ "epoch": 2.3527892085401634,
1809
+ "grad_norm": 1.571866512298584,
1810
+ "learning_rate": 1.0786846524330608e-05,
1811
+ "loss": 0.0925,
1812
+ "step": 127500
1813
+ },
1814
+ {
1815
+ "epoch": 2.36201583288738,
1816
+ "grad_norm": 2.379932403564453,
1817
+ "learning_rate": 1.0633069451877004e-05,
1818
+ "loss": 0.0927,
1819
+ "step": 128000
1820
+ },
1821
+ {
1822
+ "epoch": 2.371242457234596,
1823
+ "grad_norm": 3.373950958251953,
1824
+ "learning_rate": 1.0479292379423399e-05,
1825
+ "loss": 0.0908,
1826
+ "step": 128500
1827
+ },
1828
+ {
1829
+ "epoch": 2.3804690815818126,
1830
+ "grad_norm": 2.3678219318389893,
1831
+ "learning_rate": 1.0325515306969792e-05,
1832
+ "loss": 0.0898,
1833
+ "step": 129000
1834
+ },
1835
+ {
1836
+ "epoch": 2.3896957059290287,
1837
+ "grad_norm": 2.636244058609009,
1838
+ "learning_rate": 1.0171738234516188e-05,
1839
+ "loss": 0.0892,
1840
+ "step": 129500
1841
+ },
1842
+ {
1843
+ "epoch": 2.3989223302762452,
1844
+ "grad_norm": 2.6495730876922607,
1845
+ "learning_rate": 1.0017961162062581e-05,
1846
+ "loss": 0.0886,
1847
+ "step": 130000
1848
+ },
1849
+ {
1850
+ "epoch": 2.4081489546234613,
1851
+ "grad_norm": 2.9955618381500244,
1852
+ "learning_rate": 9.864184089608975e-06,
1853
+ "loss": 0.0897,
1854
+ "step": 130500
1855
+ },
1856
+ {
1857
+ "epoch": 2.417375578970678,
1858
+ "grad_norm": 3.0076186656951904,
1859
+ "learning_rate": 9.71040701715537e-06,
1860
+ "loss": 0.0895,
1861
+ "step": 131000
1862
+ },
1863
+ {
1864
+ "epoch": 2.426602203317894,
1865
+ "grad_norm": 2.0592894554138184,
1866
+ "learning_rate": 9.556629944701765e-06,
1867
+ "loss": 0.0905,
1868
+ "step": 131500
1869
+ },
1870
+ {
1871
+ "epoch": 2.4358288276651106,
1872
+ "grad_norm": 1.5429611206054688,
1873
+ "learning_rate": 9.402852872248159e-06,
1874
+ "loss": 0.0937,
1875
+ "step": 132000
1876
+ },
1877
+ {
1878
+ "epoch": 2.4450554520123267,
1879
+ "grad_norm": 2.048470973968506,
1880
+ "learning_rate": 9.249075799794554e-06,
1881
+ "loss": 0.0903,
1882
+ "step": 132500
1883
+ },
1884
+ {
1885
+ "epoch": 2.454282076359543,
1886
+ "grad_norm": 1.8051766157150269,
1887
+ "learning_rate": 9.09529872734095e-06,
1888
+ "loss": 0.091,
1889
+ "step": 133000
1890
+ },
1891
+ {
1892
+ "epoch": 2.4635087007067593,
1893
+ "grad_norm": 1.5680794715881348,
1894
+ "learning_rate": 8.941521654887343e-06,
1895
+ "loss": 0.0892,
1896
+ "step": 133500
1897
+ },
1898
+ {
1899
+ "epoch": 2.472735325053976,
1900
+ "grad_norm": 1.979874610900879,
1901
+ "learning_rate": 8.787744582433738e-06,
1902
+ "loss": 0.0877,
1903
+ "step": 134000
1904
+ },
1905
+ {
1906
+ "epoch": 2.481961949401192,
1907
+ "grad_norm": 2.7211787700653076,
1908
+ "learning_rate": 8.633967509980134e-06,
1909
+ "loss": 0.0925,
1910
+ "step": 134500
1911
+ },
1912
+ {
1913
+ "epoch": 2.4911885737484085,
1914
+ "grad_norm": 1.0742968320846558,
1915
+ "learning_rate": 8.480190437526527e-06,
1916
+ "loss": 0.0881,
1917
+ "step": 135000
1918
+ },
1919
+ {
1920
+ "epoch": 2.5004151980956246,
1921
+ "grad_norm": 2.0518765449523926,
1922
+ "learning_rate": 8.32641336507292e-06,
1923
+ "loss": 0.0943,
1924
+ "step": 135500
1925
+ },
1926
+ {
1927
+ "epoch": 2.509641822442841,
1928
+ "grad_norm": 1.9672821760177612,
1929
+ "learning_rate": 8.172636292619316e-06,
1930
+ "loss": 0.0898,
1931
+ "step": 136000
1932
+ },
1933
+ {
1934
+ "epoch": 2.5188684467900573,
1935
+ "grad_norm": 1.2716307640075684,
1936
+ "learning_rate": 8.01885922016571e-06,
1937
+ "loss": 0.0875,
1938
+ "step": 136500
1939
+ },
1940
+ {
1941
+ "epoch": 2.528095071137274,
1942
+ "grad_norm": 2.617617607116699,
1943
+ "learning_rate": 7.865082147712105e-06,
1944
+ "loss": 0.0889,
1945
+ "step": 137000
1946
+ },
1947
+ {
1948
+ "epoch": 2.53732169548449,
1949
+ "grad_norm": 0.8945909738540649,
1950
+ "learning_rate": 7.7113050752585e-06,
1951
+ "loss": 0.0909,
1952
+ "step": 137500
1953
+ },
1954
+ {
1955
+ "epoch": 2.5465483198317065,
1956
+ "grad_norm": 1.661537766456604,
1957
+ "learning_rate": 7.557528002804894e-06,
1958
+ "loss": 0.0878,
1959
+ "step": 138000
1960
+ },
1961
+ {
1962
+ "epoch": 2.5557749441789226,
1963
+ "grad_norm": 3.6078097820281982,
1964
+ "learning_rate": 7.403750930351289e-06,
1965
+ "loss": 0.0903,
1966
+ "step": 138500
1967
+ },
1968
+ {
1969
+ "epoch": 2.565001568526139,
1970
+ "grad_norm": 1.483906626701355,
1971
+ "learning_rate": 7.249973857897683e-06,
1972
+ "loss": 0.0862,
1973
+ "step": 139000
1974
+ },
1975
+ {
1976
+ "epoch": 2.5742281928733552,
1977
+ "grad_norm": 1.867789626121521,
1978
+ "learning_rate": 7.096196785444077e-06,
1979
+ "loss": 0.0891,
1980
+ "step": 139500
1981
+ },
1982
+ {
1983
+ "epoch": 2.583454817220572,
1984
+ "grad_norm": 2.8336305618286133,
1985
+ "learning_rate": 6.942419712990472e-06,
1986
+ "loss": 0.0901,
1987
+ "step": 140000
1988
+ },
1989
+ {
1990
+ "epoch": 2.592681441567788,
1991
+ "grad_norm": 1.5188074111938477,
1992
+ "learning_rate": 6.7886426405368675e-06,
1993
+ "loss": 0.0903,
1994
+ "step": 140500
1995
+ },
1996
+ {
1997
+ "epoch": 2.6019080659150045,
1998
+ "grad_norm": 2.809237480163574,
1999
+ "learning_rate": 6.634865568083261e-06,
2000
+ "loss": 0.0892,
2001
+ "step": 141000
2002
+ },
2003
+ {
2004
+ "epoch": 2.6111346902622206,
2005
+ "grad_norm": 1.773245096206665,
2006
+ "learning_rate": 6.4810884956296555e-06,
2007
+ "loss": 0.0909,
2008
+ "step": 141500
2009
+ },
2010
+ {
2011
+ "epoch": 2.620361314609437,
2012
+ "grad_norm": 1.85002863407135,
2013
+ "learning_rate": 6.327311423176051e-06,
2014
+ "loss": 0.092,
2015
+ "step": 142000
2016
+ },
2017
+ {
2018
+ "epoch": 2.629587938956653,
2019
+ "grad_norm": 0.9777950048446655,
2020
+ "learning_rate": 6.173534350722445e-06,
2021
+ "loss": 0.0888,
2022
+ "step": 142500
2023
+ },
2024
+ {
2025
+ "epoch": 2.6388145633038698,
2026
+ "grad_norm": 2.261619806289673,
2027
+ "learning_rate": 6.0197572782688396e-06,
2028
+ "loss": 0.0879,
2029
+ "step": 143000
2030
+ },
2031
+ {
2032
+ "epoch": 2.648041187651086,
2033
+ "grad_norm": 2.093942642211914,
2034
+ "learning_rate": 5.865980205815234e-06,
2035
+ "loss": 0.0866,
2036
+ "step": 143500
2037
+ },
2038
+ {
2039
+ "epoch": 2.6572678119983024,
2040
+ "grad_norm": 3.01939058303833,
2041
+ "learning_rate": 5.712203133361628e-06,
2042
+ "loss": 0.0882,
2043
+ "step": 144000
2044
+ },
2045
+ {
2046
+ "epoch": 2.6664944363455185,
2047
+ "grad_norm": 2.6572530269622803,
2048
+ "learning_rate": 5.558426060908023e-06,
2049
+ "loss": 0.0889,
2050
+ "step": 144500
2051
+ },
2052
+ {
2053
+ "epoch": 2.675721060692735,
2054
+ "grad_norm": 1.3037127256393433,
2055
+ "learning_rate": 5.404648988454418e-06,
2056
+ "loss": 0.0907,
2057
+ "step": 145000
2058
+ },
2059
+ {
2060
+ "epoch": 2.684947685039951,
2061
+ "grad_norm": 1.2352185249328613,
2062
+ "learning_rate": 5.2508719160008125e-06,
2063
+ "loss": 0.0888,
2064
+ "step": 145500
2065
+ },
2066
+ {
2067
+ "epoch": 2.6941743093871677,
2068
+ "grad_norm": 1.6539493799209595,
2069
+ "learning_rate": 5.097094843547207e-06,
2070
+ "loss": 0.0886,
2071
+ "step": 146000
2072
+ },
2073
+ {
2074
+ "epoch": 2.703400933734384,
2075
+ "grad_norm": 1.900009036064148,
2076
+ "learning_rate": 4.943317771093601e-06,
2077
+ "loss": 0.0896,
2078
+ "step": 146500
2079
+ },
2080
+ {
2081
+ "epoch": 2.7126275580816004,
2082
+ "grad_norm": 1.3108474016189575,
2083
+ "learning_rate": 4.789540698639996e-06,
2084
+ "loss": 0.0874,
2085
+ "step": 147000
2086
+ },
2087
+ {
2088
+ "epoch": 2.7218541824288165,
2089
+ "grad_norm": 0.9704590439796448,
2090
+ "learning_rate": 4.63576362618639e-06,
2091
+ "loss": 0.087,
2092
+ "step": 147500
2093
+ },
2094
+ {
2095
+ "epoch": 2.731080806776033,
2096
+ "grad_norm": 1.0830601453781128,
2097
+ "learning_rate": 4.481986553732785e-06,
2098
+ "loss": 0.0881,
2099
+ "step": 148000
2100
+ },
2101
+ {
2102
+ "epoch": 2.740307431123249,
2103
+ "grad_norm": 1.5252071619033813,
2104
+ "learning_rate": 4.32820948127918e-06,
2105
+ "loss": 0.0892,
2106
+ "step": 148500
2107
+ },
2108
+ {
2109
+ "epoch": 2.7495340554704657,
2110
+ "grad_norm": 1.7691118717193604,
2111
+ "learning_rate": 4.174432408825573e-06,
2112
+ "loss": 0.0859,
2113
+ "step": 149000
2114
+ },
2115
+ {
2116
+ "epoch": 2.758760679817682,
2117
+ "grad_norm": 7.3577494621276855,
2118
+ "learning_rate": 4.020655336371969e-06,
2119
+ "loss": 0.0856,
2120
+ "step": 149500
2121
+ },
2122
+ {
2123
+ "epoch": 2.7679873041648984,
2124
+ "grad_norm": 1.1883918046951294,
2125
+ "learning_rate": 3.866878263918363e-06,
2126
+ "loss": 0.086,
2127
+ "step": 150000
2128
+ },
2129
+ {
2130
+ "epoch": 2.7772139285121145,
2131
+ "grad_norm": 2.320882797241211,
2132
+ "learning_rate": 3.7131011914647575e-06,
2133
+ "loss": 0.0868,
2134
+ "step": 150500
2135
+ },
2136
+ {
2137
+ "epoch": 2.786440552859331,
2138
+ "grad_norm": 2.119135618209839,
2139
+ "learning_rate": 3.5593241190111523e-06,
2140
+ "loss": 0.0837,
2141
+ "step": 151000
2142
+ },
2143
+ {
2144
+ "epoch": 2.795667177206547,
2145
+ "grad_norm": 2.0826363563537598,
2146
+ "learning_rate": 3.4055470465575468e-06,
2147
+ "loss": 0.0871,
2148
+ "step": 151500
2149
+ },
2150
+ {
2151
+ "epoch": 2.8048938015537637,
2152
+ "grad_norm": 1.4801201820373535,
2153
+ "learning_rate": 3.2517699741039408e-06,
2154
+ "loss": 0.0855,
2155
+ "step": 152000
2156
+ },
2157
+ {
2158
+ "epoch": 2.8141204259009798,
2159
+ "grad_norm": 3.352520227432251,
2160
+ "learning_rate": 3.0979929016503356e-06,
2161
+ "loss": 0.0907,
2162
+ "step": 152500
2163
+ },
2164
+ {
2165
+ "epoch": 2.8233470502481963,
2166
+ "grad_norm": 3.1500301361083984,
2167
+ "learning_rate": 2.94421582919673e-06,
2168
+ "loss": 0.0858,
2169
+ "step": 153000
2170
+ },
2171
+ {
2172
+ "epoch": 2.8325736745954124,
2173
+ "grad_norm": 1.9149506092071533,
2174
+ "learning_rate": 2.790438756743125e-06,
2175
+ "loss": 0.0856,
2176
+ "step": 153500
2177
+ },
2178
+ {
2179
+ "epoch": 2.841800298942629,
2180
+ "grad_norm": 2.150416612625122,
2181
+ "learning_rate": 2.6366616842895193e-06,
2182
+ "loss": 0.0849,
2183
+ "step": 154000
2184
+ },
2185
+ {
2186
+ "epoch": 2.851026923289845,
2187
+ "grad_norm": 1.613443374633789,
2188
+ "learning_rate": 2.4828846118359137e-06,
2189
+ "loss": 0.084,
2190
+ "step": 154500
2191
+ },
2192
+ {
2193
+ "epoch": 2.8602535476370616,
2194
+ "grad_norm": 4.109127998352051,
2195
+ "learning_rate": 2.3291075393823085e-06,
2196
+ "loss": 0.0859,
2197
+ "step": 155000
2198
+ },
2199
+ {
2200
+ "epoch": 2.8694801719842777,
2201
+ "grad_norm": 2.9541776180267334,
2202
+ "learning_rate": 2.175330466928703e-06,
2203
+ "loss": 0.0869,
2204
+ "step": 155500
2205
+ },
2206
+ {
2207
+ "epoch": 2.8787067963314943,
2208
+ "grad_norm": 2.9944493770599365,
2209
+ "learning_rate": 2.0215533944750974e-06,
2210
+ "loss": 0.0861,
2211
+ "step": 156000
2212
+ },
2213
+ {
2214
+ "epoch": 2.8879334206787104,
2215
+ "grad_norm": 2.072777271270752,
2216
+ "learning_rate": 1.867776322021492e-06,
2217
+ "loss": 0.084,
2218
+ "step": 156500
2219
+ },
2220
+ {
2221
+ "epoch": 2.897160045025927,
2222
+ "grad_norm": 2.4962828159332275,
2223
+ "learning_rate": 1.7139992495678866e-06,
2224
+ "loss": 0.0826,
2225
+ "step": 157000
2226
+ },
2227
+ {
2228
+ "epoch": 2.906386669373143,
2229
+ "grad_norm": 1.9871286153793335,
2230
+ "learning_rate": 1.560222177114281e-06,
2231
+ "loss": 0.086,
2232
+ "step": 157500
2233
+ },
2234
+ {
2235
+ "epoch": 2.9156132937203596,
2236
+ "grad_norm": 1.9906572103500366,
2237
+ "learning_rate": 1.4064451046606757e-06,
2238
+ "loss": 0.0838,
2239
+ "step": 158000
2240
+ },
2241
+ {
2242
+ "epoch": 2.9248399180675757,
2243
+ "grad_norm": 2.1322317123413086,
2244
+ "learning_rate": 1.25266803220707e-06,
2245
+ "loss": 0.0844,
2246
+ "step": 158500
2247
+ },
2248
+ {
2249
+ "epoch": 2.9340665424147923,
2250
+ "grad_norm": 2.26415753364563,
2251
+ "learning_rate": 1.0988909597534647e-06,
2252
+ "loss": 0.0839,
2253
+ "step": 159000
2254
+ },
2255
+ {
2256
+ "epoch": 2.9432931667620084,
2257
+ "grad_norm": 1.9201428890228271,
2258
+ "learning_rate": 9.451138872998592e-07,
2259
+ "loss": 0.0861,
2260
+ "step": 159500
2261
+ },
2262
+ {
2263
+ "epoch": 2.952519791109225,
2264
+ "grad_norm": 1.4225293397903442,
2265
+ "learning_rate": 7.913368148462537e-07,
2266
+ "loss": 0.0867,
2267
+ "step": 160000
2268
+ },
2269
+ {
2270
+ "epoch": 2.961746415456441,
2271
+ "grad_norm": 4.265661239624023,
2272
+ "learning_rate": 6.375597423926483e-07,
2273
+ "loss": 0.0864,
2274
+ "step": 160500
2275
+ },
2276
+ {
2277
+ "epoch": 2.9709730398036576,
2278
+ "grad_norm": 3.4527368545532227,
2279
+ "learning_rate": 4.837826699390428e-07,
2280
+ "loss": 0.0853,
2281
+ "step": 161000
2282
+ },
2283
+ {
2284
+ "epoch": 2.9801996641508737,
2285
+ "grad_norm": 1.618511438369751,
2286
+ "learning_rate": 3.300055974854373e-07,
2287
+ "loss": 0.0862,
2288
+ "step": 161500
2289
+ },
2290
+ {
2291
+ "epoch": 2.9894262884980902,
2292
+ "grad_norm": 1.6494286060333252,
2293
+ "learning_rate": 1.7622852503183185e-07,
2294
+ "loss": 0.0827,
2295
+ "step": 162000
2296
+ },
2297
+ {
2298
+ "epoch": 2.9986529128453063,
2299
+ "grad_norm": 1.5462067127227783,
2300
+ "learning_rate": 2.24514525782264e-08,
2301
+ "loss": 0.0861,
2302
+ "step": 162500
2303
+ },
2304
  {
2305
  "epoch": 3.0,
2306
+ "eval_loss": 0.20824576914310455,
2307
+ "eval_mse": 0.20824573578672098,
2308
+ "eval_runtime": 124.5011,
2309
+ "eval_samples_per_second": 1547.584,
2310
+ "eval_steps_per_second": 193.452,
2311
+ "step": 162573
2312
  },
2313
  {
2314
  "epoch": 3.0,
2315
+ "step": 162573,
2316
+ "total_flos": 8.56700972907817e+16,
2317
+ "train_loss": 0.16141534491764947,
2318
+ "train_runtime": 8977.4486,
2319
+ "train_samples_per_second": 579.48,
2320
+ "train_steps_per_second": 18.109
2321
  }
2322
  ],
2323
  "logging_steps": 500,
2324
+ "max_steps": 162573,
2325
  "num_input_tokens_seen": 0,
2326
  "num_train_epochs": 3,
2327
  "save_steps": 500,
 
2337
  "attributes": {}
2338
  }
2339
  },
2340
+ "total_flos": 8.56700972907817e+16,
2341
+ "train_batch_size": 32,
2342
  "trial_name": null,
2343
  "trial_params": null
2344
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ebe7b7b05490c20b53fbc39ffdeec0c1f0c552cf79f29de5652b1d14465d395
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ab0472f329dda31a741344576f9001dc9064737abfb75a6baa4c9a1bdeb39ed
3
  size 5368