CocoRoF commited on
Commit
a9f05ab
·
verified ·
1 Parent(s): 0c2eecf

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "x2bee/KoModernBERT-base-mlm-v03-retry-ckp03",
3
+ "architectures": [
4
+ "ModernBERTSimCSE"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 50281,
9
+ "classifier_activation": "gelu",
10
+ "classifier_bias": false,
11
+ "classifier_dropout": 0.0,
12
+ "classifier_pooling": "mean",
13
+ "cls_token_id": 50281,
14
+ "decoder_bias": true,
15
+ "deterministic_flash_attn": false,
16
+ "embedding_dropout": 0.0,
17
+ "eos_token_id": 50282,
18
+ "global_attn_every_n_layers": 3,
19
+ "global_rope_theta": 160000.0,
20
+ "gradient_checkpointing": false,
21
+ "hidden_activation": "gelu",
22
+ "hidden_size": 768,
23
+ "initializer_cutoff_factor": 2.0,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 1152,
26
+ "layer_norm_eps": 1e-05,
27
+ "local_attention": 128,
28
+ "local_rope_theta": 10000.0,
29
+ "max_position_embeddings": 8192,
30
+ "mlp_bias": false,
31
+ "mlp_dropout": 0.0,
32
+ "model_type": "modernbert",
33
+ "norm_bias": false,
34
+ "norm_eps": 1e-05,
35
+ "num_attention_heads": 12,
36
+ "num_hidden_layers": 22,
37
+ "pad_token_id": 50283,
38
+ "position_embedding_type": "absolute",
39
+ "reference_compile": false,
40
+ "repad_logits_with_grad": false,
41
+ "sep_token_id": 50282,
42
+ "sparse_pred_ignore_index": -100,
43
+ "sparse_prediction": false,
44
+ "torch_dtype": "float32",
45
+ "transformers_version": "4.48.0.dev0",
46
+ "vocab_size": 95663
47
+ }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a10f2f471ddb68aeb84eaedafdfdcf37d93db8b63aabad73c90654c0ff2b5c6c
3
- size 735217848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a902e03f78d73a65a225e2670272f5ab30fa9243753252da0d81bf6ae1ab88f8
3
+ size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:786bac647cdfb3e95caf79a33fc77addeb1581415ccb52e3e6a59e8aa9baf708
3
- size 1470521978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c2f7579a0d7be4b045d3119455fa8aacab3cc5e2ee7588dffdebcb0ee31366a
3
+ size 1475248442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e929bfb8d61dabc9ff9440d99be02b793be97dcf206c259cdc957e3702b21cb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:835f869ea325fd6edf27b48b589309fb66641cb92b45f2fc13d1bb6e8814106c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0470658ec86377919cf99d26abf4d40a040955151803301b44a6b653da17e99d
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228d14efa38075e5075e5f3ea1c158f27661d545dab61c548dfe15e36f9e3d44
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,1839 +1,362 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 100,
6
- "global_step": 2134,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.004686035613870665,
13
- "grad_norm": 3.68461537361145,
14
  "learning_rate": 4.997071227741331e-05,
15
- "loss": 2.1809,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.00937207122774133,
20
- "grad_norm": 3.1624999046325684,
21
  "learning_rate": 4.994142455482662e-05,
22
- "loss": 1.2093,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.014058106841611996,
27
- "grad_norm": 2.7569260597229004,
28
  "learning_rate": 4.991213683223993e-05,
29
- "loss": 1.0028,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01874414245548266,
34
- "grad_norm": 2.407806396484375,
35
  "learning_rate": 4.9882849109653237e-05,
36
- "loss": 0.8416,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.023430178069353328,
41
- "grad_norm": 2.227578639984131,
42
  "learning_rate": 4.9853561387066545e-05,
43
- "loss": 0.7806,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.028116213683223992,
48
- "grad_norm": 2.4283745288848877,
49
  "learning_rate": 4.9824273664479854e-05,
50
- "loss": 0.6355,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03280224929709466,
55
- "grad_norm": 2.0886402130126953,
56
  "learning_rate": 4.979498594189316e-05,
57
- "loss": 0.575,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.03748828491096532,
62
- "grad_norm": 2.8892505168914795,
63
  "learning_rate": 4.9765698219306464e-05,
64
- "loss": 0.6248,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.04217432052483599,
69
- "grad_norm": 1.957230806350708,
70
  "learning_rate": 4.973641049671978e-05,
71
- "loss": 0.5712,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.046860356138706656,
76
- "grad_norm": 1.9060039520263672,
77
  "learning_rate": 4.970712277413309e-05,
78
- "loss": 0.6339,
79
- "step": 100
80
- },
81
- {
82
- "epoch": 0.046860356138706656,
83
- "eval_loss": 0.09424024820327759,
84
- "eval_pearson_cosine": 0.7833384278521862,
85
- "eval_pearson_dot": 0.710012200190576,
86
- "eval_pearson_euclidean": 0.7816309148374074,
87
- "eval_pearson_manhattan": 0.7821511170378841,
88
- "eval_runtime": 5.0357,
89
- "eval_samples_per_second": 297.871,
90
- "eval_spearman_cosine": 0.7817326074827855,
91
- "eval_spearman_dot": 0.7028556042735588,
92
- "eval_spearman_euclidean": 0.7865135559191494,
93
- "eval_spearman_manhattan": 0.7875187958651506,
94
- "eval_steps_per_second": 18.667,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 0.05154639175257732,
99
- "grad_norm": 2.1410257816314697,
100
  "learning_rate": 4.9677835051546396e-05,
101
- "loss": 0.5549,
102
  "step": 110
103
  },
104
  {
105
  "epoch": 0.056232427366447985,
106
- "grad_norm": 1.8480626344680786,
107
  "learning_rate": 4.9648547328959705e-05,
108
- "loss": 0.5176,
109
  "step": 120
110
  },
111
  {
112
  "epoch": 0.06091846298031865,
113
- "grad_norm": 1.9234317541122437,
114
  "learning_rate": 4.961925960637301e-05,
115
- "loss": 0.5483,
116
  "step": 130
117
  },
118
  {
119
  "epoch": 0.06560449859418932,
120
- "grad_norm": 1.7899746894836426,
121
  "learning_rate": 4.9589971883786315e-05,
122
- "loss": 0.4993,
123
  "step": 140
124
  },
125
  {
126
  "epoch": 0.07029053420805999,
127
- "grad_norm": 1.9671216011047363,
128
  "learning_rate": 4.956068416119963e-05,
129
- "loss": 0.5575,
130
  "step": 150
131
  },
132
  {
133
  "epoch": 0.07497656982193064,
134
- "grad_norm": 1.8711061477661133,
135
  "learning_rate": 4.953139643861294e-05,
136
- "loss": 0.5471,
137
  "step": 160
138
  },
139
  {
140
  "epoch": 0.07966260543580131,
141
- "grad_norm": 2.2522668838500977,
142
  "learning_rate": 4.950210871602625e-05,
143
- "loss": 0.5239,
144
  "step": 170
145
  },
146
  {
147
  "epoch": 0.08434864104967198,
148
- "grad_norm": 1.8598068952560425,
149
  "learning_rate": 4.947282099343955e-05,
150
- "loss": 0.4534,
151
  "step": 180
152
  },
153
  {
154
  "epoch": 0.08903467666354264,
155
- "grad_norm": 2.039216995239258,
156
  "learning_rate": 4.944353327085286e-05,
157
- "loss": 0.5406,
158
  "step": 190
159
  },
160
  {
161
  "epoch": 0.09372071227741331,
162
- "grad_norm": 2.0879595279693604,
163
  "learning_rate": 4.9414245548266166e-05,
164
- "loss": 0.448,
165
- "step": 200
166
- },
167
- {
168
- "epoch": 0.09372071227741331,
169
- "eval_loss": 0.08082367479801178,
170
- "eval_pearson_cosine": 0.7856181201462533,
171
- "eval_pearson_dot": 0.7399407398880271,
172
- "eval_pearson_euclidean": 0.7873226689476169,
173
- "eval_pearson_manhattan": 0.7886999419523848,
174
- "eval_runtime": 4.9124,
175
- "eval_samples_per_second": 305.349,
176
- "eval_spearman_cosine": 0.7882418126384709,
177
- "eval_spearman_dot": 0.7354032626087063,
178
- "eval_spearman_euclidean": 0.7930010817448562,
179
- "eval_spearman_manhattan": 0.7945256304098883,
180
- "eval_steps_per_second": 19.135,
181
  "step": 200
182
  },
183
  {
184
  "epoch": 0.09840674789128398,
185
- "grad_norm": 2.2569146156311035,
186
  "learning_rate": 4.938495782567948e-05,
187
- "loss": 0.5107,
188
  "step": 210
189
  },
190
  {
191
  "epoch": 0.10309278350515463,
192
- "grad_norm": 1.7923479080200195,
193
  "learning_rate": 4.935567010309279e-05,
194
- "loss": 0.4541,
195
  "step": 220
196
  },
197
  {
198
  "epoch": 0.1077788191190253,
199
- "grad_norm": 2.0883235931396484,
200
  "learning_rate": 4.932638238050609e-05,
201
- "loss": 0.482,
202
  "step": 230
203
  },
204
  {
205
  "epoch": 0.11246485473289597,
206
- "grad_norm": 1.5482354164123535,
207
  "learning_rate": 4.92970946579194e-05,
208
- "loss": 0.4346,
209
  "step": 240
210
  },
211
  {
212
  "epoch": 0.11715089034676664,
213
- "grad_norm": 1.3946720361709595,
214
  "learning_rate": 4.926780693533271e-05,
215
- "loss": 0.447,
216
  "step": 250
217
  },
218
  {
219
  "epoch": 0.1218369259606373,
220
- "grad_norm": 1.547633409500122,
221
  "learning_rate": 4.923851921274602e-05,
222
- "loss": 0.4725,
223
  "step": 260
224
  },
225
  {
226
  "epoch": 0.12652296157450796,
227
- "grad_norm": 2.2962911128997803,
228
  "learning_rate": 4.920923149015933e-05,
229
- "loss": 0.4205,
230
  "step": 270
231
  },
232
  {
233
  "epoch": 0.13120899718837864,
234
- "grad_norm": 1.879306435585022,
235
  "learning_rate": 4.9179943767572635e-05,
236
- "loss": 0.4513,
237
  "step": 280
238
  },
239
  {
240
  "epoch": 0.1358950328022493,
241
- "grad_norm": 2.1697516441345215,
242
  "learning_rate": 4.9150656044985943e-05,
243
- "loss": 0.4619,
244
  "step": 290
245
  },
246
  {
247
  "epoch": 0.14058106841611998,
248
- "grad_norm": 1.8957735300064087,
249
  "learning_rate": 4.912136832239925e-05,
250
- "loss": 0.427,
251
- "step": 300
252
- },
253
- {
254
- "epoch": 0.14058106841611998,
255
- "eval_loss": 0.06987904757261276,
256
- "eval_pearson_cosine": 0.8036511544079303,
257
- "eval_pearson_dot": 0.7505297006445062,
258
- "eval_pearson_euclidean": 0.8010123967565619,
259
- "eval_pearson_manhattan": 0.8020924463783778,
260
- "eval_runtime": 6.2685,
261
- "eval_samples_per_second": 239.29,
262
- "eval_spearman_cosine": 0.802906284496355,
263
- "eval_spearman_dot": 0.74611539957357,
264
- "eval_spearman_euclidean": 0.8076860955607743,
265
- "eval_spearman_manhattan": 0.8089628010364828,
266
- "eval_steps_per_second": 14.996,
267
  "step": 300
268
  },
269
  {
270
  "epoch": 0.14526710402999063,
271
- "grad_norm": 1.6581153869628906,
272
  "learning_rate": 4.909208059981256e-05,
273
- "loss": 0.4209,
274
  "step": 310
275
  },
276
  {
277
  "epoch": 0.14995313964386128,
278
- "grad_norm": 2.2879538536071777,
279
  "learning_rate": 4.906279287722587e-05,
280
- "loss": 0.4214,
281
  "step": 320
282
  },
283
  {
284
  "epoch": 0.15463917525773196,
285
- "grad_norm": 1.905806541442871,
286
  "learning_rate": 4.903350515463918e-05,
287
- "loss": 0.4595,
288
  "step": 330
289
  },
290
  {
291
  "epoch": 0.15932521087160262,
292
- "grad_norm": 1.512032151222229,
293
  "learning_rate": 4.9004217432052486e-05,
294
- "loss": 0.4357,
295
  "step": 340
296
  },
297
  {
298
  "epoch": 0.1640112464854733,
299
- "grad_norm": 1.5765775442123413,
300
  "learning_rate": 4.8974929709465795e-05,
301
- "loss": 0.4414,
302
  "step": 350
303
  },
304
  {
305
  "epoch": 0.16869728209934395,
306
- "grad_norm": 2.138056993484497,
307
  "learning_rate": 4.89456419868791e-05,
308
- "loss": 0.4321,
309
  "step": 360
310
  },
311
  {
312
  "epoch": 0.1733833177132146,
313
- "grad_norm": 2.256223678588867,
314
  "learning_rate": 4.891635426429241e-05,
315
- "loss": 0.3994,
316
  "step": 370
317
  },
318
  {
319
  "epoch": 0.1780693533270853,
320
- "grad_norm": 1.7170028686523438,
321
  "learning_rate": 4.888706654170572e-05,
322
- "loss": 0.3941,
323
  "step": 380
324
  },
325
  {
326
  "epoch": 0.18275538894095594,
327
- "grad_norm": 1.492506504058838,
328
  "learning_rate": 4.885777881911903e-05,
329
- "loss": 0.4062,
330
  "step": 390
331
  },
332
  {
333
  "epoch": 0.18744142455482662,
334
- "grad_norm": 1.8897664546966553,
335
  "learning_rate": 4.882849109653234e-05,
336
- "loss": 0.4052,
337
- "step": 400
338
- },
339
- {
340
- "epoch": 0.18744142455482662,
341
- "eval_loss": 0.06156951189041138,
342
- "eval_pearson_cosine": 0.8108872920119552,
343
- "eval_pearson_dot": 0.7663492619892054,
344
- "eval_pearson_euclidean": 0.8079787711457271,
345
- "eval_pearson_manhattan": 0.8094246591845007,
346
- "eval_runtime": 6.0053,
347
- "eval_samples_per_second": 249.78,
348
- "eval_spearman_cosine": 0.8096398271545133,
349
- "eval_spearman_dot": 0.7619990554899813,
350
- "eval_spearman_euclidean": 0.8128327163681044,
351
- "eval_spearman_manhattan": 0.8143278410022097,
352
- "eval_steps_per_second": 15.653,
353
  "step": 400
354
  },
355
  {
356
  "epoch": 0.19212746016869728,
357
- "grad_norm": 2.6643662452697754,
358
  "learning_rate": 4.8799203373945646e-05,
359
- "loss": 0.4478,
360
  "step": 410
361
  },
362
  {
363
  "epoch": 0.19681349578256796,
364
- "grad_norm": 1.837944746017456,
365
  "learning_rate": 4.8769915651358954e-05,
366
- "loss": 0.3994,
367
  "step": 420
368
  },
369
  {
370
  "epoch": 0.2014995313964386,
371
- "grad_norm": 2.388124465942383,
372
  "learning_rate": 4.8740627928772256e-05,
373
- "loss": 0.4322,
374
  "step": 430
375
  },
376
  {
377
  "epoch": 0.20618556701030927,
378
- "grad_norm": 1.4240330457687378,
379
  "learning_rate": 4.871134020618557e-05,
380
- "loss": 0.3781,
381
  "step": 440
382
  },
383
  {
384
  "epoch": 0.21087160262417995,
385
- "grad_norm": 2.3366851806640625,
386
  "learning_rate": 4.868205248359888e-05,
387
- "loss": 0.3797,
388
  "step": 450
389
  },
390
  {
391
  "epoch": 0.2155576382380506,
392
- "grad_norm": 1.5920908451080322,
393
  "learning_rate": 4.865276476101219e-05,
394
- "loss": 0.4065,
395
  "step": 460
396
  },
397
  {
398
  "epoch": 0.22024367385192128,
399
- "grad_norm": 1.6287914514541626,
400
  "learning_rate": 4.86234770384255e-05,
401
- "loss": 0.4042,
402
  "step": 470
403
  },
404
  {
405
  "epoch": 0.22492970946579194,
406
- "grad_norm": 2.13320255279541,
407
  "learning_rate": 4.85941893158388e-05,
408
- "loss": 0.3608,
409
  "step": 480
410
  },
411
  {
412
  "epoch": 0.2296157450796626,
413
- "grad_norm": 1.6816534996032715,
414
  "learning_rate": 4.856490159325211e-05,
415
- "loss": 0.3721,
416
  "step": 490
417
  },
418
  {
419
  "epoch": 0.23430178069353327,
420
- "grad_norm": 1.275868535041809,
421
  "learning_rate": 4.853561387066542e-05,
422
- "loss": 0.4023,
423
- "step": 500
424
- },
425
- {
426
- "epoch": 0.23430178069353327,
427
- "eval_loss": 0.0611547976732254,
428
- "eval_pearson_cosine": 0.8109234308318634,
429
- "eval_pearson_dot": 0.7739396599918535,
430
- "eval_pearson_euclidean": 0.8112683151419446,
431
- "eval_pearson_manhattan": 0.8132717499846649,
432
- "eval_runtime": 5.9704,
433
- "eval_samples_per_second": 251.239,
434
- "eval_spearman_cosine": 0.8134716423487094,
435
- "eval_spearman_dot": 0.7705186875238829,
436
- "eval_spearman_euclidean": 0.8167514840119909,
437
- "eval_spearman_manhattan": 0.8187222998801444,
438
- "eval_steps_per_second": 15.744,
439
  "step": 500
440
- },
441
- {
442
- "epoch": 0.23898781630740393,
443
- "grad_norm": 1.9643880128860474,
444
- "learning_rate": 4.850632614807873e-05,
445
- "loss": 0.4231,
446
- "step": 510
447
- },
448
- {
449
- "epoch": 0.2436738519212746,
450
- "grad_norm": 1.7972699403762817,
451
- "learning_rate": 4.847703842549204e-05,
452
- "loss": 0.3893,
453
- "step": 520
454
- },
455
- {
456
- "epoch": 0.24835988753514526,
457
- "grad_norm": 1.6312799453735352,
458
- "learning_rate": 4.844775070290534e-05,
459
- "loss": 0.3869,
460
- "step": 530
461
- },
462
- {
463
- "epoch": 0.2530459231490159,
464
- "grad_norm": 1.8009634017944336,
465
- "learning_rate": 4.841846298031865e-05,
466
- "loss": 0.4,
467
- "step": 540
468
- },
469
- {
470
- "epoch": 0.25773195876288657,
471
- "grad_norm": 1.1453664302825928,
472
- "learning_rate": 4.838917525773196e-05,
473
- "loss": 0.3568,
474
- "step": 550
475
- },
476
- {
477
- "epoch": 0.2624179943767573,
478
- "grad_norm": 1.683673620223999,
479
- "learning_rate": 4.8359887535145274e-05,
480
- "loss": 0.3674,
481
- "step": 560
482
- },
483
- {
484
- "epoch": 0.26710402999062793,
485
- "grad_norm": 1.3695913553237915,
486
- "learning_rate": 4.833059981255858e-05,
487
- "loss": 0.3817,
488
- "step": 570
489
- },
490
- {
491
- "epoch": 0.2717900656044986,
492
- "grad_norm": 1.8631620407104492,
493
- "learning_rate": 4.8301312089971884e-05,
494
- "loss": 0.3546,
495
- "step": 580
496
- },
497
- {
498
- "epoch": 0.27647610121836924,
499
- "grad_norm": 1.5883185863494873,
500
- "learning_rate": 4.827202436738519e-05,
501
- "loss": 0.3973,
502
- "step": 590
503
- },
504
- {
505
- "epoch": 0.28116213683223995,
506
- "grad_norm": 1.7056660652160645,
507
- "learning_rate": 4.82427366447985e-05,
508
- "loss": 0.3754,
509
- "step": 600
510
- },
511
- {
512
- "epoch": 0.28116213683223995,
513
- "eval_loss": 0.059529613703489304,
514
- "eval_pearson_cosine": 0.8104839357699305,
515
- "eval_pearson_dot": 0.7706455959096417,
516
- "eval_pearson_euclidean": 0.8087417307856555,
517
- "eval_pearson_manhattan": 0.8101299227665919,
518
- "eval_runtime": 6.0212,
519
- "eval_samples_per_second": 249.121,
520
- "eval_spearman_cosine": 0.8125251228747598,
521
- "eval_spearman_dot": 0.7680727600884657,
522
- "eval_spearman_euclidean": 0.8146128693278114,
523
- "eval_spearman_manhattan": 0.8161621120875591,
524
- "eval_steps_per_second": 15.612,
525
- "step": 600
526
- },
527
- {
528
- "epoch": 0.2858481724461106,
529
- "grad_norm": 1.8564058542251587,
530
- "learning_rate": 4.821344892221181e-05,
531
- "loss": 0.4016,
532
- "step": 610
533
- },
534
- {
535
- "epoch": 0.29053420805998126,
536
- "grad_norm": 1.467993974685669,
537
- "learning_rate": 4.818416119962512e-05,
538
- "loss": 0.3858,
539
- "step": 620
540
- },
541
- {
542
- "epoch": 0.2952202436738519,
543
- "grad_norm": 2.3624465465545654,
544
- "learning_rate": 4.815487347703843e-05,
545
- "loss": 0.3796,
546
- "step": 630
547
- },
548
- {
549
- "epoch": 0.29990627928772257,
550
- "grad_norm": 1.588629126548767,
551
- "learning_rate": 4.8125585754451736e-05,
552
- "loss": 0.3667,
553
- "step": 640
554
- },
555
- {
556
- "epoch": 0.3045923149015933,
557
- "grad_norm": 1.380112886428833,
558
- "learning_rate": 4.8096298031865044e-05,
559
- "loss": 0.3453,
560
- "step": 650
561
- },
562
- {
563
- "epoch": 0.30927835051546393,
564
- "grad_norm": 1.4270693063735962,
565
- "learning_rate": 4.806701030927835e-05,
566
- "loss": 0.3345,
567
- "step": 660
568
- },
569
- {
570
- "epoch": 0.3139643861293346,
571
- "grad_norm": 2.204744338989258,
572
- "learning_rate": 4.803772258669166e-05,
573
- "loss": 0.395,
574
- "step": 670
575
- },
576
- {
577
- "epoch": 0.31865042174320524,
578
- "grad_norm": 1.4480923414230347,
579
- "learning_rate": 4.800843486410497e-05,
580
- "loss": 0.3691,
581
- "step": 680
582
- },
583
- {
584
- "epoch": 0.3233364573570759,
585
- "grad_norm": 1.8864325284957886,
586
- "learning_rate": 4.797914714151828e-05,
587
- "loss": 0.3986,
588
- "step": 690
589
- },
590
- {
591
- "epoch": 0.3280224929709466,
592
- "grad_norm": 1.3784370422363281,
593
- "learning_rate": 4.794985941893159e-05,
594
- "loss": 0.3729,
595
- "step": 700
596
- },
597
- {
598
- "epoch": 0.3280224929709466,
599
- "eval_loss": 0.061924997717142105,
600
- "eval_pearson_cosine": 0.8155259960165324,
601
- "eval_pearson_dot": 0.7761366153485074,
602
- "eval_pearson_euclidean": 0.8127568794877789,
603
- "eval_pearson_manhattan": 0.8144288026347226,
604
- "eval_runtime": 6.0626,
605
- "eval_samples_per_second": 247.42,
606
- "eval_spearman_cosine": 0.8175981152530937,
607
- "eval_spearman_dot": 0.7736443532595881,
608
- "eval_spearman_euclidean": 0.8195662973032031,
609
- "eval_spearman_manhattan": 0.8212465310439688,
610
- "eval_steps_per_second": 15.505,
611
- "step": 700
612
- },
613
- {
614
- "epoch": 0.33270852858481725,
615
- "grad_norm": 1.7109017372131348,
616
- "learning_rate": 4.7920571696344895e-05,
617
- "loss": 0.3345,
618
- "step": 710
619
- },
620
- {
621
- "epoch": 0.3373945641986879,
622
- "grad_norm": 1.8547511100769043,
623
- "learning_rate": 4.7891283973758204e-05,
624
- "loss": 0.3735,
625
- "step": 720
626
- },
627
- {
628
- "epoch": 0.34208059981255856,
629
- "grad_norm": 1.5369923114776611,
630
- "learning_rate": 4.786199625117151e-05,
631
- "loss": 0.3304,
632
- "step": 730
633
- },
634
- {
635
- "epoch": 0.3467666354264292,
636
- "grad_norm": 1.308568000793457,
637
- "learning_rate": 4.783270852858482e-05,
638
- "loss": 0.3717,
639
- "step": 740
640
- },
641
- {
642
- "epoch": 0.3514526710402999,
643
- "grad_norm": 1.3743574619293213,
644
- "learning_rate": 4.780342080599813e-05,
645
- "loss": 0.3381,
646
- "step": 750
647
- },
648
- {
649
- "epoch": 0.3561387066541706,
650
- "grad_norm": 1.874657154083252,
651
- "learning_rate": 4.777413308341144e-05,
652
- "loss": 0.3193,
653
- "step": 760
654
- },
655
- {
656
- "epoch": 0.36082474226804123,
657
- "grad_norm": 1.4700101613998413,
658
- "learning_rate": 4.774484536082475e-05,
659
- "loss": 0.3799,
660
- "step": 770
661
- },
662
- {
663
- "epoch": 0.3655107778819119,
664
- "grad_norm": 1.5662988424301147,
665
- "learning_rate": 4.771555763823805e-05,
666
- "loss": 0.3453,
667
- "step": 780
668
- },
669
- {
670
- "epoch": 0.3701968134957826,
671
- "grad_norm": 1.4666754007339478,
672
- "learning_rate": 4.768626991565136e-05,
673
- "loss": 0.3175,
674
- "step": 790
675
- },
676
- {
677
- "epoch": 0.37488284910965325,
678
- "grad_norm": 1.3993242979049683,
679
- "learning_rate": 4.765698219306467e-05,
680
- "loss": 0.341,
681
- "step": 800
682
- },
683
- {
684
- "epoch": 0.37488284910965325,
685
- "eval_loss": 0.05296875163912773,
686
- "eval_pearson_cosine": 0.8137295797811834,
687
- "eval_pearson_dot": 0.7695932846417932,
688
- "eval_pearson_euclidean": 0.8189567419998482,
689
- "eval_pearson_manhattan": 0.8200667930673546,
690
- "eval_runtime": 6.3175,
691
- "eval_samples_per_second": 237.436,
692
- "eval_spearman_cosine": 0.815541427803139,
693
- "eval_spearman_dot": 0.7663341686268886,
694
- "eval_spearman_euclidean": 0.8233566840888671,
695
- "eval_spearman_manhattan": 0.8246092914965037,
696
- "eval_steps_per_second": 14.879,
697
- "step": 800
698
- },
699
- {
700
- "epoch": 0.3795688847235239,
701
- "grad_norm": 1.4209802150726318,
702
- "learning_rate": 4.762769447047798e-05,
703
- "loss": 0.3831,
704
- "step": 810
705
- },
706
- {
707
- "epoch": 0.38425492033739456,
708
- "grad_norm": 1.4097892045974731,
709
- "learning_rate": 4.759840674789129e-05,
710
- "loss": 0.3318,
711
- "step": 820
712
- },
713
- {
714
- "epoch": 0.3889409559512652,
715
- "grad_norm": 1.706900715827942,
716
- "learning_rate": 4.756911902530459e-05,
717
- "loss": 0.3328,
718
- "step": 830
719
- },
720
- {
721
- "epoch": 0.3936269915651359,
722
- "grad_norm": 1.610275149345398,
723
- "learning_rate": 4.75398313027179e-05,
724
- "loss": 0.3575,
725
- "step": 840
726
- },
727
- {
728
- "epoch": 0.3983130271790066,
729
- "grad_norm": 1.4575105905532837,
730
- "learning_rate": 4.751054358013121e-05,
731
- "loss": 0.307,
732
- "step": 850
733
- },
734
- {
735
- "epoch": 0.4029990627928772,
736
- "grad_norm": 1.638424277305603,
737
- "learning_rate": 4.7481255857544524e-05,
738
- "loss": 0.3504,
739
- "step": 860
740
- },
741
- {
742
- "epoch": 0.4076850984067479,
743
- "grad_norm": 1.8157601356506348,
744
- "learning_rate": 4.745196813495783e-05,
745
- "loss": 0.3931,
746
- "step": 870
747
- },
748
- {
749
- "epoch": 0.41237113402061853,
750
- "grad_norm": 1.6680104732513428,
751
- "learning_rate": 4.7422680412371134e-05,
752
- "loss": 0.362,
753
- "step": 880
754
- },
755
- {
756
- "epoch": 0.41705716963448924,
757
- "grad_norm": 1.4331028461456299,
758
- "learning_rate": 4.739339268978444e-05,
759
- "loss": 0.3451,
760
- "step": 890
761
- },
762
- {
763
- "epoch": 0.4217432052483599,
764
- "grad_norm": 1.3940101861953735,
765
- "learning_rate": 4.736410496719775e-05,
766
- "loss": 0.3161,
767
- "step": 900
768
- },
769
- {
770
- "epoch": 0.4217432052483599,
771
- "eval_loss": 0.05680527910590172,
772
- "eval_pearson_cosine": 0.816164907471336,
773
- "eval_pearson_dot": 0.7659985241939467,
774
- "eval_pearson_euclidean": 0.8198292531320703,
775
- "eval_pearson_manhattan": 0.8209187797411488,
776
- "eval_runtime": 6.6335,
777
- "eval_samples_per_second": 226.126,
778
- "eval_spearman_cosine": 0.8181742542924034,
779
- "eval_spearman_dot": 0.7624851760530289,
780
- "eval_spearman_euclidean": 0.8251528076462932,
781
- "eval_spearman_manhattan": 0.8261936560831687,
782
- "eval_steps_per_second": 14.171,
783
- "step": 900
784
- },
785
- {
786
- "epoch": 0.42642924086223055,
787
- "grad_norm": 1.5849499702453613,
788
- "learning_rate": 4.733481724461106e-05,
789
- "loss": 0.2852,
790
- "step": 910
791
- },
792
- {
793
- "epoch": 0.4311152764761012,
794
- "grad_norm": 1.8611364364624023,
795
- "learning_rate": 4.7305529522024375e-05,
796
- "loss": 0.3517,
797
- "step": 920
798
- },
799
- {
800
- "epoch": 0.43580131208997186,
801
- "grad_norm": 1.759479284286499,
802
- "learning_rate": 4.727624179943768e-05,
803
- "loss": 0.3309,
804
- "step": 930
805
- },
806
- {
807
- "epoch": 0.44048734770384257,
808
- "grad_norm": 1.3715683221817017,
809
- "learning_rate": 4.7246954076850985e-05,
810
- "loss": 0.2964,
811
- "step": 940
812
- },
813
- {
814
- "epoch": 0.4451733833177132,
815
- "grad_norm": 1.6326545476913452,
816
- "learning_rate": 4.7217666354264294e-05,
817
- "loss": 0.3501,
818
- "step": 950
819
- },
820
- {
821
- "epoch": 0.4498594189315839,
822
- "grad_norm": 1.238206148147583,
823
- "learning_rate": 4.71883786316776e-05,
824
- "loss": 0.3366,
825
- "step": 960
826
- },
827
- {
828
- "epoch": 0.45454545454545453,
829
- "grad_norm": 1.6656396389007568,
830
- "learning_rate": 4.715909090909091e-05,
831
- "loss": 0.3594,
832
- "step": 970
833
- },
834
- {
835
- "epoch": 0.4592314901593252,
836
- "grad_norm": 1.5264825820922852,
837
- "learning_rate": 4.712980318650422e-05,
838
- "loss": 0.3309,
839
- "step": 980
840
- },
841
- {
842
- "epoch": 0.4639175257731959,
843
- "grad_norm": 1.4031989574432373,
844
- "learning_rate": 4.710051546391753e-05,
845
- "loss": 0.3616,
846
- "step": 990
847
- },
848
- {
849
- "epoch": 0.46860356138706655,
850
- "grad_norm": 1.439453125,
851
- "learning_rate": 4.7071227741330836e-05,
852
- "loss": 0.3122,
853
- "step": 1000
854
- },
855
- {
856
- "epoch": 0.46860356138706655,
857
- "eval_loss": 0.05414344370365143,
858
- "eval_pearson_cosine": 0.8215390057088641,
859
- "eval_pearson_dot": 0.7789934072191471,
860
- "eval_pearson_euclidean": 0.8206818537339018,
861
- "eval_pearson_manhattan": 0.8219733991381624,
862
- "eval_runtime": 6.2607,
863
- "eval_samples_per_second": 239.588,
864
- "eval_spearman_cosine": 0.8235945278831797,
865
- "eval_spearman_dot": 0.7745226194646113,
866
- "eval_spearman_euclidean": 0.8268444005248111,
867
- "eval_spearman_manhattan": 0.8284194308491212,
868
- "eval_steps_per_second": 15.014,
869
- "step": 1000
870
- },
871
- {
872
- "epoch": 0.4732895970009372,
873
- "grad_norm": 2.064291477203369,
874
- "learning_rate": 4.7041940018744145e-05,
875
- "loss": 0.3188,
876
- "step": 1010
877
- },
878
- {
879
- "epoch": 0.47797563261480785,
880
- "grad_norm": 1.338891625404358,
881
- "learning_rate": 4.7012652296157454e-05,
882
- "loss": 0.3381,
883
- "step": 1020
884
- },
885
- {
886
- "epoch": 0.48266166822867856,
887
- "grad_norm": 1.4479578733444214,
888
- "learning_rate": 4.698336457357076e-05,
889
- "loss": 0.3187,
890
- "step": 1030
891
- },
892
- {
893
- "epoch": 0.4873477038425492,
894
- "grad_norm": 2.0868189334869385,
895
- "learning_rate": 4.695407685098407e-05,
896
- "loss": 0.3697,
897
- "step": 1040
898
- },
899
- {
900
- "epoch": 0.49203373945641987,
901
- "grad_norm": 1.9820175170898438,
902
- "learning_rate": 4.692478912839738e-05,
903
- "loss": 0.3079,
904
- "step": 1050
905
- },
906
- {
907
- "epoch": 0.4967197750702905,
908
- "grad_norm": 1.2479910850524902,
909
- "learning_rate": 4.689550140581069e-05,
910
- "loss": 0.3129,
911
- "step": 1060
912
- },
913
- {
914
- "epoch": 0.5014058106841612,
915
- "grad_norm": 1.5005191564559937,
916
- "learning_rate": 4.6866213683223996e-05,
917
- "loss": 0.3588,
918
- "step": 1070
919
- },
920
- {
921
- "epoch": 0.5060918462980318,
922
- "grad_norm": 1.730153203010559,
923
- "learning_rate": 4.68369259606373e-05,
924
- "loss": 0.3511,
925
- "step": 1080
926
- },
927
- {
928
- "epoch": 0.5107778819119025,
929
- "grad_norm": 1.8256272077560425,
930
- "learning_rate": 4.680763823805061e-05,
931
- "loss": 0.3483,
932
- "step": 1090
933
- },
934
- {
935
- "epoch": 0.5154639175257731,
936
- "grad_norm": 1.8275713920593262,
937
- "learning_rate": 4.677835051546392e-05,
938
- "loss": 0.3301,
939
- "step": 1100
940
- },
941
- {
942
- "epoch": 0.5154639175257731,
943
- "eval_loss": 0.06174962595105171,
944
- "eval_pearson_cosine": 0.8116036079365685,
945
- "eval_pearson_dot": 0.7637833872485942,
946
- "eval_pearson_euclidean": 0.8160585823410784,
947
- "eval_pearson_manhattan": 0.8176746469698344,
948
- "eval_runtime": 6.1383,
949
- "eval_samples_per_second": 244.369,
950
- "eval_spearman_cosine": 0.8149952741898824,
951
- "eval_spearman_dot": 0.759784369983796,
952
- "eval_spearman_euclidean": 0.8211588786730816,
953
- "eval_spearman_manhattan": 0.8228445193252625,
954
- "eval_steps_per_second": 15.314,
955
- "step": 1100
956
- },
957
- {
958
- "epoch": 0.5201499531396439,
959
- "grad_norm": 1.7808656692504883,
960
- "learning_rate": 4.674906279287723e-05,
961
- "loss": 0.3378,
962
- "step": 1110
963
- },
964
- {
965
- "epoch": 0.5248359887535146,
966
- "grad_norm": 1.3912303447723389,
967
- "learning_rate": 4.671977507029054e-05,
968
- "loss": 0.3247,
969
- "step": 1120
970
- },
971
- {
972
- "epoch": 0.5295220243673852,
973
- "grad_norm": 1.619547724723816,
974
- "learning_rate": 4.669048734770384e-05,
975
- "loss": 0.3548,
976
- "step": 1130
977
- },
978
- {
979
- "epoch": 0.5342080599812559,
980
- "grad_norm": 1.6785143613815308,
981
- "learning_rate": 4.666119962511715e-05,
982
- "loss": 0.4056,
983
- "step": 1140
984
- },
985
- {
986
- "epoch": 0.5388940955951266,
987
- "grad_norm": 1.4282417297363281,
988
- "learning_rate": 4.6631911902530465e-05,
989
- "loss": 0.3136,
990
- "step": 1150
991
- },
992
- {
993
- "epoch": 0.5435801312089972,
994
- "grad_norm": 1.5950373411178589,
995
- "learning_rate": 4.660262417994377e-05,
996
- "loss": 0.3094,
997
- "step": 1160
998
- },
999
- {
1000
- "epoch": 0.5482661668228679,
1001
- "grad_norm": 1.9235565662384033,
1002
- "learning_rate": 4.657333645735708e-05,
1003
- "loss": 0.3409,
1004
- "step": 1170
1005
- },
1006
- {
1007
- "epoch": 0.5529522024367385,
1008
- "grad_norm": 1.2192574739456177,
1009
- "learning_rate": 4.6544048734770383e-05,
1010
- "loss": 0.3387,
1011
- "step": 1180
1012
- },
1013
- {
1014
- "epoch": 0.5576382380506092,
1015
- "grad_norm": 1.5550990104675293,
1016
- "learning_rate": 4.651476101218369e-05,
1017
- "loss": 0.3184,
1018
- "step": 1190
1019
- },
1020
- {
1021
- "epoch": 0.5623242736644799,
1022
- "grad_norm": 1.8576079607009888,
1023
- "learning_rate": 4.6485473289597e-05,
1024
- "loss": 0.3637,
1025
- "step": 1200
1026
- },
1027
- {
1028
- "epoch": 0.5623242736644799,
1029
- "eval_loss": 0.05324321612715721,
1030
- "eval_pearson_cosine": 0.8107929850304814,
1031
- "eval_pearson_dot": 0.768063847349957,
1032
- "eval_pearson_euclidean": 0.8155502077488883,
1033
- "eval_pearson_manhattan": 0.8174981555238503,
1034
- "eval_runtime": 5.9644,
1035
- "eval_samples_per_second": 251.492,
1036
- "eval_spearman_cosine": 0.8145222586962418,
1037
- "eval_spearman_dot": 0.7642997219390888,
1038
- "eval_spearman_euclidean": 0.8201735536723759,
1039
- "eval_spearman_manhattan": 0.8222186632592043,
1040
- "eval_steps_per_second": 15.76,
1041
- "step": 1200
1042
- },
1043
- {
1044
- "epoch": 0.5670103092783505,
1045
- "grad_norm": 1.41835618019104,
1046
- "learning_rate": 4.6456185567010316e-05,
1047
- "loss": 0.3039,
1048
- "step": 1210
1049
- },
1050
- {
1051
- "epoch": 0.5716963448922212,
1052
- "grad_norm": 1.3850994110107422,
1053
- "learning_rate": 4.6426897844423624e-05,
1054
- "loss": 0.3187,
1055
- "step": 1220
1056
- },
1057
- {
1058
- "epoch": 0.5763823805060918,
1059
- "grad_norm": 1.437373399734497,
1060
- "learning_rate": 4.6397610121836926e-05,
1061
- "loss": 0.3337,
1062
- "step": 1230
1063
- },
1064
- {
1065
- "epoch": 0.5810684161199625,
1066
- "grad_norm": 1.2328146696090698,
1067
- "learning_rate": 4.6368322399250235e-05,
1068
- "loss": 0.2975,
1069
- "step": 1240
1070
- },
1071
- {
1072
- "epoch": 0.5857544517338332,
1073
- "grad_norm": 1.6191329956054688,
1074
- "learning_rate": 4.633903467666354e-05,
1075
- "loss": 0.3275,
1076
- "step": 1250
1077
- },
1078
- {
1079
- "epoch": 0.5904404873477038,
1080
- "grad_norm": 1.695470929145813,
1081
- "learning_rate": 4.630974695407685e-05,
1082
- "loss": 0.3485,
1083
- "step": 1260
1084
- },
1085
- {
1086
- "epoch": 0.5951265229615745,
1087
- "grad_norm": 1.6120591163635254,
1088
- "learning_rate": 4.628045923149017e-05,
1089
- "loss": 0.3515,
1090
- "step": 1270
1091
- },
1092
- {
1093
- "epoch": 0.5998125585754451,
1094
- "grad_norm": 1.6157792806625366,
1095
- "learning_rate": 4.625117150890347e-05,
1096
- "loss": 0.3043,
1097
- "step": 1280
1098
- },
1099
- {
1100
- "epoch": 0.6044985941893158,
1101
- "grad_norm": 1.1465294361114502,
1102
- "learning_rate": 4.622188378631678e-05,
1103
- "loss": 0.2884,
1104
- "step": 1290
1105
- },
1106
- {
1107
- "epoch": 0.6091846298031866,
1108
- "grad_norm": 1.583688497543335,
1109
- "learning_rate": 4.6192596063730086e-05,
1110
- "loss": 0.2885,
1111
- "step": 1300
1112
- },
1113
- {
1114
- "epoch": 0.6091846298031866,
1115
- "eval_loss": 0.04511857405304909,
1116
- "eval_pearson_cosine": 0.8272008065998051,
1117
- "eval_pearson_dot": 0.7924789666171037,
1118
- "eval_pearson_euclidean": 0.8268065548805623,
1119
- "eval_pearson_manhattan": 0.827500349653536,
1120
- "eval_runtime": 6.0244,
1121
- "eval_samples_per_second": 248.988,
1122
- "eval_spearman_cosine": 0.8278338594350843,
1123
- "eval_spearman_dot": 0.7887595412839734,
1124
- "eval_spearman_euclidean": 0.8317669408319824,
1125
- "eval_spearman_manhattan": 0.8323949761116776,
1126
- "eval_steps_per_second": 15.603,
1127
- "step": 1300
1128
- },
1129
- {
1130
- "epoch": 0.6138706654170571,
1131
- "grad_norm": 1.081640362739563,
1132
- "learning_rate": 4.6163308341143395e-05,
1133
- "loss": 0.3526,
1134
- "step": 1310
1135
- },
1136
- {
1137
- "epoch": 0.6185567010309279,
1138
- "grad_norm": 1.760512351989746,
1139
- "learning_rate": 4.61340206185567e-05,
1140
- "loss": 0.3113,
1141
- "step": 1320
1142
- },
1143
- {
1144
- "epoch": 0.6232427366447985,
1145
- "grad_norm": 1.106444239616394,
1146
- "learning_rate": 4.610473289597001e-05,
1147
- "loss": 0.3126,
1148
- "step": 1330
1149
- },
1150
- {
1151
- "epoch": 0.6279287722586692,
1152
- "grad_norm": 1.3500837087631226,
1153
- "learning_rate": 4.607544517338332e-05,
1154
- "loss": 0.3094,
1155
- "step": 1340
1156
- },
1157
- {
1158
- "epoch": 0.6326148078725399,
1159
- "grad_norm": 1.727953553199768,
1160
- "learning_rate": 4.604615745079663e-05,
1161
- "loss": 0.3304,
1162
- "step": 1350
1163
- },
1164
- {
1165
- "epoch": 0.6373008434864105,
1166
- "grad_norm": 1.4341022968292236,
1167
- "learning_rate": 4.601686972820994e-05,
1168
- "loss": 0.2804,
1169
- "step": 1360
1170
- },
1171
- {
1172
- "epoch": 0.6419868791002812,
1173
- "grad_norm": 1.4479708671569824,
1174
- "learning_rate": 4.5987582005623246e-05,
1175
- "loss": 0.31,
1176
- "step": 1370
1177
- },
1178
- {
1179
- "epoch": 0.6466729147141518,
1180
- "grad_norm": 1.5667890310287476,
1181
- "learning_rate": 4.5958294283036554e-05,
1182
- "loss": 0.3149,
1183
- "step": 1380
1184
- },
1185
- {
1186
- "epoch": 0.6513589503280225,
1187
- "grad_norm": 1.7333146333694458,
1188
- "learning_rate": 4.592900656044986e-05,
1189
- "loss": 0.3247,
1190
- "step": 1390
1191
- },
1192
- {
1193
- "epoch": 0.6560449859418932,
1194
- "grad_norm": 1.914392113685608,
1195
- "learning_rate": 4.589971883786317e-05,
1196
- "loss": 0.2852,
1197
- "step": 1400
1198
- },
1199
- {
1200
- "epoch": 0.6560449859418932,
1201
- "eval_loss": 0.04731455817818642,
1202
- "eval_pearson_cosine": 0.8245641713392331,
1203
- "eval_pearson_dot": 0.7893189374890994,
1204
- "eval_pearson_euclidean": 0.8220644314223797,
1205
- "eval_pearson_manhattan": 0.8227839674683928,
1206
- "eval_runtime": 6.0521,
1207
- "eval_samples_per_second": 247.846,
1208
- "eval_spearman_cosine": 0.8264178003782281,
1209
- "eval_spearman_dot": 0.7874134051082518,
1210
- "eval_spearman_euclidean": 0.8274821508565314,
1211
- "eval_spearman_manhattan": 0.8280999297389011,
1212
- "eval_steps_per_second": 15.532,
1213
- "step": 1400
1214
- },
1215
- {
1216
- "epoch": 0.6607310215557638,
1217
- "grad_norm": 1.2458995580673218,
1218
- "learning_rate": 4.587043111527648e-05,
1219
- "loss": 0.3068,
1220
- "step": 1410
1221
- },
1222
- {
1223
- "epoch": 0.6654170571696345,
1224
- "grad_norm": 1.6540151834487915,
1225
- "learning_rate": 4.584114339268979e-05,
1226
- "loss": 0.3034,
1227
- "step": 1420
1228
- },
1229
- {
1230
- "epoch": 0.6701030927835051,
1231
- "grad_norm": 1.2585715055465698,
1232
- "learning_rate": 4.581185567010309e-05,
1233
- "loss": 0.3297,
1234
- "step": 1430
1235
- },
1236
- {
1237
- "epoch": 0.6747891283973758,
1238
- "grad_norm": 1.5088609457015991,
1239
- "learning_rate": 4.5782567947516406e-05,
1240
- "loss": 0.3444,
1241
- "step": 1440
1242
- },
1243
- {
1244
- "epoch": 0.6794751640112465,
1245
- "grad_norm": 1.3120390176773071,
1246
- "learning_rate": 4.5753280224929714e-05,
1247
- "loss": 0.2882,
1248
- "step": 1450
1249
- },
1250
- {
1251
- "epoch": 0.6841611996251171,
1252
- "grad_norm": 1.1074262857437134,
1253
- "learning_rate": 4.572399250234302e-05,
1254
- "loss": 0.287,
1255
- "step": 1460
1256
- },
1257
- {
1258
- "epoch": 0.6888472352389878,
1259
- "grad_norm": 1.5284086465835571,
1260
- "learning_rate": 4.569470477975633e-05,
1261
- "loss": 0.3175,
1262
- "step": 1470
1263
- },
1264
- {
1265
- "epoch": 0.6935332708528584,
1266
- "grad_norm": 1.5610471963882446,
1267
- "learning_rate": 4.566541705716963e-05,
1268
- "loss": 0.3033,
1269
- "step": 1480
1270
- },
1271
- {
1272
- "epoch": 0.6982193064667291,
1273
- "grad_norm": 1.1839112043380737,
1274
- "learning_rate": 4.563612933458294e-05,
1275
- "loss": 0.2917,
1276
- "step": 1490
1277
- },
1278
- {
1279
- "epoch": 0.7029053420805998,
1280
- "grad_norm": 1.2611138820648193,
1281
- "learning_rate": 4.560684161199626e-05,
1282
- "loss": 0.3225,
1283
- "step": 1500
1284
- },
1285
- {
1286
- "epoch": 0.7029053420805998,
1287
- "eval_loss": 0.05073302239179611,
1288
- "eval_pearson_cosine": 0.8258767050389224,
1289
- "eval_pearson_dot": 0.7737186817222579,
1290
- "eval_pearson_euclidean": 0.826299226589029,
1291
- "eval_pearson_manhattan": 0.8274116157485736,
1292
- "eval_runtime": 6.1756,
1293
- "eval_samples_per_second": 242.892,
1294
- "eval_spearman_cosine": 0.828410027637777,
1295
- "eval_spearman_dot": 0.7707599871747091,
1296
- "eval_spearman_euclidean": 0.8324611004376368,
1297
- "eval_spearman_manhattan": 0.8335300441487923,
1298
- "eval_steps_per_second": 15.221,
1299
- "step": 1500
1300
- },
1301
- {
1302
- "epoch": 0.7075913776944704,
1303
- "grad_norm": 1.3322237730026245,
1304
- "learning_rate": 4.5577553889409565e-05,
1305
- "loss": 0.27,
1306
- "step": 1510
1307
- },
1308
- {
1309
- "epoch": 0.7122774133083412,
1310
- "grad_norm": 1.707694172859192,
1311
- "learning_rate": 4.5548266166822874e-05,
1312
- "loss": 0.3249,
1313
- "step": 1520
1314
- },
1315
- {
1316
- "epoch": 0.7169634489222118,
1317
- "grad_norm": 1.280220627784729,
1318
- "learning_rate": 4.5518978444236176e-05,
1319
- "loss": 0.3021,
1320
- "step": 1530
1321
- },
1322
- {
1323
- "epoch": 0.7216494845360825,
1324
- "grad_norm": 1.2478505373001099,
1325
- "learning_rate": 4.5489690721649484e-05,
1326
- "loss": 0.3033,
1327
- "step": 1540
1328
- },
1329
- {
1330
- "epoch": 0.7263355201499532,
1331
- "grad_norm": 1.5291355848312378,
1332
- "learning_rate": 4.546040299906279e-05,
1333
- "loss": 0.2789,
1334
- "step": 1550
1335
- },
1336
- {
1337
- "epoch": 0.7310215557638238,
1338
- "grad_norm": 1.6631042957305908,
1339
- "learning_rate": 4.543111527647611e-05,
1340
- "loss": 0.3271,
1341
- "step": 1560
1342
- },
1343
- {
1344
- "epoch": 0.7357075913776945,
1345
- "grad_norm": 1.5178686380386353,
1346
- "learning_rate": 4.540182755388942e-05,
1347
- "loss": 0.3092,
1348
- "step": 1570
1349
- },
1350
- {
1351
- "epoch": 0.7403936269915652,
1352
- "grad_norm": 1.043636441230774,
1353
- "learning_rate": 4.537253983130272e-05,
1354
- "loss": 0.2863,
1355
- "step": 1580
1356
- },
1357
- {
1358
- "epoch": 0.7450796626054358,
1359
- "grad_norm": 1.2474050521850586,
1360
- "learning_rate": 4.534325210871603e-05,
1361
- "loss": 0.3184,
1362
- "step": 1590
1363
- },
1364
- {
1365
- "epoch": 0.7497656982193065,
1366
- "grad_norm": 1.5337306261062622,
1367
- "learning_rate": 4.5313964386129336e-05,
1368
- "loss": 0.3201,
1369
- "step": 1600
1370
- },
1371
- {
1372
- "epoch": 0.7497656982193065,
1373
- "eval_loss": 0.04666765406727791,
1374
- "eval_pearson_cosine": 0.8247560159233132,
1375
- "eval_pearson_dot": 0.7799855816644161,
1376
- "eval_pearson_euclidean": 0.8221947952436466,
1377
- "eval_pearson_manhattan": 0.823206574209081,
1378
- "eval_runtime": 6.3641,
1379
- "eval_samples_per_second": 235.697,
1380
- "eval_spearman_cosine": 0.8268195350952193,
1381
- "eval_spearman_dot": 0.7772130800409067,
1382
- "eval_spearman_euclidean": 0.8274185345815266,
1383
- "eval_spearman_manhattan": 0.8282217123816773,
1384
- "eval_steps_per_second": 14.77,
1385
- "step": 1600
1386
- },
1387
- {
1388
- "epoch": 0.7544517338331771,
1389
- "grad_norm": 1.8070961236953735,
1390
- "learning_rate": 4.5284676663542644e-05,
1391
- "loss": 0.2939,
1392
- "step": 1610
1393
- },
1394
- {
1395
- "epoch": 0.7591377694470478,
1396
- "grad_norm": 1.3418357372283936,
1397
- "learning_rate": 4.525538894095596e-05,
1398
- "loss": 0.3122,
1399
- "step": 1620
1400
- },
1401
- {
1402
- "epoch": 0.7638238050609185,
1403
- "grad_norm": 1.4164036512374878,
1404
- "learning_rate": 4.522610121836926e-05,
1405
- "loss": 0.294,
1406
- "step": 1630
1407
- },
1408
- {
1409
- "epoch": 0.7685098406747891,
1410
- "grad_norm": 1.4862034320831299,
1411
- "learning_rate": 4.519681349578257e-05,
1412
- "loss": 0.3087,
1413
- "step": 1640
1414
- },
1415
- {
1416
- "epoch": 0.7731958762886598,
1417
- "grad_norm": 1.3497341871261597,
1418
- "learning_rate": 4.516752577319588e-05,
1419
- "loss": 0.3135,
1420
- "step": 1650
1421
- },
1422
- {
1423
- "epoch": 0.7778819119025304,
1424
- "grad_norm": 1.4912623167037964,
1425
- "learning_rate": 4.513823805060919e-05,
1426
- "loss": 0.3556,
1427
- "step": 1660
1428
- },
1429
- {
1430
- "epoch": 0.7825679475164011,
1431
- "grad_norm": 1.4625390768051147,
1432
- "learning_rate": 4.5108950328022495e-05,
1433
- "loss": 0.2913,
1434
- "step": 1670
1435
- },
1436
- {
1437
- "epoch": 0.7872539831302718,
1438
- "grad_norm": 1.7304317951202393,
1439
- "learning_rate": 4.5079662605435804e-05,
1440
- "loss": 0.3139,
1441
- "step": 1680
1442
- },
1443
- {
1444
- "epoch": 0.7919400187441424,
1445
- "grad_norm": 1.4902634620666504,
1446
- "learning_rate": 4.505037488284911e-05,
1447
- "loss": 0.3286,
1448
- "step": 1690
1449
- },
1450
- {
1451
- "epoch": 0.7966260543580131,
1452
- "grad_norm": 1.2882981300354004,
1453
- "learning_rate": 4.502108716026242e-05,
1454
- "loss": 0.3199,
1455
- "step": 1700
1456
- },
1457
- {
1458
- "epoch": 0.7966260543580131,
1459
- "eval_loss": 0.05109428986907005,
1460
- "eval_pearson_cosine": 0.821506482740503,
1461
- "eval_pearson_dot": 0.7701997371934368,
1462
- "eval_pearson_euclidean": 0.8256940312734855,
1463
- "eval_pearson_manhattan": 0.8266471352779732,
1464
- "eval_runtime": 5.9909,
1465
- "eval_samples_per_second": 250.381,
1466
- "eval_spearman_cosine": 0.8238836226780246,
1467
- "eval_spearman_dot": 0.7658343447023538,
1468
- "eval_spearman_euclidean": 0.8307515392696511,
1469
- "eval_spearman_manhattan": 0.8322184136990032,
1470
- "eval_steps_per_second": 15.691,
1471
- "step": 1700
1472
- },
1473
- {
1474
- "epoch": 0.8013120899718837,
1475
- "grad_norm": 1.4731862545013428,
1476
- "learning_rate": 4.499179943767573e-05,
1477
- "loss": 0.3262,
1478
- "step": 1710
1479
- },
1480
- {
1481
- "epoch": 0.8059981255857545,
1482
- "grad_norm": 1.440738320350647,
1483
- "learning_rate": 4.496251171508904e-05,
1484
- "loss": 0.2911,
1485
- "step": 1720
1486
- },
1487
- {
1488
- "epoch": 0.8106841611996252,
1489
- "grad_norm": 1.594072699546814,
1490
- "learning_rate": 4.4933223992502347e-05,
1491
- "loss": 0.3067,
1492
- "step": 1730
1493
- },
1494
- {
1495
- "epoch": 0.8153701968134958,
1496
- "grad_norm": 1.2838362455368042,
1497
- "learning_rate": 4.4903936269915655e-05,
1498
- "loss": 0.2976,
1499
- "step": 1740
1500
- },
1501
- {
1502
- "epoch": 0.8200562324273665,
1503
- "grad_norm": 1.3946473598480225,
1504
- "learning_rate": 4.4874648547328964e-05,
1505
- "loss": 0.2945,
1506
- "step": 1750
1507
- },
1508
- {
1509
- "epoch": 0.8247422680412371,
1510
- "grad_norm": 1.5584791898727417,
1511
- "learning_rate": 4.484536082474227e-05,
1512
- "loss": 0.3196,
1513
- "step": 1760
1514
- },
1515
- {
1516
- "epoch": 0.8294283036551078,
1517
- "grad_norm": 1.5150123834609985,
1518
- "learning_rate": 4.481607310215558e-05,
1519
- "loss": 0.2792,
1520
- "step": 1770
1521
- },
1522
- {
1523
- "epoch": 0.8341143392689785,
1524
- "grad_norm": 1.5679230690002441,
1525
- "learning_rate": 4.478678537956888e-05,
1526
- "loss": 0.289,
1527
- "step": 1780
1528
- },
1529
- {
1530
- "epoch": 0.8388003748828491,
1531
- "grad_norm": 1.100917100906372,
1532
- "learning_rate": 4.47574976569822e-05,
1533
- "loss": 0.3021,
1534
- "step": 1790
1535
- },
1536
- {
1537
- "epoch": 0.8434864104967198,
1538
- "grad_norm": 1.6804919242858887,
1539
- "learning_rate": 4.4728209934395506e-05,
1540
- "loss": 0.2431,
1541
- "step": 1800
1542
- },
1543
- {
1544
- "epoch": 0.8434864104967198,
1545
- "eval_loss": 0.04816513508558273,
1546
- "eval_pearson_cosine": 0.8270891703367056,
1547
- "eval_pearson_dot": 0.7790513762200462,
1548
- "eval_pearson_euclidean": 0.8276507097787587,
1549
- "eval_pearson_manhattan": 0.8282151299272726,
1550
- "eval_runtime": 6.0115,
1551
- "eval_samples_per_second": 249.523,
1552
- "eval_spearman_cosine": 0.8286758008396278,
1553
- "eval_spearman_dot": 0.7748761234075037,
1554
- "eval_spearman_euclidean": 0.8326421047981132,
1555
- "eval_spearman_manhattan": 0.8332581593954519,
1556
- "eval_steps_per_second": 15.637,
1557
- "step": 1800
1558
- },
1559
- {
1560
- "epoch": 0.8481724461105904,
1561
- "grad_norm": 1.3295296430587769,
1562
- "learning_rate": 4.4698922211808815e-05,
1563
- "loss": 0.2877,
1564
- "step": 1810
1565
- },
1566
- {
1567
- "epoch": 0.8528584817244611,
1568
- "grad_norm": 1.3233381509780884,
1569
- "learning_rate": 4.466963448922212e-05,
1570
- "loss": 0.2982,
1571
- "step": 1820
1572
- },
1573
- {
1574
- "epoch": 0.8575445173383318,
1575
- "grad_norm": 1.5737247467041016,
1576
- "learning_rate": 4.4640346766635425e-05,
1577
- "loss": 0.2939,
1578
- "step": 1830
1579
- },
1580
- {
1581
- "epoch": 0.8622305529522024,
1582
- "grad_norm": 1.4237866401672363,
1583
- "learning_rate": 4.4611059044048734e-05,
1584
- "loss": 0.3154,
1585
- "step": 1840
1586
- },
1587
- {
1588
- "epoch": 0.8669165885660731,
1589
- "grad_norm": 1.4213505983352661,
1590
- "learning_rate": 4.458177132146205e-05,
1591
- "loss": 0.3085,
1592
- "step": 1850
1593
- },
1594
- {
1595
- "epoch": 0.8716026241799437,
1596
- "grad_norm": 1.4691981077194214,
1597
- "learning_rate": 4.455248359887536e-05,
1598
- "loss": 0.3141,
1599
- "step": 1860
1600
- },
1601
- {
1602
- "epoch": 0.8762886597938144,
1603
- "grad_norm": 1.2567983865737915,
1604
- "learning_rate": 4.452319587628866e-05,
1605
- "loss": 0.2998,
1606
- "step": 1870
1607
- },
1608
- {
1609
- "epoch": 0.8809746954076851,
1610
- "grad_norm": 1.359161615371704,
1611
- "learning_rate": 4.449390815370197e-05,
1612
- "loss": 0.2891,
1613
- "step": 1880
1614
- },
1615
- {
1616
- "epoch": 0.8856607310215557,
1617
- "grad_norm": 1.4557381868362427,
1618
- "learning_rate": 4.4464620431115277e-05,
1619
- "loss": 0.3068,
1620
- "step": 1890
1621
- },
1622
- {
1623
- "epoch": 0.8903467666354264,
1624
- "grad_norm": 1.2976425886154175,
1625
- "learning_rate": 4.4435332708528585e-05,
1626
- "loss": 0.3051,
1627
- "step": 1900
1628
- },
1629
- {
1630
- "epoch": 0.8903467666354264,
1631
- "eval_loss": 0.04645664617419243,
1632
- "eval_pearson_cosine": 0.8277422687657179,
1633
- "eval_pearson_dot": 0.7813512072475213,
1634
- "eval_pearson_euclidean": 0.8249433703124964,
1635
- "eval_pearson_manhattan": 0.8257048412620271,
1636
- "eval_runtime": 6.5216,
1637
- "eval_samples_per_second": 230.006,
1638
- "eval_spearman_cosine": 0.829450894301767,
1639
- "eval_spearman_dot": 0.778239364994612,
1640
- "eval_spearman_euclidean": 0.8318708700161123,
1641
- "eval_spearman_manhattan": 0.8323977739737524,
1642
- "eval_steps_per_second": 14.414,
1643
- "step": 1900
1644
- },
1645
- {
1646
- "epoch": 0.895032802249297,
1647
- "grad_norm": 2.003398895263672,
1648
- "learning_rate": 4.44060449859419e-05,
1649
- "loss": 0.3261,
1650
- "step": 1910
1651
- },
1652
- {
1653
- "epoch": 0.8997188378631678,
1654
- "grad_norm": 1.5879777669906616,
1655
- "learning_rate": 4.43767572633552e-05,
1656
- "loss": 0.2905,
1657
- "step": 1920
1658
- },
1659
- {
1660
- "epoch": 0.9044048734770385,
1661
- "grad_norm": 1.239495873451233,
1662
- "learning_rate": 4.434746954076851e-05,
1663
- "loss": 0.3005,
1664
- "step": 1930
1665
- },
1666
- {
1667
- "epoch": 0.9090909090909091,
1668
- "grad_norm": 1.643388271331787,
1669
- "learning_rate": 4.431818181818182e-05,
1670
- "loss": 0.2901,
1671
- "step": 1940
1672
- },
1673
- {
1674
- "epoch": 0.9137769447047798,
1675
- "grad_norm": 1.5589861869812012,
1676
- "learning_rate": 4.428889409559513e-05,
1677
- "loss": 0.269,
1678
- "step": 1950
1679
- },
1680
- {
1681
- "epoch": 0.9184629803186504,
1682
- "grad_norm": 1.9895987510681152,
1683
- "learning_rate": 4.4259606373008436e-05,
1684
- "loss": 0.3354,
1685
- "step": 1960
1686
- },
1687
- {
1688
- "epoch": 0.9231490159325211,
1689
- "grad_norm": 1.5158389806747437,
1690
- "learning_rate": 4.4230318650421745e-05,
1691
- "loss": 0.2987,
1692
- "step": 1970
1693
- },
1694
- {
1695
- "epoch": 0.9278350515463918,
1696
- "grad_norm": 1.498703956604004,
1697
- "learning_rate": 4.4201030927835053e-05,
1698
- "loss": 0.2703,
1699
- "step": 1980
1700
- },
1701
- {
1702
- "epoch": 0.9325210871602624,
1703
- "grad_norm": 1.609595537185669,
1704
- "learning_rate": 4.417174320524836e-05,
1705
- "loss": 0.3511,
1706
- "step": 1990
1707
- },
1708
- {
1709
- "epoch": 0.9372071227741331,
1710
- "grad_norm": 1.5775402784347534,
1711
- "learning_rate": 4.414245548266167e-05,
1712
- "loss": 0.3287,
1713
- "step": 2000
1714
- },
1715
- {
1716
- "epoch": 0.9372071227741331,
1717
- "eval_loss": 0.05509389936923981,
1718
- "eval_pearson_cosine": 0.8207199970543648,
1719
- "eval_pearson_dot": 0.7620359249952955,
1720
- "eval_pearson_euclidean": 0.8229490445973937,
1721
- "eval_pearson_manhattan": 0.8238150410767417,
1722
- "eval_runtime": 6.0216,
1723
- "eval_samples_per_second": 249.103,
1724
- "eval_spearman_cosine": 0.8243786401215701,
1725
- "eval_spearman_dot": 0.7569267777069086,
1726
- "eval_spearman_euclidean": 0.8286694906125902,
1727
- "eval_spearman_manhattan": 0.8295593456330026,
1728
- "eval_steps_per_second": 15.61,
1729
- "step": 2000
1730
- },
1731
- {
1732
- "epoch": 0.9418931583880038,
1733
- "grad_norm": 1.1670805215835571,
1734
- "learning_rate": 4.411316776007498e-05,
1735
- "loss": 0.2544,
1736
- "step": 2010
1737
- },
1738
- {
1739
- "epoch": 0.9465791940018744,
1740
- "grad_norm": 1.6743440628051758,
1741
- "learning_rate": 4.408388003748829e-05,
1742
- "loss": 0.2866,
1743
- "step": 2020
1744
- },
1745
- {
1746
- "epoch": 0.9512652296157451,
1747
- "grad_norm": 1.5439425706863403,
1748
- "learning_rate": 4.4054592314901596e-05,
1749
- "loss": 0.286,
1750
- "step": 2030
1751
- },
1752
- {
1753
- "epoch": 0.9559512652296157,
1754
- "grad_norm": 1.317328691482544,
1755
- "learning_rate": 4.4025304592314905e-05,
1756
- "loss": 0.2674,
1757
- "step": 2040
1758
- },
1759
- {
1760
- "epoch": 0.9606373008434864,
1761
- "grad_norm": 1.3477058410644531,
1762
- "learning_rate": 4.399601686972821e-05,
1763
- "loss": 0.3005,
1764
- "step": 2050
1765
- },
1766
- {
1767
- "epoch": 0.9653233364573571,
1768
- "grad_norm": 1.8991141319274902,
1769
- "learning_rate": 4.396672914714152e-05,
1770
- "loss": 0.2924,
1771
- "step": 2060
1772
- },
1773
- {
1774
- "epoch": 0.9700093720712277,
1775
- "grad_norm": 1.406160593032837,
1776
- "learning_rate": 4.3937441424554824e-05,
1777
- "loss": 0.2977,
1778
- "step": 2070
1779
- },
1780
- {
1781
- "epoch": 0.9746954076850984,
1782
- "grad_norm": 1.7128149271011353,
1783
- "learning_rate": 4.390815370196814e-05,
1784
- "loss": 0.3003,
1785
- "step": 2080
1786
- },
1787
- {
1788
- "epoch": 0.979381443298969,
1789
- "grad_norm": 1.2714091539382935,
1790
- "learning_rate": 4.387886597938145e-05,
1791
- "loss": 0.2807,
1792
- "step": 2090
1793
- },
1794
- {
1795
- "epoch": 0.9840674789128397,
1796
- "grad_norm": 1.1022099256515503,
1797
- "learning_rate": 4.3849578256794756e-05,
1798
- "loss": 0.2889,
1799
- "step": 2100
1800
- },
1801
- {
1802
- "epoch": 0.9840674789128397,
1803
- "eval_loss": 0.04628630727529526,
1804
- "eval_pearson_cosine": 0.8264380630920911,
1805
- "eval_pearson_dot": 0.765122231613347,
1806
- "eval_pearson_euclidean": 0.8297712335935188,
1807
- "eval_pearson_manhattan": 0.8303968843751832,
1808
- "eval_runtime": 5.8458,
1809
- "eval_samples_per_second": 256.595,
1810
- "eval_spearman_cosine": 0.8280517554752953,
1811
- "eval_spearman_dot": 0.7620029087019149,
1812
- "eval_spearman_euclidean": 0.8362970969318975,
1813
- "eval_spearman_manhattan": 0.8368418361210694,
1814
- "eval_steps_per_second": 16.08,
1815
- "step": 2100
1816
- },
1817
- {
1818
- "epoch": 0.9887535145267105,
1819
- "grad_norm": 1.5780407190322876,
1820
- "learning_rate": 4.3820290534208064e-05,
1821
- "loss": 0.3019,
1822
- "step": 2110
1823
- },
1824
- {
1825
- "epoch": 0.993439550140581,
1826
- "grad_norm": 1.5804523229599,
1827
- "learning_rate": 4.3791002811621366e-05,
1828
- "loss": 0.254,
1829
- "step": 2120
1830
- },
1831
- {
1832
- "epoch": 0.9981255857544518,
1833
- "grad_norm": 1.5754133462905884,
1834
- "learning_rate": 4.3761715089034675e-05,
1835
- "loss": 0.2948,
1836
- "step": 2130
1837
  }
1838
  ],
1839
  "logging_steps": 10,
@@ -1848,7 +371,7 @@
1848
  "should_evaluate": false,
1849
  "should_log": false,
1850
  "should_save": true,
1851
- "should_training_stop": true
1852
  },
1853
  "attributes": {}
1854
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.23430178069353327,
5
+ "eval_steps": 2.0,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.004686035613870665,
13
+ "grad_norm": 4.305652141571045,
14
  "learning_rate": 4.997071227741331e-05,
15
+ "loss": 2.2476,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.00937207122774133,
20
+ "grad_norm": 3.343132972717285,
21
  "learning_rate": 4.994142455482662e-05,
22
+ "loss": 1.2208,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.014058106841611996,
27
+ "grad_norm": 3.07961368560791,
28
  "learning_rate": 4.991213683223993e-05,
29
+ "loss": 0.9737,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.01874414245548266,
34
+ "grad_norm": 2.348374128341675,
35
  "learning_rate": 4.9882849109653237e-05,
36
+ "loss": 0.8232,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.023430178069353328,
41
+ "grad_norm": 2.149184465408325,
42
  "learning_rate": 4.9853561387066545e-05,
43
+ "loss": 0.7357,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.028116213683223992,
48
+ "grad_norm": 2.793274164199829,
49
  "learning_rate": 4.9824273664479854e-05,
50
+ "loss": 0.6115,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.03280224929709466,
55
+ "grad_norm": 2.3996222019195557,
56
  "learning_rate": 4.979498594189316e-05,
57
+ "loss": 0.5681,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.03748828491096532,
62
+ "grad_norm": 2.7164816856384277,
63
  "learning_rate": 4.9765698219306464e-05,
64
+ "loss": 0.6132,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.04217432052483599,
69
+ "grad_norm": 2.3677186965942383,
70
  "learning_rate": 4.973641049671978e-05,
71
+ "loss": 0.5407,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.046860356138706656,
76
+ "grad_norm": 2.1071460247039795,
77
  "learning_rate": 4.970712277413309e-05,
78
+ "loss": 0.5947,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.05154639175257732,
83
+ "grad_norm": 2.226364850997925,
84
  "learning_rate": 4.9677835051546396e-05,
85
+ "loss": 0.532,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.056232427366447985,
90
+ "grad_norm": 1.9689487218856812,
91
  "learning_rate": 4.9648547328959705e-05,
92
+ "loss": 0.4874,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.06091846298031865,
97
+ "grad_norm": 2.2253146171569824,
98
  "learning_rate": 4.961925960637301e-05,
99
+ "loss": 0.5034,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.06560449859418932,
104
+ "grad_norm": 1.8077352046966553,
105
  "learning_rate": 4.9589971883786315e-05,
106
+ "loss": 0.4774,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.07029053420805999,
111
+ "grad_norm": 1.9207241535186768,
112
  "learning_rate": 4.956068416119963e-05,
113
+ "loss": 0.5066,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.07497656982193064,
118
+ "grad_norm": 2.000474691390991,
119
  "learning_rate": 4.953139643861294e-05,
120
+ "loss": 0.5007,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.07966260543580131,
125
+ "grad_norm": 2.3965399265289307,
126
  "learning_rate": 4.950210871602625e-05,
127
+ "loss": 0.4752,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.08434864104967198,
132
+ "grad_norm": 2.164004325866699,
133
  "learning_rate": 4.947282099343955e-05,
134
+ "loss": 0.4241,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.08903467666354264,
139
+ "grad_norm": 2.105633497238159,
140
  "learning_rate": 4.944353327085286e-05,
141
+ "loss": 0.4975,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.09372071227741331,
146
+ "grad_norm": 2.2528748512268066,
147
  "learning_rate": 4.9414245548266166e-05,
148
+ "loss": 0.4117,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.09840674789128398,
153
+ "grad_norm": 2.4957821369171143,
154
  "learning_rate": 4.938495782567948e-05,
155
+ "loss": 0.4842,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.10309278350515463,
160
+ "grad_norm": 1.6977312564849854,
161
  "learning_rate": 4.935567010309279e-05,
162
+ "loss": 0.4407,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.1077788191190253,
167
+ "grad_norm": 2.498537302017212,
168
  "learning_rate": 4.932638238050609e-05,
169
+ "loss": 0.4402,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.11246485473289597,
174
+ "grad_norm": 1.9550998210906982,
175
  "learning_rate": 4.92970946579194e-05,
176
+ "loss": 0.4062,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.11715089034676664,
181
+ "grad_norm": 1.692822813987732,
182
  "learning_rate": 4.926780693533271e-05,
183
+ "loss": 0.4135,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.1218369259606373,
188
+ "grad_norm": 1.936856985092163,
189
  "learning_rate": 4.923851921274602e-05,
190
+ "loss": 0.4518,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.12652296157450796,
195
+ "grad_norm": 2.509472370147705,
196
  "learning_rate": 4.920923149015933e-05,
197
+ "loss": 0.4065,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.13120899718837864,
202
+ "grad_norm": 1.993790864944458,
203
  "learning_rate": 4.9179943767572635e-05,
204
+ "loss": 0.4252,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.1358950328022493,
209
+ "grad_norm": 2.542051315307617,
210
  "learning_rate": 4.9150656044985943e-05,
211
+ "loss": 0.4342,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.14058106841611998,
216
+ "grad_norm": 2.0401690006256104,
217
  "learning_rate": 4.912136832239925e-05,
218
+ "loss": 0.4005,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.14526710402999063,
223
+ "grad_norm": 2.234344005584717,
224
  "learning_rate": 4.909208059981256e-05,
225
+ "loss": 0.4044,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.14995313964386128,
230
+ "grad_norm": 2.4048752784729004,
231
  "learning_rate": 4.906279287722587e-05,
232
+ "loss": 0.3832,
233
  "step": 320
234
  },
235
  {
236
  "epoch": 0.15463917525773196,
237
+ "grad_norm": 2.027322769165039,
238
  "learning_rate": 4.903350515463918e-05,
239
+ "loss": 0.4425,
240
  "step": 330
241
  },
242
  {
243
  "epoch": 0.15932521087160262,
244
+ "grad_norm": 1.7849469184875488,
245
  "learning_rate": 4.9004217432052486e-05,
246
+ "loss": 0.4034,
247
  "step": 340
248
  },
249
  {
250
  "epoch": 0.1640112464854733,
251
+ "grad_norm": 1.865513563156128,
252
  "learning_rate": 4.8974929709465795e-05,
253
+ "loss": 0.4256,
254
  "step": 350
255
  },
256
  {
257
  "epoch": 0.16869728209934395,
258
+ "grad_norm": 2.17820143699646,
259
  "learning_rate": 4.89456419868791e-05,
260
+ "loss": 0.388,
261
  "step": 360
262
  },
263
  {
264
  "epoch": 0.1733833177132146,
265
+ "grad_norm": 2.6553549766540527,
266
  "learning_rate": 4.891635426429241e-05,
267
+ "loss": 0.3645,
268
  "step": 370
269
  },
270
  {
271
  "epoch": 0.1780693533270853,
272
+ "grad_norm": 2.155061960220337,
273
  "learning_rate": 4.888706654170572e-05,
274
+ "loss": 0.3819,
275
  "step": 380
276
  },
277
  {
278
  "epoch": 0.18275538894095594,
279
+ "grad_norm": 1.9706778526306152,
280
  "learning_rate": 4.885777881911903e-05,
281
+ "loss": 0.3959,
282
  "step": 390
283
  },
284
  {
285
  "epoch": 0.18744142455482662,
286
+ "grad_norm": 2.111262321472168,
287
  "learning_rate": 4.882849109653234e-05,
288
+ "loss": 0.3929,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  "step": 400
290
  },
291
  {
292
  "epoch": 0.19212746016869728,
293
+ "grad_norm": 2.65875244140625,
294
  "learning_rate": 4.8799203373945646e-05,
295
+ "loss": 0.4151,
296
  "step": 410
297
  },
298
  {
299
  "epoch": 0.19681349578256796,
300
+ "grad_norm": 1.8690752983093262,
301
  "learning_rate": 4.8769915651358954e-05,
302
+ "loss": 0.3823,
303
  "step": 420
304
  },
305
  {
306
  "epoch": 0.2014995313964386,
307
+ "grad_norm": 2.35809326171875,
308
  "learning_rate": 4.8740627928772256e-05,
309
+ "loss": 0.4079,
310
  "step": 430
311
  },
312
  {
313
  "epoch": 0.20618556701030927,
314
+ "grad_norm": 1.4293204545974731,
315
  "learning_rate": 4.871134020618557e-05,
316
+ "loss": 0.3732,
317
  "step": 440
318
  },
319
  {
320
  "epoch": 0.21087160262417995,
321
+ "grad_norm": 2.2345097064971924,
322
  "learning_rate": 4.868205248359888e-05,
323
+ "loss": 0.3513,
324
  "step": 450
325
  },
326
  {
327
  "epoch": 0.2155576382380506,
328
+ "grad_norm": 1.7603412866592407,
329
  "learning_rate": 4.865276476101219e-05,
330
+ "loss": 0.3872,
331
  "step": 460
332
  },
333
  {
334
  "epoch": 0.22024367385192128,
335
+ "grad_norm": 1.8551238775253296,
336
  "learning_rate": 4.86234770384255e-05,
337
+ "loss": 0.377,
338
  "step": 470
339
  },
340
  {
341
  "epoch": 0.22492970946579194,
342
+ "grad_norm": 2.2718453407287598,
343
  "learning_rate": 4.85941893158388e-05,
344
+ "loss": 0.3466,
345
  "step": 480
346
  },
347
  {
348
  "epoch": 0.2296157450796626,
349
+ "grad_norm": 2.021726608276367,
350
  "learning_rate": 4.856490159325211e-05,
351
+ "loss": 0.3778,
352
  "step": 490
353
  },
354
  {
355
  "epoch": 0.23430178069353327,
356
+ "grad_norm": 1.4741500616073608,
357
  "learning_rate": 4.853561387066542e-05,
358
+ "loss": 0.3862,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  }
361
  ],
362
  "logging_steps": 10,
 
371
  "should_evaluate": false,
372
  "should_log": false,
373
  "should_save": true,
374
+ "should_training_stop": false
375
  },
376
  "attributes": {}
377
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67f2a3ae5812e7bbaeb6ebb48a282b22fea35d0af73c1dd88733eb450fcd0add
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7be9e20c2c0889091baaa0347720d7888707c1703a6f97836e6beae35fe15fe
3
  size 5368