nielsbantilan commited on
Commit
47f4d80
1 Parent(s): 6eea387

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,16 +1,32 @@
1
  ---
2
- datasets:
3
- - yahma/alpaca-cleaned
4
- language:
5
- - en
6
- license: apache-2.0
7
- tags:
8
- - pytorch
9
- - causal-lm
10
- - llama2
11
- - fine-tuning
12
- - alpaca
13
-
14
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- # Llama-2-7b fine-tuned on LoRA alpaca-cleaned
 
1
  ---
2
+ library_name: peft
 
 
 
 
 
 
 
 
 
 
 
3
  ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
+
18
+ The following `bitsandbytes` quantization config was used during training:
19
+ - load_in_8bit: True
20
+ - load_in_4bit: False
21
+ - llm_int8_threshold: 6.0
22
+ - llm_int8_skip_modules: None
23
+ - llm_int8_enable_fp32_cpu_offload: False
24
+ - llm_int8_has_fp16_weight: False
25
+ - bnb_4bit_quant_type: fp4
26
+ - bnb_4bit_use_double_quant: False
27
+ - bnb_4bit_compute_dtype: float32
28
+ ### Framework versions
29
+
30
+ - PEFT 0.5.0.dev0
31
 
32
+ - PEFT 0.5.0.dev0
checkpoint-300/README.md CHANGED
@@ -5,15 +5,15 @@ library_name: peft
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
- - load_in_8bit: False
9
- - load_in_4bit: True
10
  - llm_int8_threshold: 6.0
11
  - llm_int8_skip_modules: None
12
  - llm_int8_enable_fp32_cpu_offload: False
13
  - llm_int8_has_fp16_weight: False
14
- - bnb_4bit_quant_type: nf4
15
- - bnb_4bit_use_double_quant: True
16
- - bnb_4bit_compute_dtype: float16
17
  ### Framework versions
18
 
19
 
 
5
 
6
 
7
  The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: True
9
+ - load_in_4bit: False
10
  - llm_int8_threshold: 6.0
11
  - llm_int8_skip_modules: None
12
  - llm_int8_enable_fp32_cpu_offload: False
13
  - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: fp4
15
+ - bnb_4bit_use_double_quant: False
16
+ - bnb_4bit_compute_dtype: float32
17
  ### Framework versions
18
 
19
 
checkpoint-300/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e194624690ed5347b1ee38663cafe4b90219064eeee73e8e90240e169d1c7a5
3
- size 12849093
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e50506142760259bd82a9c78175bec890b61a0e3fe649b140931d4f2ecea6bde
3
+ size 50492421
checkpoint-300/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:426b6985726cfa57b10a2ceb7852c6f85a389e9facba42145b01426ab7c985a4
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0226636856a5596306c2032091fb45bbfde326842fddb9676284dc01a2cd9733
3
  size 14575
checkpoint-300/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9850941e7721c4485c4be6cdc71482f604e4a686e50a540c13dcd9ef580d226b
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b283af915b370e85d1ae3d1dc0043f555ce603e26ec8b90b7113a4b578935a
3
  size 627
checkpoint-300/trainer_state.json CHANGED
@@ -9,1808 +9,1808 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 3.333333333333333e-07,
13
- "loss": 1.2235,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 6.666666666666666e-07,
19
- "loss": 1.1855,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
- "learning_rate": 1e-06,
25
- "loss": 1.0787,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
- "learning_rate": 1.3333333333333332e-06,
31
- "loss": 1.2554,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
- "learning_rate": 1.6666666666666669e-06,
37
- "loss": 1.303,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
- "learning_rate": 2e-06,
43
- "loss": 1.3766,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
- "learning_rate": 2.3333333333333336e-06,
49
- "loss": 1.3295,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
- "learning_rate": 2.6666666666666664e-06,
55
- "loss": 1.4096,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
- "learning_rate": 3e-06,
61
- "loss": 1.2782,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
- "learning_rate": 2.9999125880491853e-06,
67
- "loss": 1.3555,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
- "learning_rate": 2.9996503623845394e-06,
73
- "loss": 1.4765,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
- "learning_rate": 2.9992133535682728e-06,
79
- "loss": 1.4239,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
- "learning_rate": 2.998601612533441e-06,
85
- "loss": 1.4622,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
- "learning_rate": 2.9978152105780155e-06,
91
- "loss": 1.5584,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
- "learning_rate": 2.9968542393565676e-06,
97
- "loss": 1.4483,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
- "learning_rate": 2.9957188108695897e-06,
103
- "loss": 1.6347,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
- "learning_rate": 2.99440905745044e-06,
109
- "loss": 1.5348,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
- "learning_rate": 2.992925131749921e-06,
115
- "loss": 1.6787,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
- "learning_rate": 2.9912672067184863e-06,
121
- "loss": 1.6322,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
- "learning_rate": 2.9894354755860847e-06,
127
- "loss": 1.6904,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
- "learning_rate": 2.9874301518396377e-06,
133
- "loss": 1.5149,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
- "learning_rate": 2.98525146919816e-06,
139
- "loss": 1.6881,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
- "learning_rate": 2.982899681585518e-06,
145
- "loss": 1.8038,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
- "learning_rate": 2.980375063100836e-06,
151
- "loss": 1.7428,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
- "learning_rate": 2.9776779079865498e-06,
157
- "loss": 1.8379,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
- "learning_rate": 2.9748085305941124e-06,
163
- "loss": 1.8679,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
- "learning_rate": 2.9717672653473587e-06,
169
- "loss": 1.8585,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
- "learning_rate": 2.9685544667035257e-06,
175
- "loss": 1.787,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
- "learning_rate": 2.9651705091119422e-06,
181
- "loss": 1.9908,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
- "learning_rate": 2.9616157869703894e-06,
187
- "loss": 2.0559,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
- "learning_rate": 2.957890714579128e-06,
193
- "loss": 2.1067,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
- "learning_rate": 2.9539957260926184e-06,
199
- "loss": 2.1172,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.01,
204
- "learning_rate": 2.949931275468917e-06,
205
- "loss": 2.06,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.01,
210
- "learning_rate": 2.9456978364167667e-06,
211
- "loss": 2.2804,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.01,
216
- "learning_rate": 2.9412959023403906e-06,
217
- "loss": 2.3487,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.01,
222
- "learning_rate": 2.9367259862819805e-06,
223
- "loss": 2.3993,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.01,
228
- "learning_rate": 2.931988620861908e-06,
229
- "loss": 2.4545,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.01,
234
- "learning_rate": 2.9270843582166428e-06,
235
- "loss": 2.3358,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.01,
240
- "learning_rate": 2.9220137699344057e-06,
241
- "loss": 2.4548,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.01,
246
- "learning_rate": 2.9167774469885483e-06,
247
- "loss": 2.4675,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.01,
252
- "learning_rate": 2.911375999668675e-06,
253
- "loss": 2.468,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.01,
258
- "learning_rate": 2.905810057509516e-06,
259
- "loss": 2.3514,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.01,
264
- "learning_rate": 2.900080269217554e-06,
265
- "loss": 2.5067,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.01,
270
- "learning_rate": 2.8941873025954193e-06,
271
- "loss": 2.5034,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.01,
276
- "learning_rate": 2.8881318444640566e-06,
277
- "loss": 2.5753,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.01,
282
- "learning_rate": 2.881914600582677e-06,
283
- "loss": 2.5559,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.01,
288
- "learning_rate": 2.8755362955665014e-06,
289
- "loss": 2.7476,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.01,
294
- "learning_rate": 2.8689976728023105e-06,
295
- "loss": 2.6816,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.01,
300
- "learning_rate": 2.8622994943617984e-06,
301
- "loss": 2.7578,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.01,
306
- "learning_rate": 2.855442540912758e-06,
307
- "loss": 2.8156,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.01,
312
- "learning_rate": 2.848427611628093e-06,
313
- "loss": 1.1719,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.01,
318
- "learning_rate": 2.8412555240926745e-06,
319
- "loss": 1.189,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.01,
324
- "learning_rate": 2.8339271142080537e-06,
325
- "loss": 1.0689,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.01,
330
- "learning_rate": 2.8264432360950353e-06,
331
- "loss": 1.1861,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.01,
336
- "learning_rate": 2.8188047619941344e-06,
337
- "loss": 1.2506,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.01,
342
- "learning_rate": 2.8110125821639135e-06,
343
- "loss": 1.2313,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.01,
348
- "learning_rate": 2.803067604777227e-06,
349
- "loss": 1.1761,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.01,
354
- "learning_rate": 2.7949707558153703e-06,
355
- "loss": 1.2601,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.01,
360
- "learning_rate": 2.7867229789601615e-06,
361
- "loss": 1.364,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.01,
366
- "learning_rate": 2.778325235483954e-06,
367
- "loss": 1.3515,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.01,
372
- "learning_rate": 2.7697785041376006e-06,
373
- "loss": 1.3414,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.01,
378
- "learning_rate": 2.7610837810363814e-06,
379
- "loss": 1.2057,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.01,
384
- "learning_rate": 2.752242079543907e-06,
385
- "loss": 1.404,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.01,
390
- "learning_rate": 2.743254430154012e-06,
391
- "loss": 1.3719,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.01,
396
- "learning_rate": 2.734121880370652e-06,
397
- "loss": 1.4479,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.01,
402
- "learning_rate": 2.7248454945858163e-06,
403
- "loss": 1.4983,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.01,
408
- "learning_rate": 2.7154263539554765e-06,
409
- "loss": 1.477,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.01,
414
- "learning_rate": 2.7058655562735753e-06,
415
- "loss": 1.5738,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.01,
420
- "learning_rate": 2.696164215844081e-06,
421
- "loss": 1.635,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.01,
426
- "learning_rate": 2.6863234633511186e-06,
427
- "loss": 1.541,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.01,
432
- "learning_rate": 2.6763444457271836e-06,
433
- "loss": 1.6623,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.01,
438
- "learning_rate": 2.666228326019474e-06,
439
- "loss": 1.7319,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.01,
444
- "learning_rate": 2.655976283254334e-06,
445
- "loss": 1.6835,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.01,
450
- "learning_rate": 2.6455895122998405e-06,
451
- "loss": 1.7268,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.01,
456
- "learning_rate": 2.6350692237265428e-06,
457
- "loss": 1.7661,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.01,
462
- "learning_rate": 2.624416643666371e-06,
463
- "loss": 1.6734,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.01,
468
- "learning_rate": 2.6136330136697304e-06,
469
- "loss": 1.802,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.01,
474
- "learning_rate": 2.602719590560801e-06,
475
- "loss": 1.8868,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.01,
480
- "learning_rate": 2.591677646291054e-06,
481
- "loss": 1.785,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.01,
486
- "learning_rate": 2.58050846779101e-06,
487
- "loss": 2.0649,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.01,
492
- "learning_rate": 2.569213356820244e-06,
493
- "loss": 1.9862,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
- "learning_rate": 2.557793629815669e-06,
499
- "loss": 2.062,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
- "learning_rate": 2.5462506177381045e-06,
505
- "loss": 1.9782,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
- "learning_rate": 2.5345856659171565e-06,
511
- "loss": 2.1214,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
- "learning_rate": 2.522800133894418e-06,
517
- "loss": 2.3212,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
- "learning_rate": 2.510895395265016e-06,
523
- "loss": 2.2857,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
- "learning_rate": 2.498872837517522e-06,
529
- "loss": 2.1979,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
- "learning_rate": 2.486733861872236e-06,
535
- "loss": 2.332,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
- "learning_rate": 2.4744798831178817e-06,
541
- "loss": 2.3778,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
- "learning_rate": 2.4621123294467098e-06,
547
- "loss": 2.3099,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
- "learning_rate": 2.449632642288045e-06,
553
- "loss": 2.4029,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
- "learning_rate": 2.437042276140287e-06,
559
- "loss": 2.5416,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
- "learning_rate": 2.424342698401391e-06,
565
- "loss": 2.531,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
- "learning_rate": 2.4115353891978432e-06,
571
- "loss": 2.4672,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
- "learning_rate": 2.398621841212154e-06,
577
- "loss": 2.4891,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
- "learning_rate": 2.3856035595088842e-06,
583
- "loss": 2.5826,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
- "learning_rate": 2.372482061359234e-06,
589
- "loss": 2.5583,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.02,
594
- "learning_rate": 2.3592588760642046e-06,
595
- "loss": 2.5609,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.02,
600
- "learning_rate": 2.34593554477636e-06,
601
- "loss": 2.6917,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.02,
606
- "learning_rate": 2.332513620320205e-06,
607
- "loss": 2.6746,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.02,
612
- "learning_rate": 2.318994667011207e-06,
613
- "loss": 1.0723,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.02,
618
- "learning_rate": 2.305380260473476e-06,
619
- "loss": 1.1946,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.02,
624
- "learning_rate": 2.2916719874561227e-06,
625
- "loss": 1.1031,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.02,
630
- "learning_rate": 2.277871445648332e-06,
631
- "loss": 1.2431,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.02,
636
- "learning_rate": 2.2639802434931445e-06,
637
- "loss": 1.1853,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.02,
642
- "learning_rate": 2.25e-06,
643
- "loss": 1.3308,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.02,
648
- "learning_rate": 2.2359323445560408e-06,
649
- "loss": 1.2904,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.02,
654
- "learning_rate": 2.221778916736208e-06,
655
- "loss": 1.338,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.02,
660
- "learning_rate": 2.2075413661121492e-06,
661
- "loss": 1.1788,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.02,
666
- "learning_rate": 2.1932213520599652e-06,
667
- "loss": 1.2099,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.02,
672
- "learning_rate": 2.1788205435668085e-06,
673
- "loss": 1.3613,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.02,
678
- "learning_rate": 2.1643406190363625e-06,
679
- "loss": 1.3139,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.02,
684
- "learning_rate": 2.1497832660932298e-06,
685
- "loss": 1.3749,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.02,
690
- "learning_rate": 2.135150181386236e-06,
691
- "loss": 1.3537,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.02,
696
- "learning_rate": 2.1204430703906874e-06,
697
- "loss": 1.4139,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.02,
702
- "learning_rate": 2.1056636472096025e-06,
703
- "loss": 1.5552,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.02,
708
- "learning_rate": 2.090813634373931e-06,
709
- "loss": 1.5982,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.02,
714
- "learning_rate": 2.0758947626417945e-06,
715
- "loss": 1.5037,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.02,
720
- "learning_rate": 2.060908770796769e-06,
721
- "loss": 1.6662,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.02,
726
- "learning_rate": 2.0458574054452316e-06,
727
- "loss": 1.5392,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.02,
732
- "learning_rate": 2.0307424208127912e-06,
733
- "loss": 1.6647,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.02,
738
- "learning_rate": 2.0155655785398396e-06,
739
- "loss": 1.6124,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.02,
744
- "learning_rate": 2.000328647476231e-06,
745
- "loss": 1.674,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.02,
750
- "learning_rate": 1.985033403475123e-06,
751
- "loss": 1.674,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.02,
756
- "learning_rate": 1.969681629186004e-06,
757
- "loss": 1.6071,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.02,
762
- "learning_rate": 1.954275113846926e-06,
763
- "loss": 1.7843,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.02,
768
- "learning_rate": 1.9388156530759715e-06,
769
- "loss": 1.8194,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.02,
774
- "learning_rate": 1.9233050486619715e-06,
775
- "loss": 1.7772,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.02,
780
- "learning_rate": 1.9077451083545143e-06,
781
- "loss": 1.8367,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.02,
786
- "learning_rate": 1.8921376456532485e-06,
787
- "loss": 1.9344,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.02,
792
- "learning_rate": 1.8764844795965232e-06,
793
- "loss": 1.8485,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.02,
798
- "learning_rate": 1.8607874345493807e-06,
799
- "loss": 1.9676,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.02,
804
- "learning_rate": 1.8450483399909265e-06,
805
- "loss": 1.9576,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.02,
810
- "learning_rate": 1.8292690303011076e-06,
811
- "loss": 2.1381,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.02,
816
- "learning_rate": 1.813451344546913e-06,
817
- "loss": 2.2337,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.02,
822
- "learning_rate": 1.7975971262680348e-06,
823
- "loss": 2.2179,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.02,
828
- "learning_rate": 1.7817082232620054e-06,
829
- "loss": 2.3267,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.02,
834
- "learning_rate": 1.7657864873688345e-06,
835
- "loss": 2.2561,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.02,
840
- "learning_rate": 1.7498337742551817e-06,
841
- "loss": 2.3142,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.02,
846
- "learning_rate": 1.7338519431980798e-06,
847
- "loss": 2.428,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.02,
852
- "learning_rate": 1.7178428568682356e-06,
853
- "loss": 2.4711,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.02,
858
- "learning_rate": 1.701808381112938e-06,
859
- "loss": 2.4576,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.02,
864
- "learning_rate": 1.6857503847385956e-06,
865
- "loss": 2.4396,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.02,
870
- "learning_rate": 1.6696707392929268e-06,
871
- "loss": 2.4912,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.02,
876
- "learning_rate": 1.653571318846834e-06,
877
- "loss": 2.5406,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.02,
882
- "learning_rate": 1.6374539997759822e-06,
883
- "loss": 2.5187,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.02,
888
- "learning_rate": 1.6213206605421064e-06,
889
- "loss": 2.4131,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.02,
894
- "learning_rate": 1.605173181474081e-06,
895
- "loss": 2.5484,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.02,
900
- "learning_rate": 1.5890134445487679e-06,
901
- "loss": 2.4955,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.02,
906
- "learning_rate": 1.5728433331716726e-06,
907
- "loss": 2.6633,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.02,
912
- "learning_rate": 1.5566647319574351e-06,
913
- "loss": 1.0591,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.02,
918
- "learning_rate": 1.5404795265101808e-06,
919
- "loss": 1.2533,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.02,
924
- "learning_rate": 1.5242896032037523e-06,
925
- "loss": 1.1398,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.02,
930
- "learning_rate": 1.5080968489618567e-06,
931
- "loss": 1.1983,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.02,
936
- "learning_rate": 1.4919031510381438e-06,
937
- "loss": 1.2219,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.02,
942
- "learning_rate": 1.4757103967962477e-06,
943
- "loss": 1.2204,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.02,
948
- "learning_rate": 1.4595204734898199e-06,
949
- "loss": 1.2667,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.02,
954
- "learning_rate": 1.4433352680425654e-06,
955
- "loss": 1.1853,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.02,
960
- "learning_rate": 1.4271566668283281e-06,
961
- "loss": 1.2905,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.02,
966
- "learning_rate": 1.410986555451232e-06,
967
- "loss": 1.4073,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.02,
972
- "learning_rate": 1.3948268185259188e-06,
973
- "loss": 1.3725,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.03,
978
- "learning_rate": 1.3786793394578939e-06,
979
- "loss": 1.3658,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.03,
984
- "learning_rate": 1.362546000224018e-06,
985
- "loss": 1.3991,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.03,
990
- "learning_rate": 1.3464286811531663e-06,
991
- "loss": 1.3649,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.03,
996
- "learning_rate": 1.3303292607070737e-06,
997
- "loss": 1.4877,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.03,
1002
- "learning_rate": 1.314249615261405e-06,
1003
- "loss": 1.4299,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.03,
1008
- "learning_rate": 1.2981916188870622e-06,
1009
- "loss": 1.3683,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.03,
1014
- "learning_rate": 1.282157143131765e-06,
1015
- "loss": 1.473,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.03,
1020
- "learning_rate": 1.2661480568019203e-06,
1021
- "loss": 1.5913,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.03,
1026
- "learning_rate": 1.2501662257448184e-06,
1027
- "loss": 1.5045,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.03,
1032
- "learning_rate": 1.234213512631166e-06,
1033
- "loss": 1.5012,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.03,
1038
- "learning_rate": 1.218291776737995e-06,
1039
- "loss": 1.5784,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.03,
1044
- "learning_rate": 1.2024028737319653e-06,
1045
- "loss": 1.5583,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.03,
1050
- "learning_rate": 1.1865486554530874e-06,
1051
- "loss": 1.7044,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.03,
1056
- "learning_rate": 1.170730969698893e-06,
1057
- "loss": 1.7103,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.03,
1062
- "learning_rate": 1.154951660009074e-06,
1063
- "loss": 1.9039,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.03,
1068
- "learning_rate": 1.13921256545062e-06,
1069
- "loss": 1.8563,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.03,
1074
- "learning_rate": 1.1235155204034768e-06,
1075
- "loss": 1.6943,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.03,
1080
- "learning_rate": 1.1078623543467518e-06,
1081
- "loss": 1.8663,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.03,
1086
- "learning_rate": 1.0922548916454855e-06,
1087
- "loss": 1.8081,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.03,
1092
- "learning_rate": 1.0766949513380286e-06,
1093
- "loss": 1.7813,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.03,
1098
- "learning_rate": 1.061184346924029e-06,
1099
- "loss": 2.0464,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.03,
1104
- "learning_rate": 1.0457248861530742e-06,
1105
- "loss": 1.9243,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.03,
1110
- "learning_rate": 1.0303183708139966e-06,
1111
- "loss": 1.9261,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.03,
1116
- "learning_rate": 1.0149665965248775e-06,
1117
- "loss": 2.3096,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.03,
1122
- "learning_rate": 9.996713525237694e-07,
1123
- "loss": 2.2348,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.03,
1128
- "learning_rate": 9.8443442146016e-07,
1129
- "loss": 2.1942,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.03,
1134
- "learning_rate": 9.69257579187209e-07,
1135
- "loss": 2.2085,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.03,
1140
- "learning_rate": 9.54142594554769e-07,
1141
- "loss": 2.2091,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.03,
1146
- "learning_rate": 9.39091229203231e-07,
1147
- "loss": 2.1674,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.03,
1152
- "learning_rate": 9.241052373582058e-07,
1153
- "loss": 2.2736,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.03,
1158
- "learning_rate": 9.091863656260696e-07,
1159
- "loss": 2.2478,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.03,
1164
- "learning_rate": 8.943363527903977e-07,
1165
- "loss": 2.3178,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.03,
1170
- "learning_rate": 8.795569296093133e-07,
1171
- "loss": 2.398,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.03,
1176
- "learning_rate": 8.648498186137653e-07,
1177
- "loss": 2.489,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.03,
1182
- "learning_rate": 8.502167339067705e-07,
1183
- "loss": 2.3547,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.03,
1188
- "learning_rate": 8.356593809636371e-07,
1189
- "loss": 2.3467,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.03,
1194
- "learning_rate": 8.211794564331918e-07,
1195
- "loss": 2.4009,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.03,
1200
- "learning_rate": 8.067786479400346e-07,
1201
- "loss": 2.44,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.03,
1206
- "learning_rate": 7.924586338878512e-07,
1207
- "loss": 2.5891,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.03,
1212
- "learning_rate": 7.782210832637924e-07,
1213
- "loss": 1.2204,
1214
  "step": 201
1215
  },
1216
  {
1217
  "epoch": 0.03,
1218
- "learning_rate": 7.640676554439594e-07,
1219
- "loss": 1.1149,
1220
  "step": 202
1221
  },
1222
  {
1223
  "epoch": 0.03,
1224
- "learning_rate": 7.500000000000003e-07,
1225
- "loss": 1.1802,
1226
  "step": 203
1227
  },
1228
  {
1229
  "epoch": 0.03,
1230
- "learning_rate": 7.360197565068561e-07,
1231
- "loss": 1.0454,
1232
  "step": 204
1233
  },
1234
  {
1235
  "epoch": 0.03,
1236
- "learning_rate": 7.22128554351668e-07,
1237
- "loss": 1.1475,
1238
  "step": 205
1239
  },
1240
  {
1241
  "epoch": 0.03,
1242
- "learning_rate": 7.083280125438766e-07,
1243
- "loss": 1.1811,
1244
  "step": 206
1245
  },
1246
  {
1247
  "epoch": 0.03,
1248
- "learning_rate": 6.946197395265243e-07,
1249
- "loss": 1.2031,
1250
  "step": 207
1251
  },
1252
  {
1253
  "epoch": 0.03,
1254
- "learning_rate": 6.810053329887929e-07,
1255
- "loss": 1.3016,
1256
  "step": 208
1257
  },
1258
  {
1259
  "epoch": 0.03,
1260
- "learning_rate": 6.674863796797954e-07,
1261
- "loss": 1.1845,
1262
  "step": 209
1263
  },
1264
  {
1265
  "epoch": 0.03,
1266
- "learning_rate": 6.540644552236401e-07,
1267
- "loss": 1.2235,
1268
  "step": 210
1269
  },
1270
  {
1271
  "epoch": 0.03,
1272
- "learning_rate": 6.407411239357954e-07,
1273
- "loss": 1.3364,
1274
  "step": 211
1275
  },
1276
  {
1277
  "epoch": 0.03,
1278
- "learning_rate": 6.275179386407663e-07,
1279
- "loss": 1.2483,
1280
  "step": 212
1281
  },
1282
  {
1283
  "epoch": 0.03,
1284
- "learning_rate": 6.143964404911165e-07,
1285
- "loss": 1.3154,
1286
  "step": 213
1287
  },
1288
  {
1289
  "epoch": 0.03,
1290
- "learning_rate": 6.013781587878464e-07,
1291
- "loss": 1.2914,
1292
  "step": 214
1293
  },
1294
  {
1295
  "epoch": 0.03,
1296
- "learning_rate": 5.884646108021563e-07,
1297
- "loss": 1.324,
1298
  "step": 215
1299
  },
1300
  {
1301
  "epoch": 0.03,
1302
- "learning_rate": 5.756573015986089e-07,
1303
- "loss": 1.382,
1304
  "step": 216
1305
  },
1306
  {
1307
  "epoch": 0.03,
1308
- "learning_rate": 5.629577238597132e-07,
1309
- "loss": 1.2145,
1310
  "step": 217
1311
  },
1312
  {
1313
  "epoch": 0.03,
1314
- "learning_rate": 5.503673577119552e-07,
1315
- "loss": 1.4172,
1316
  "step": 218
1317
  },
1318
  {
1319
  "epoch": 0.03,
1320
- "learning_rate": 5.378876705532904e-07,
1321
- "loss": 1.4902,
1322
  "step": 219
1323
  },
1324
  {
1325
  "epoch": 0.03,
1326
- "learning_rate": 5.255201168821183e-07,
1327
- "loss": 1.5351,
1328
  "step": 220
1329
  },
1330
  {
1331
  "epoch": 0.03,
1332
- "learning_rate": 5.132661381277644e-07,
1333
- "loss": 1.4211,
1334
  "step": 221
1335
  },
1336
  {
1337
  "epoch": 0.03,
1338
- "learning_rate": 5.011271624824787e-07,
1339
- "loss": 1.5289,
1340
  "step": 222
1341
  },
1342
  {
1343
  "epoch": 0.03,
1344
- "learning_rate": 4.891046047349837e-07,
1345
- "loss": 1.5527,
1346
  "step": 223
1347
  },
1348
  {
1349
  "epoch": 0.03,
1350
- "learning_rate": 4.771998661055823e-07,
1351
- "loss": 1.5057,
1352
  "step": 224
1353
  },
1354
  {
1355
  "epoch": 0.03,
1356
- "learning_rate": 4.6541433408284356e-07,
1357
- "loss": 1.5804,
1358
  "step": 225
1359
  },
1360
  {
1361
  "epoch": 0.03,
1362
- "learning_rate": 4.5374938226189584e-07,
1363
- "loss": 1.6606,
1364
  "step": 226
1365
  },
1366
  {
1367
  "epoch": 0.04,
1368
- "learning_rate": 4.4220637018433163e-07,
1369
- "loss": 1.6492,
1370
  "step": 227
1371
  },
1372
  {
1373
  "epoch": 0.04,
1374
- "learning_rate": 4.3078664317975654e-07,
1375
- "loss": 1.6729,
1376
  "step": 228
1377
  },
1378
  {
1379
  "epoch": 0.04,
1380
- "learning_rate": 4.1949153220898987e-07,
1381
- "loss": 1.8831,
1382
  "step": 229
1383
  },
1384
  {
1385
  "epoch": 0.04,
1386
- "learning_rate": 4.0832235370894604e-07,
1387
- "loss": 1.8898,
1388
  "step": 230
1389
  },
1390
  {
1391
  "epoch": 0.04,
1392
- "learning_rate": 3.972804094391998e-07,
1393
- "loss": 1.6795,
1394
  "step": 231
1395
  },
1396
  {
1397
  "epoch": 0.04,
1398
- "learning_rate": 3.863669863302698e-07,
1399
- "loss": 1.9394,
1400
  "step": 232
1401
  },
1402
  {
1403
  "epoch": 0.04,
1404
- "learning_rate": 3.755833563336293e-07,
1405
- "loss": 1.9454,
1406
  "step": 233
1407
  },
1408
  {
1409
  "epoch": 0.04,
1410
- "learning_rate": 3.64930776273457e-07,
1411
- "loss": 2.014,
1412
  "step": 234
1413
  },
1414
  {
1415
  "epoch": 0.04,
1416
- "learning_rate": 3.544104877001596e-07,
1417
- "loss": 1.9824,
1418
  "step": 235
1419
  },
1420
  {
1421
  "epoch": 0.04,
1422
- "learning_rate": 3.440237167456663e-07,
1423
- "loss": 2.0745,
1424
  "step": 236
1425
  },
1426
  {
1427
  "epoch": 0.04,
1428
- "learning_rate": 3.337716739805264e-07,
1429
- "loss": 2.0536,
1430
  "step": 237
1431
  },
1432
  {
1433
  "epoch": 0.04,
1434
- "learning_rate": 3.2365555427281634e-07,
1435
- "loss": 2.2728,
1436
  "step": 238
1437
  },
1438
  {
1439
  "epoch": 0.04,
1440
- "learning_rate": 3.1367653664888173e-07,
1441
- "loss": 2.1603,
1442
  "step": 239
1443
  },
1444
  {
1445
  "epoch": 0.04,
1446
- "learning_rate": 3.0383578415591913e-07,
1447
- "loss": 2.1522,
1448
  "step": 240
1449
  },
1450
  {
1451
  "epoch": 0.04,
1452
- "learning_rate": 2.9413444372642496e-07,
1453
- "loss": 2.2954,
1454
  "step": 241
1455
  },
1456
  {
1457
  "epoch": 0.04,
1458
- "learning_rate": 2.8457364604452376e-07,
1459
- "loss": 2.1479,
1460
  "step": 242
1461
  },
1462
  {
1463
  "epoch": 0.04,
1464
- "learning_rate": 2.751545054141834e-07,
1465
- "loss": 2.3303,
1466
  "step": 243
1467
  },
1468
  {
1469
  "epoch": 0.04,
1470
- "learning_rate": 2.6587811962934823e-07,
1471
- "loss": 2.3258,
1472
  "step": 244
1473
  },
1474
  {
1475
  "epoch": 0.04,
1476
- "learning_rate": 2.567455698459882e-07,
1477
- "loss": 2.3271,
1478
  "step": 245
1479
  },
1480
  {
1481
  "epoch": 0.04,
1482
- "learning_rate": 2.4775792045609353e-07,
1483
- "loss": 2.3152,
1484
  "step": 246
1485
  },
1486
  {
1487
  "epoch": 0.04,
1488
- "learning_rate": 2.389162189636188e-07,
1489
- "loss": 2.4583,
1490
  "step": 247
1491
  },
1492
  {
1493
  "epoch": 0.04,
1494
- "learning_rate": 2.3022149586239972e-07,
1495
- "loss": 2.3737,
1496
  "step": 248
1497
  },
1498
  {
1499
  "epoch": 0.04,
1500
- "learning_rate": 2.2167476451604624e-07,
1501
- "loss": 2.5374,
1502
  "step": 249
1503
  },
1504
  {
1505
  "epoch": 0.04,
1506
- "learning_rate": 2.1327702103983864e-07,
1507
- "loss": 2.5436,
1508
  "step": 250
1509
  },
1510
  {
1511
  "epoch": 0.04,
1512
- "learning_rate": 2.0502924418463014e-07,
1513
- "loss": 1.1269,
1514
  "step": 251
1515
  },
1516
  {
1517
  "epoch": 0.04,
1518
- "learning_rate": 1.9693239522277327e-07,
1519
- "loss": 1.1588,
1520
  "step": 252
1521
  },
1522
  {
1523
  "epoch": 0.04,
1524
- "learning_rate": 1.8898741783608642e-07,
1525
- "loss": 1.127,
1526
  "step": 253
1527
  },
1528
  {
1529
  "epoch": 0.04,
1530
- "learning_rate": 1.811952380058657e-07,
1531
- "loss": 1.1566,
1532
  "step": 254
1533
  },
1534
  {
1535
  "epoch": 0.04,
1536
- "learning_rate": 1.7355676390496482e-07,
1537
- "loss": 1.2204,
1538
  "step": 255
1539
  },
1540
  {
1541
  "epoch": 0.04,
1542
- "learning_rate": 1.660728857919464e-07,
1543
- "loss": 1.1514,
1544
  "step": 256
1545
  },
1546
  {
1547
  "epoch": 0.04,
1548
- "learning_rate": 1.5874447590732537e-07,
1549
- "loss": 1.1718,
1550
  "step": 257
1551
  },
1552
  {
1553
  "epoch": 0.04,
1554
- "learning_rate": 1.5157238837190719e-07,
1555
- "loss": 1.3254,
1556
  "step": 258
1557
  },
1558
  {
1559
  "epoch": 0.04,
1560
- "learning_rate": 1.4455745908724226e-07,
1561
- "loss": 1.2906,
1562
  "step": 259
1563
  },
1564
  {
1565
  "epoch": 0.04,
1566
- "learning_rate": 1.377005056382018e-07,
1567
- "loss": 1.1925,
1568
  "step": 260
1569
  },
1570
  {
1571
  "epoch": 0.04,
1572
- "learning_rate": 1.3100232719768996e-07,
1573
- "loss": 1.2717,
1574
  "step": 261
1575
  },
1576
  {
1577
  "epoch": 0.04,
1578
- "learning_rate": 1.2446370443349863e-07,
1579
- "loss": 1.2414,
1580
  "step": 262
1581
  },
1582
  {
1583
  "epoch": 0.04,
1584
- "learning_rate": 1.180853994173236e-07,
1585
- "loss": 1.2571,
1586
  "step": 263
1587
  },
1588
  {
1589
  "epoch": 0.04,
1590
- "learning_rate": 1.1186815553594382e-07,
1591
- "loss": 1.1211,
1592
  "step": 264
1593
  },
1594
  {
1595
  "epoch": 0.04,
1596
- "learning_rate": 1.058126974045811e-07,
1597
- "loss": 1.3869,
1598
  "step": 265
1599
  },
1600
  {
1601
  "epoch": 0.04,
1602
- "learning_rate": 9.991973078244638e-08,
1603
- "loss": 1.4214,
1604
  "step": 266
1605
  },
1606
  {
1607
  "epoch": 0.04,
1608
- "learning_rate": 9.418994249048474e-08,
1609
- "loss": 1.4063,
1610
  "step": 267
1611
  },
1612
  {
1613
  "epoch": 0.04,
1614
- "learning_rate": 8.862400033132573e-08,
1615
- "loss": 1.4621,
1616
  "step": 268
1617
  },
1618
  {
1619
  "epoch": 0.04,
1620
- "learning_rate": 8.322255301145204e-08,
1621
- "loss": 1.5625,
1622
  "step": 269
1623
  },
1624
  {
1625
  "epoch": 0.04,
1626
- "learning_rate": 7.798623006559436e-08,
1627
- "loss": 1.519,
1628
  "step": 270
1629
  },
1630
  {
1631
  "epoch": 0.04,
1632
- "learning_rate": 7.291564178335719e-08,
1633
- "loss": 1.5473,
1634
  "step": 271
1635
  },
1636
  {
1637
  "epoch": 0.04,
1638
- "learning_rate": 6.801137913809214e-08,
1639
- "loss": 1.5158,
1640
  "step": 272
1641
  },
1642
  {
1643
  "epoch": 0.04,
1644
- "learning_rate": 6.327401371801944e-08,
1645
- "loss": 1.6969,
1646
  "step": 273
1647
  },
1648
  {
1649
  "epoch": 0.04,
1650
- "learning_rate": 5.870409765960966e-08,
1651
- "loss": 1.6519,
1652
  "step": 274
1653
  },
1654
  {
1655
  "epoch": 0.04,
1656
- "learning_rate": 5.430216358323309e-08,
1657
- "loss": 1.5961,
1658
  "step": 275
1659
  },
1660
  {
1661
  "epoch": 0.04,
1662
- "learning_rate": 5.00687245310833e-08,
1663
- "loss": 1.6128,
1664
  "step": 276
1665
  },
1666
  {
1667
  "epoch": 0.04,
1668
- "learning_rate": 4.60042739073816e-08,
1669
- "loss": 1.813,
1670
  "step": 277
1671
  },
1672
  {
1673
  "epoch": 0.04,
1674
- "learning_rate": 4.2109285420872055e-08,
1675
- "loss": 1.6032,
1676
  "step": 278
1677
  },
1678
  {
1679
  "epoch": 0.04,
1680
- "learning_rate": 3.838421302961098e-08,
1681
- "loss": 1.8927,
1682
  "step": 279
1683
  },
1684
  {
1685
  "epoch": 0.04,
1686
- "learning_rate": 3.4829490888057424e-08,
1687
- "loss": 1.9264,
1688
  "step": 280
1689
  },
1690
  {
1691
  "epoch": 0.04,
1692
- "learning_rate": 3.1445533296474484e-08,
1693
- "loss": 1.7726,
1694
  "step": 281
1695
  },
1696
  {
1697
  "epoch": 0.04,
1698
- "learning_rate": 2.8232734652641424e-08,
1699
- "loss": 1.9217,
1700
  "step": 282
1701
  },
1702
  {
1703
  "epoch": 0.04,
1704
- "learning_rate": 2.5191469405887625e-08,
1705
- "loss": 1.9583,
1706
  "step": 283
1707
  },
1708
  {
1709
  "epoch": 0.04,
1710
- "learning_rate": 2.2322092013450313e-08,
1711
- "loss": 2.0714,
1712
  "step": 284
1713
  },
1714
  {
1715
  "epoch": 0.04,
1716
- "learning_rate": 1.962493689916395e-08,
1717
- "loss": 2.0708,
1718
  "step": 285
1719
  },
1720
  {
1721
  "epoch": 0.04,
1722
- "learning_rate": 1.7100318414482063e-08,
1723
- "loss": 2.0683,
1724
  "step": 286
1725
  },
1726
  {
1727
  "epoch": 0.04,
1728
- "learning_rate": 1.4748530801840076e-08,
1729
- "loss": 2.1105,
1730
  "step": 287
1731
  },
1732
  {
1733
  "epoch": 0.04,
1734
- "learning_rate": 1.2569848160362384e-08,
1735
- "loss": 2.2285,
1736
  "step": 288
1737
  },
1738
  {
1739
  "epoch": 0.04,
1740
- "learning_rate": 1.0564524413915422e-08,
1741
- "loss": 2.288,
1742
  "step": 289
1743
  },
1744
  {
1745
  "epoch": 0.04,
1746
- "learning_rate": 8.732793281513663e-09,
1747
- "loss": 2.2546,
1748
  "step": 290
1749
  },
1750
  {
1751
  "epoch": 0.04,
1752
- "learning_rate": 7.074868250079081e-09,
1753
- "loss": 2.156,
1754
  "step": 291
1755
  },
1756
  {
1757
  "epoch": 0.05,
1758
- "learning_rate": 5.590942549560052e-09,
1759
- "loss": 2.2368,
1760
  "step": 292
1761
  },
1762
  {
1763
  "epoch": 0.05,
1764
- "learning_rate": 4.2811891304105345e-09,
1765
- "loss": 2.2568,
1766
  "step": 293
1767
  },
1768
  {
1769
  "epoch": 0.05,
1770
- "learning_rate": 3.145760643432527e-09,
1771
- "loss": 2.2339,
1772
  "step": 294
1773
  },
1774
  {
1775
  "epoch": 0.05,
1776
- "learning_rate": 2.1847894219846343e-09,
1777
- "loss": 2.3651,
1778
  "step": 295
1779
  },
1780
  {
1781
  "epoch": 0.05,
1782
- "learning_rate": 1.3983874665589035e-09,
1783
- "loss": 2.492,
1784
  "step": 296
1785
  },
1786
  {
1787
  "epoch": 0.05,
1788
- "learning_rate": 7.866464317276001e-10,
1789
- "loss": 2.3863,
1790
  "step": 297
1791
  },
1792
  {
1793
  "epoch": 0.05,
1794
- "learning_rate": 3.496376154604186e-10,
1795
- "loss": 2.4482,
1796
  "step": 298
1797
  },
1798
  {
1799
  "epoch": 0.05,
1800
- "learning_rate": 8.741195081479747e-11,
1801
- "loss": 2.5423,
1802
  "step": 299
1803
  },
1804
  {
1805
  "epoch": 0.05,
1806
  "learning_rate": 0.0,
1807
- "loss": 2.5864,
1808
  "step": 300
1809
  }
1810
  ],
1811
  "max_steps": 300,
1812
  "num_train_epochs": 1,
1813
- "total_flos": 1.174396008923136e+16,
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }
 
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
+ "learning_rate": 1.1111111111111112e-05,
13
+ "loss": 1.1222,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
+ "learning_rate": 2.2222222222222223e-05,
19
+ "loss": 1.1901,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
+ "learning_rate": 3.3333333333333335e-05,
25
+ "loss": 1.2159,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
+ "learning_rate": 4.4444444444444447e-05,
31
+ "loss": 1.204,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
+ "learning_rate": 5.555555555555556e-05,
37
+ "loss": 1.2885,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
+ "learning_rate": 6.666666666666667e-05,
43
+ "loss": 1.1401,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
+ "learning_rate": 7.777777777777778e-05,
49
+ "loss": 1.1606,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
+ "learning_rate": 8.888888888888889e-05,
55
+ "loss": 1.3112,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
+ "learning_rate": 0.0001,
61
+ "loss": 1.2529,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
+ "learning_rate": 9.999708626830618e-05,
67
+ "loss": 1.1747,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
+ "learning_rate": 9.998834541281798e-05,
73
+ "loss": 1.1645,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
+ "learning_rate": 9.997377845227576e-05,
79
+ "loss": 1.3678,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
+ "learning_rate": 9.995338708444804e-05,
85
+ "loss": 1.3055,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
+ "learning_rate": 9.992717368593385e-05,
91
+ "loss": 1.2907,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
+ "learning_rate": 9.989514131188559e-05,
97
+ "loss": 1.2025,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
+ "learning_rate": 9.985729369565299e-05,
103
+ "loss": 1.3342,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
+ "learning_rate": 9.9813635248348e-05,
109
+ "loss": 1.3678,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
+ "learning_rate": 9.97641710583307e-05,
115
+ "loss": 1.273,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
+ "learning_rate": 9.970890689061622e-05,
121
+ "loss": 1.2473,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
+ "learning_rate": 9.964784918620282e-05,
127
+ "loss": 1.4571,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
+ "learning_rate": 9.958100506132127e-05,
133
+ "loss": 1.2284,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
+ "learning_rate": 9.950838230660534e-05,
139
+ "loss": 1.3918,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
+ "learning_rate": 9.942998938618394e-05,
145
+ "loss": 1.1981,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
+ "learning_rate": 9.934583543669453e-05,
151
+ "loss": 1.2575,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
+ "learning_rate": 9.925593026621833e-05,
157
+ "loss": 1.3153,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
+ "learning_rate": 9.916028435313708e-05,
163
+ "loss": 1.2294,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
+ "learning_rate": 9.905890884491195e-05,
169
+ "loss": 1.1945,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
+ "learning_rate": 9.895181555678418e-05,
175
+ "loss": 1.1637,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
+ "learning_rate": 9.883901697039808e-05,
181
+ "loss": 1.1551,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
+ "learning_rate": 9.872052623234632e-05,
187
+ "loss": 1.0256,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
+ "learning_rate": 9.85963571526376e-05,
193
+ "loss": 1.0165,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
+ "learning_rate": 9.846652420308728e-05,
199
+ "loss": 1.1713,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.01,
204
+ "learning_rate": 9.833104251563056e-05,
205
+ "loss": 1.0292,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.01,
210
+ "learning_rate": 9.818992788055889e-05,
211
+ "loss": 1.034,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.01,
216
+ "learning_rate": 9.80431967446797e-05,
217
+ "loss": 1.0906,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.01,
222
+ "learning_rate": 9.789086620939936e-05,
223
+ "loss": 1.0223,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.01,
228
+ "learning_rate": 9.773295402873026e-05,
229
+ "loss": 1.0137,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.01,
234
+ "learning_rate": 9.756947860722143e-05,
235
+ "loss": 1.0469,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.01,
240
+ "learning_rate": 9.740045899781352e-05,
241
+ "loss": 0.9361,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.01,
246
+ "learning_rate": 9.722591489961827e-05,
247
+ "loss": 0.9553,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.01,
252
+ "learning_rate": 9.70458666556225e-05,
253
+ "loss": 0.864,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.01,
258
+ "learning_rate": 9.686033525031719e-05,
259
+ "loss": 0.9793,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.01,
264
+ "learning_rate": 9.66693423072518e-05,
265
+ "loss": 0.8839,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.01,
270
+ "learning_rate": 9.647291008651398e-05,
271
+ "loss": 0.8898,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.01,
276
+ "learning_rate": 9.627106148213522e-05,
277
+ "loss": 0.7912,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.01,
282
+ "learning_rate": 9.606382001942255e-05,
283
+ "loss": 0.7436,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.01,
288
+ "learning_rate": 9.585120985221671e-05,
289
+ "loss": 0.7915,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.01,
294
+ "learning_rate": 9.563325576007701e-05,
295
+ "loss": 0.8672,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.01,
300
+ "learning_rate": 9.540998314539328e-05,
301
+ "loss": 0.7161,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.01,
306
+ "learning_rate": 9.518141803042527e-05,
307
+ "loss": 0.697,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.01,
312
+ "learning_rate": 9.494758705426978e-05,
313
+ "loss": 0.8822,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.01,
318
+ "learning_rate": 9.470851746975582e-05,
319
+ "loss": 0.7704,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.01,
324
+ "learning_rate": 9.446423714026846e-05,
325
+ "loss": 0.8268,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.01,
330
+ "learning_rate": 9.421477453650118e-05,
331
+ "loss": 0.8874,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.01,
336
+ "learning_rate": 9.396015873313781e-05,
337
+ "loss": 0.8378,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.01,
342
+ "learning_rate": 9.37004194054638e-05,
343
+ "loss": 0.889,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.01,
348
+ "learning_rate": 9.343558682590756e-05,
349
+ "loss": 0.9847,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.01,
354
+ "learning_rate": 9.316569186051234e-05,
355
+ "loss": 0.9055,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.01,
360
+ "learning_rate": 9.289076596533872e-05,
361
+ "loss": 0.9204,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.01,
366
+ "learning_rate": 9.261084118279847e-05,
367
+ "loss": 0.9052,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.01,
372
+ "learning_rate": 9.232595013792002e-05,
373
+ "loss": 0.8531,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.01,
378
+ "learning_rate": 9.203612603454604e-05,
379
+ "loss": 0.8028,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.01,
384
+ "learning_rate": 9.174140265146356e-05,
385
+ "loss": 0.926,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.01,
390
+ "learning_rate": 9.144181433846707e-05,
391
+ "loss": 0.9165,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.01,
396
+ "learning_rate": 9.113739601235507e-05,
397
+ "loss": 1.0392,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.01,
402
+ "learning_rate": 9.082818315286055e-05,
403
+ "loss": 0.8548,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.01,
408
+ "learning_rate": 9.051421179851588e-05,
409
+ "loss": 0.898,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.01,
414
+ "learning_rate": 9.01955185424525e-05,
415
+ "loss": 0.8343,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.01,
420
+ "learning_rate": 8.987214052813604e-05,
421
+ "loss": 0.9507,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.01,
426
+ "learning_rate": 8.954411544503729e-05,
427
+ "loss": 0.8919,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.01,
432
+ "learning_rate": 8.921148152423946e-05,
433
+ "loss": 0.9146,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.01,
438
+ "learning_rate": 8.887427753398248e-05,
439
+ "loss": 0.8907,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.01,
444
+ "learning_rate": 8.853254277514446e-05,
445
+ "loss": 0.8352,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.01,
450
+ "learning_rate": 8.818631707666135e-05,
451
+ "loss": 0.972,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.01,
456
+ "learning_rate": 8.783564079088477e-05,
457
+ "loss": 0.9635,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.01,
462
+ "learning_rate": 8.748055478887904e-05,
463
+ "loss": 0.843,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.01,
468
+ "learning_rate": 8.712110045565768e-05,
469
+ "loss": 0.8543,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.01,
474
+ "learning_rate": 8.675731968536002e-05,
475
+ "loss": 0.8737,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.01,
480
+ "learning_rate": 8.638925487636848e-05,
481
+ "loss": 0.8705,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.01,
486
+ "learning_rate": 8.6016948926367e-05,
487
+ "loss": 1.0117,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.01,
492
+ "learning_rate": 8.564044522734147e-05,
493
+ "loss": 0.8361,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
+ "learning_rate": 8.52597876605223e-05,
499
+ "loss": 0.8629,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
+ "learning_rate": 8.487502059127015e-05,
505
+ "loss": 0.7811,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
+ "learning_rate": 8.448618886390522e-05,
511
+ "loss": 0.9086,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
+ "learning_rate": 8.40933377964806e-05,
517
+ "loss": 0.7538,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
+ "learning_rate": 8.369651317550054e-05,
523
+ "loss": 0.7346,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
+ "learning_rate": 8.329576125058406e-05,
529
+ "loss": 0.9076,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
+ "learning_rate": 8.289112872907454e-05,
535
+ "loss": 0.8393,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
+ "learning_rate": 8.248266277059607e-05,
541
+ "loss": 0.8094,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
+ "learning_rate": 8.2070410981557e-05,
547
+ "loss": 0.8885,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
+ "learning_rate": 8.16544214096015e-05,
553
+ "loss": 0.7709,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
+ "learning_rate": 8.123474253800957e-05,
559
+ "loss": 0.7681,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
+ "learning_rate": 8.081142328004637e-05,
565
+ "loss": 0.7999,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
+ "learning_rate": 8.038451297326145e-05,
571
+ "loss": 0.8105,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
+ "learning_rate": 7.995406137373846e-05,
577
+ "loss": 0.7392,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
+ "learning_rate": 7.952011865029614e-05,
583
+ "loss": 0.7033,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
+ "learning_rate": 7.908273537864113e-05,
589
+ "loss": 0.689,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.02,
594
+ "learning_rate": 7.86419625354735e-05,
595
+ "loss": 0.7046,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.02,
600
+ "learning_rate": 7.819785149254532e-05,
601
+ "loss": 0.6515,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.02,
606
+ "learning_rate": 7.77504540106735e-05,
607
+ "loss": 0.6036,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.02,
612
+ "learning_rate": 7.729982223370691e-05,
613
+ "loss": 0.8356,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.02,
618
+ "learning_rate": 7.68460086824492e-05,
619
+ "loss": 0.7933,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.02,
624
+ "learning_rate": 7.638906624853743e-05,
625
+ "loss": 0.932,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.02,
630
+ "learning_rate": 7.592904818827775e-05,
631
+ "loss": 0.8673,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.02,
636
+ "learning_rate": 7.546600811643816e-05,
637
+ "loss": 0.8731,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.02,
642
+ "learning_rate": 7.500000000000001e-05,
643
+ "loss": 0.8142,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.02,
648
+ "learning_rate": 7.453107815186803e-05,
649
+ "loss": 0.8912,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.02,
654
+ "learning_rate": 7.405929722454026e-05,
655
+ "loss": 0.8659,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.02,
660
+ "learning_rate": 7.358471220373832e-05,
661
+ "loss": 0.9129,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.02,
666
+ "learning_rate": 7.310737840199885e-05,
667
+ "loss": 0.7937,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.02,
672
+ "learning_rate": 7.262735145222696e-05,
673
+ "loss": 0.8494,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.02,
678
+ "learning_rate": 7.214468730121208e-05,
679
+ "loss": 0.9226,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.02,
684
+ "learning_rate": 7.165944220310767e-05,
685
+ "loss": 0.7861,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.02,
690
+ "learning_rate": 7.117167271287453e-05,
691
+ "loss": 0.8482,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.02,
696
+ "learning_rate": 7.068143567968957e-05,
697
+ "loss": 0.9018,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.02,
702
+ "learning_rate": 7.018878824032009e-05,
703
+ "loss": 0.8878,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.02,
708
+ "learning_rate": 6.969378781246436e-05,
709
+ "loss": 0.9144,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.02,
714
+ "learning_rate": 6.919649208805981e-05,
715
+ "loss": 0.9245,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.02,
720
+ "learning_rate": 6.869695902655897e-05,
721
+ "loss": 0.843,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.02,
726
+ "learning_rate": 6.819524684817438e-05,
727
+ "loss": 0.9954,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.02,
732
+ "learning_rate": 6.769141402709305e-05,
733
+ "loss": 0.9713,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.02,
738
+ "learning_rate": 6.718551928466132e-05,
739
+ "loss": 1.0037,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.02,
744
+ "learning_rate": 6.667762158254104e-05,
745
+ "loss": 0.6903,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.02,
750
+ "learning_rate": 6.616778011583743e-05,
751
+ "loss": 0.8828,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.02,
756
+ "learning_rate": 6.565605430620013e-05,
757
+ "loss": 0.8054,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.02,
762
+ "learning_rate": 6.514250379489753e-05,
763
+ "loss": 0.8111,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.02,
768
+ "learning_rate": 6.462718843586571e-05,
769
+ "loss": 0.9756,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.02,
774
+ "learning_rate": 6.411016828873239e-05,
775
+ "loss": 1.0558,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.02,
780
+ "learning_rate": 6.359150361181715e-05,
781
+ "loss": 0.886,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.02,
786
+ "learning_rate": 6.307125485510828e-05,
787
+ "loss": 0.8109,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.02,
792
+ "learning_rate": 6.254948265321744e-05,
793
+ "loss": 0.8609,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.02,
798
+ "learning_rate": 6.202624781831268e-05,
799
+ "loss": 0.8583,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.02,
804
+ "learning_rate": 6.150161133303089e-05,
805
+ "loss": 0.92,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.02,
810
+ "learning_rate": 6.0975634343370256e-05,
811
+ "loss": 0.9136,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.02,
816
+ "learning_rate": 6.044837815156377e-05,
817
+ "loss": 0.8487,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.02,
822
+ "learning_rate": 5.99199042089345e-05,
823
+ "loss": 0.8143,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.02,
828
+ "learning_rate": 5.939027410873351e-05,
829
+ "loss": 0.8766,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.02,
834
+ "learning_rate": 5.885954957896115e-05,
835
+ "loss": 0.7539,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.02,
840
+ "learning_rate": 5.832779247517273e-05,
841
+ "loss": 0.7721,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.02,
846
+ "learning_rate": 5.779506477326933e-05,
847
+ "loss": 0.8023,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.02,
852
+ "learning_rate": 5.726142856227452e-05,
853
+ "loss": 0.7718,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.02,
858
+ "learning_rate": 5.672694603709794e-05,
859
+ "loss": 0.8443,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.02,
864
+ "learning_rate": 5.619167949128652e-05,
865
+ "loss": 0.8167,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.02,
870
+ "learning_rate": 5.565569130976422e-05,
871
+ "loss": 0.8438,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.02,
876
+ "learning_rate": 5.5119043961561136e-05,
877
+ "loss": 0.6757,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.02,
882
+ "learning_rate": 5.458179999253275e-05,
883
+ "loss": 0.6055,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.02,
888
+ "learning_rate": 5.4044022018070214e-05,
889
+ "loss": 0.667,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.02,
894
+ "learning_rate": 5.3505772715802704e-05,
895
+ "loss": 0.6251,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.02,
900
+ "learning_rate": 5.296711481829226e-05,
901
+ "loss": 0.6498,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.02,
906
+ "learning_rate": 5.242811110572242e-05,
907
+ "loss": 0.5881,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.02,
912
+ "learning_rate": 5.188882439858117e-05,
913
+ "loss": 0.7092,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.02,
918
+ "learning_rate": 5.134931755033936e-05,
919
+ "loss": 0.8843,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.02,
924
+ "learning_rate": 5.080965344012508e-05,
925
+ "loss": 0.8751,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.02,
930
+ "learning_rate": 5.0269894965395225e-05,
931
+ "loss": 0.8272,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.02,
936
+ "learning_rate": 4.973010503460479e-05,
937
+ "loss": 0.8899,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.02,
942
+ "learning_rate": 4.919034655987493e-05,
943
+ "loss": 0.9079,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.02,
948
+ "learning_rate": 4.865068244966066e-05,
949
+ "loss": 0.8265,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.02,
954
+ "learning_rate": 4.8111175601418844e-05,
955
+ "loss": 0.8614,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.02,
960
+ "learning_rate": 4.7571888894277604e-05,
961
+ "loss": 0.811,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.02,
966
+ "learning_rate": 4.703288518170774e-05,
967
+ "loss": 0.8822,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.02,
972
+ "learning_rate": 4.6494227284197294e-05,
973
+ "loss": 0.876,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.03,
978
+ "learning_rate": 4.59559779819298e-05,
979
+ "loss": 1.0022,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.03,
984
+ "learning_rate": 4.541820000746727e-05,
985
+ "loss": 0.8351,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.03,
990
+ "learning_rate": 4.4880956038438876e-05,
991
+ "loss": 0.9567,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.03,
996
+ "learning_rate": 4.434430869023579e-05,
997
+ "loss": 0.9064,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.03,
1002
+ "learning_rate": 4.38083205087135e-05,
1003
+ "loss": 0.817,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.03,
1008
+ "learning_rate": 4.3273053962902076e-05,
1009
+ "loss": 0.8443,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.03,
1014
+ "learning_rate": 4.27385714377255e-05,
1015
+ "loss": 0.8457,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.03,
1020
+ "learning_rate": 4.220493522673067e-05,
1021
+ "loss": 0.8772,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.03,
1026
+ "learning_rate": 4.1672207524827275e-05,
1027
+ "loss": 1.0726,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.03,
1032
+ "learning_rate": 4.114045042103887e-05,
1033
+ "loss": 0.9497,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.03,
1038
+ "learning_rate": 4.06097258912665e-05,
1039
+ "loss": 0.8689,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.03,
1044
+ "learning_rate": 4.0080095791065505e-05,
1045
+ "loss": 0.9396,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.03,
1050
+ "learning_rate": 3.955162184843625e-05,
1051
+ "loss": 0.9253,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.03,
1056
+ "learning_rate": 3.902436565662977e-05,
1057
+ "loss": 0.8986,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.03,
1062
+ "learning_rate": 3.849838866696913e-05,
1063
+ "loss": 0.9844,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.03,
1068
+ "learning_rate": 3.7973752181687335e-05,
1069
+ "loss": 0.9125,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.03,
1074
+ "learning_rate": 3.745051734678256e-05,
1075
+ "loss": 0.8938,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.03,
1080
+ "learning_rate": 3.692874514489173e-05,
1081
+ "loss": 0.8496,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.03,
1086
+ "learning_rate": 3.640849638818286e-05,
1087
+ "loss": 0.9429,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.03,
1092
+ "learning_rate": 3.588983171126762e-05,
1093
+ "loss": 0.8858,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.03,
1098
+ "learning_rate": 3.53728115641343e-05,
1099
+ "loss": 0.7386,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.03,
1104
+ "learning_rate": 3.4857496205102474e-05,
1105
+ "loss": 1.0531,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.03,
1110
+ "learning_rate": 3.434394569379988e-05,
1111
+ "loss": 0.9059,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.03,
1116
+ "learning_rate": 3.3832219884162585e-05,
1117
+ "loss": 0.8182,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.03,
1122
+ "learning_rate": 3.332237841745898e-05,
1123
+ "loss": 0.7084,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.03,
1128
+ "learning_rate": 3.281448071533867e-05,
1129
+ "loss": 0.7416,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.03,
1134
+ "learning_rate": 3.2308585972906966e-05,
1135
+ "loss": 0.6783,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.03,
1140
+ "learning_rate": 3.180475315182563e-05,
1141
+ "loss": 0.6974,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.03,
1146
+ "learning_rate": 3.130304097344103e-05,
1147
+ "loss": 0.6956,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.03,
1152
+ "learning_rate": 3.080350791194019e-05,
1153
+ "loss": 0.6688,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.03,
1158
+ "learning_rate": 3.0306212187535653e-05,
1159
+ "loss": 0.662,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.03,
1164
+ "learning_rate": 2.9811211759679924e-05,
1165
+ "loss": 0.6405,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.03,
1170
+ "learning_rate": 2.9318564320310444e-05,
1171
+ "loss": 0.5587,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.03,
1176
+ "learning_rate": 2.882832728712551e-05,
1177
+ "loss": 0.6154,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.03,
1182
+ "learning_rate": 2.8340557796892354e-05,
1183
+ "loss": 0.6829,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.03,
1188
+ "learning_rate": 2.7855312698787904e-05,
1189
+ "loss": 0.5108,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.03,
1194
+ "learning_rate": 2.737264854777306e-05,
1195
+ "loss": 0.4153,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.03,
1200
+ "learning_rate": 2.6892621598001156e-05,
1201
+ "loss": 0.474,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.03,
1206
+ "learning_rate": 2.6415287796261706e-05,
1207
+ "loss": 0.4005,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.03,
1212
+ "learning_rate": 2.5940702775459747e-05,
1213
+ "loss": 0.7317,
1214
  "step": 201
1215
  },
1216
  {
1217
  "epoch": 0.03,
1218
+ "learning_rate": 2.5468921848131983e-05,
1219
+ "loss": 0.8145,
1220
  "step": 202
1221
  },
1222
  {
1223
  "epoch": 0.03,
1224
+ "learning_rate": 2.500000000000001e-05,
1225
+ "loss": 0.8618,
1226
  "step": 203
1227
  },
1228
  {
1229
  "epoch": 0.03,
1230
+ "learning_rate": 2.4533991883561868e-05,
1231
+ "loss": 0.9186,
1232
  "step": 204
1233
  },
1234
  {
1235
  "epoch": 0.03,
1236
+ "learning_rate": 2.407095181172227e-05,
1237
+ "loss": 0.8658,
1238
  "step": 205
1239
  },
1240
  {
1241
  "epoch": 0.03,
1242
+ "learning_rate": 2.3610933751462553e-05,
1243
+ "loss": 0.8398,
1244
  "step": 206
1245
  },
1246
  {
1247
  "epoch": 0.03,
1248
+ "learning_rate": 2.315399131755081e-05,
1249
+ "loss": 0.8834,
1250
  "step": 207
1251
  },
1252
  {
1253
  "epoch": 0.03,
1254
+ "learning_rate": 2.2700177766293096e-05,
1255
+ "loss": 0.9564,
1256
  "step": 208
1257
  },
1258
  {
1259
  "epoch": 0.03,
1260
+ "learning_rate": 2.2249545989326514e-05,
1261
+ "loss": 0.8387,
1262
  "step": 209
1263
  },
1264
  {
1265
  "epoch": 0.03,
1266
+ "learning_rate": 2.180214850745467e-05,
1267
+ "loss": 0.857,
1268
  "step": 210
1269
  },
1270
  {
1271
  "epoch": 0.03,
1272
+ "learning_rate": 2.1358037464526515e-05,
1273
+ "loss": 0.8656,
1274
  "step": 211
1275
  },
1276
  {
1277
  "epoch": 0.03,
1278
+ "learning_rate": 2.091726462135888e-05,
1279
+ "loss": 0.9432,
1280
  "step": 212
1281
  },
1282
  {
1283
  "epoch": 0.03,
1284
+ "learning_rate": 2.0479881349703883e-05,
1285
+ "loss": 0.8754,
1286
  "step": 213
1287
  },
1288
  {
1289
  "epoch": 0.03,
1290
+ "learning_rate": 2.0045938626261546e-05,
1291
+ "loss": 0.9271,
1292
  "step": 214
1293
  },
1294
  {
1295
  "epoch": 0.03,
1296
+ "learning_rate": 1.9615487026738543e-05,
1297
+ "loss": 0.9571,
1298
  "step": 215
1299
  },
1300
  {
1301
  "epoch": 0.03,
1302
+ "learning_rate": 1.9188576719953633e-05,
1303
+ "loss": 0.8626,
1304
  "step": 216
1305
  },
1306
  {
1307
  "epoch": 0.03,
1308
+ "learning_rate": 1.8765257461990442e-05,
1309
+ "loss": 0.7676,
1310
  "step": 217
1311
  },
1312
  {
1313
  "epoch": 0.03,
1314
+ "learning_rate": 1.834557859039851e-05,
1315
+ "loss": 0.9014,
1316
  "step": 218
1317
  },
1318
  {
1319
  "epoch": 0.03,
1320
+ "learning_rate": 1.7929589018443016e-05,
1321
+ "loss": 0.8107,
1322
  "step": 219
1323
  },
1324
  {
1325
  "epoch": 0.03,
1326
+ "learning_rate": 1.7517337229403946e-05,
1327
+ "loss": 0.7515,
1328
  "step": 220
1329
  },
1330
  {
1331
  "epoch": 0.03,
1332
+ "learning_rate": 1.710887127092548e-05,
1333
+ "loss": 0.9335,
1334
  "step": 221
1335
  },
1336
  {
1337
  "epoch": 0.03,
1338
+ "learning_rate": 1.6704238749415957e-05,
1339
+ "loss": 0.8645,
1340
  "step": 222
1341
  },
1342
  {
1343
  "epoch": 0.03,
1344
+ "learning_rate": 1.6303486824499458e-05,
1345
+ "loss": 0.7009,
1346
  "step": 223
1347
  },
1348
  {
1349
  "epoch": 0.03,
1350
+ "learning_rate": 1.5906662203519412e-05,
1351
+ "loss": 0.8673,
1352
  "step": 224
1353
  },
1354
  {
1355
  "epoch": 0.03,
1356
+ "learning_rate": 1.5513811136094787e-05,
1357
+ "loss": 0.8259,
1358
  "step": 225
1359
  },
1360
  {
1361
  "epoch": 0.03,
1362
+ "learning_rate": 1.5124979408729861e-05,
1363
+ "loss": 0.8709,
1364
  "step": 226
1365
  },
1366
  {
1367
  "epoch": 0.04,
1368
+ "learning_rate": 1.4740212339477721e-05,
1369
+ "loss": 0.7189,
1370
  "step": 227
1371
  },
1372
  {
1373
  "epoch": 0.04,
1374
+ "learning_rate": 1.4359554772658552e-05,
1375
+ "loss": 0.946,
1376
  "step": 228
1377
  },
1378
  {
1379
  "epoch": 0.04,
1380
+ "learning_rate": 1.3983051073632997e-05,
1381
+ "loss": 0.8452,
1382
  "step": 229
1383
  },
1384
  {
1385
  "epoch": 0.04,
1386
+ "learning_rate": 1.3610745123631535e-05,
1387
+ "loss": 0.7563,
1388
  "step": 230
1389
  },
1390
  {
1391
  "epoch": 0.04,
1392
+ "learning_rate": 1.3242680314639993e-05,
1393
+ "loss": 0.738,
1394
  "step": 231
1395
  },
1396
  {
1397
  "epoch": 0.04,
1398
+ "learning_rate": 1.2878899544342327e-05,
1399
+ "loss": 0.9574,
1400
  "step": 232
1401
  },
1402
  {
1403
  "epoch": 0.04,
1404
+ "learning_rate": 1.2519445211120979e-05,
1405
+ "loss": 0.7155,
1406
  "step": 233
1407
  },
1408
  {
1409
  "epoch": 0.04,
1410
+ "learning_rate": 1.2164359209115234e-05,
1411
+ "loss": 0.7498,
1412
  "step": 234
1413
  },
1414
  {
1415
  "epoch": 0.04,
1416
+ "learning_rate": 1.1813682923338653e-05,
1417
+ "loss": 0.8432,
1418
  "step": 235
1419
  },
1420
  {
1421
  "epoch": 0.04,
1422
+ "learning_rate": 1.1467457224855544e-05,
1423
+ "loss": 0.729,
1424
  "step": 236
1425
  },
1426
  {
1427
  "epoch": 0.04,
1428
+ "learning_rate": 1.1125722466017547e-05,
1429
+ "loss": 0.6728,
1430
  "step": 237
1431
  },
1432
  {
1433
  "epoch": 0.04,
1434
+ "learning_rate": 1.0788518475760545e-05,
1435
+ "loss": 0.6898,
1436
  "step": 238
1437
  },
1438
  {
1439
  "epoch": 0.04,
1440
+ "learning_rate": 1.0455884554962725e-05,
1441
+ "loss": 0.6286,
1442
  "step": 239
1443
  },
1444
  {
1445
  "epoch": 0.04,
1446
+ "learning_rate": 1.012785947186397e-05,
1447
+ "loss": 0.7136,
1448
  "step": 240
1449
  },
1450
  {
1451
  "epoch": 0.04,
1452
+ "learning_rate": 9.804481457547498e-06,
1453
+ "loss": 0.6081,
1454
  "step": 241
1455
  },
1456
  {
1457
  "epoch": 0.04,
1458
+ "learning_rate": 9.485788201484126e-06,
1459
+ "loss": 0.746,
1460
  "step": 242
1461
  },
1462
  {
1463
  "epoch": 0.04,
1464
+ "learning_rate": 9.171816847139448e-06,
1465
+ "loss": 0.6714,
1466
  "step": 243
1467
  },
1468
  {
1469
  "epoch": 0.04,
1470
+ "learning_rate": 8.86260398764494e-06,
1471
+ "loss": 0.6298,
1472
  "step": 244
1473
  },
1474
  {
1475
  "epoch": 0.04,
1476
+ "learning_rate": 8.558185661532941e-06,
1477
+ "loss": 0.5962,
1478
  "step": 245
1479
  },
1480
  {
1481
  "epoch": 0.04,
1482
+ "learning_rate": 8.25859734853645e-06,
1483
+ "loss": 0.6198,
1484
  "step": 246
1485
  },
1486
  {
1487
  "epoch": 0.04,
1488
+ "learning_rate": 7.96387396545396e-06,
1489
+ "loss": 0.5893,
1490
  "step": 247
1491
  },
1492
  {
1493
  "epoch": 0.04,
1494
+ "learning_rate": 7.67404986207999e-06,
1495
+ "loss": 0.6294,
1496
  "step": 248
1497
  },
1498
  {
1499
  "epoch": 0.04,
1500
+ "learning_rate": 7.389158817201542e-06,
1501
+ "loss": 0.515,
1502
  "step": 249
1503
  },
1504
  {
1505
  "epoch": 0.04,
1506
+ "learning_rate": 7.109234034661289e-06,
1507
+ "loss": 0.4909,
1508
  "step": 250
1509
  },
1510
  {
1511
  "epoch": 0.04,
1512
+ "learning_rate": 6.8343081394876715e-06,
1513
+ "loss": 1.0134,
1514
  "step": 251
1515
  },
1516
  {
1517
  "epoch": 0.04,
1518
+ "learning_rate": 6.564413174092443e-06,
1519
+ "loss": 1.0413,
1520
  "step": 252
1521
  },
1522
  {
1523
  "epoch": 0.04,
1524
+ "learning_rate": 6.299580594536214e-06,
1525
+ "loss": 0.7905,
1526
  "step": 253
1527
  },
1528
  {
1529
  "epoch": 0.04,
1530
+ "learning_rate": 6.0398412668621895e-06,
1531
+ "loss": 0.7666,
1532
  "step": 254
1533
  },
1534
  {
1535
  "epoch": 0.04,
1536
+ "learning_rate": 5.785225463498828e-06,
1537
+ "loss": 0.8807,
1538
  "step": 255
1539
  },
1540
  {
1541
  "epoch": 0.04,
1542
+ "learning_rate": 5.535762859731547e-06,
1543
+ "loss": 0.8202,
1544
  "step": 256
1545
  },
1546
  {
1547
  "epoch": 0.04,
1548
+ "learning_rate": 5.291482530244179e-06,
1549
+ "loss": 0.9371,
1550
  "step": 257
1551
  },
1552
  {
1553
  "epoch": 0.04,
1554
+ "learning_rate": 5.05241294573024e-06,
1555
+ "loss": 0.8992,
1556
  "step": 258
1557
  },
1558
  {
1559
  "epoch": 0.04,
1560
+ "learning_rate": 4.818581969574742e-06,
1561
+ "loss": 0.8329,
1562
  "step": 259
1563
  },
1564
  {
1565
  "epoch": 0.04,
1566
+ "learning_rate": 4.590016854606727e-06,
1567
+ "loss": 0.9316,
1568
  "step": 260
1569
  },
1570
  {
1571
  "epoch": 0.04,
1572
+ "learning_rate": 4.366744239922998e-06,
1573
+ "loss": 0.8608,
1574
  "step": 261
1575
  },
1576
  {
1577
  "epoch": 0.04,
1578
+ "learning_rate": 4.148790147783288e-06,
1579
+ "loss": 0.9218,
1580
  "step": 262
1581
  },
1582
  {
1583
  "epoch": 0.04,
1584
+ "learning_rate": 3.936179980577453e-06,
1585
+ "loss": 0.952,
1586
  "step": 263
1587
  },
1588
  {
1589
  "epoch": 0.04,
1590
+ "learning_rate": 3.728938517864794e-06,
1591
+ "loss": 0.8879,
1592
  "step": 264
1593
  },
1594
  {
1595
  "epoch": 0.04,
1596
+ "learning_rate": 3.527089913486037e-06,
1597
+ "loss": 0.8573,
1598
  "step": 265
1599
  },
1600
  {
1601
  "epoch": 0.04,
1602
+ "learning_rate": 3.3306576927482126e-06,
1603
+ "loss": 0.8729,
1604
  "step": 266
1605
  },
1606
  {
1607
  "epoch": 0.04,
1608
+ "learning_rate": 3.1396647496828247e-06,
1609
+ "loss": 0.9766,
1610
  "step": 267
1611
  },
1612
  {
1613
  "epoch": 0.04,
1614
+ "learning_rate": 2.9541333443775243e-06,
1615
+ "loss": 0.8435,
1616
  "step": 268
1617
  },
1618
  {
1619
  "epoch": 0.04,
1620
+ "learning_rate": 2.774085100381735e-06,
1621
+ "loss": 0.7851,
1622
  "step": 269
1623
  },
1624
  {
1625
  "epoch": 0.04,
1626
+ "learning_rate": 2.5995410021864787e-06,
1627
+ "loss": 0.9089,
1628
  "step": 270
1629
  },
1630
  {
1631
  "epoch": 0.04,
1632
+ "learning_rate": 2.430521392778573e-06,
1633
+ "loss": 0.8561,
1634
  "step": 271
1635
  },
1636
  {
1637
  "epoch": 0.04,
1638
+ "learning_rate": 2.2670459712697377e-06,
1639
+ "loss": 0.8957,
1640
  "step": 272
1641
  },
1642
  {
1643
  "epoch": 0.04,
1644
+ "learning_rate": 2.1091337906006482e-06,
1645
+ "loss": 0.7897,
1646
  "step": 273
1647
  },
1648
  {
1649
  "epoch": 0.04,
1650
+ "learning_rate": 1.956803255320322e-06,
1651
+ "loss": 0.897,
1652
  "step": 274
1653
  },
1654
  {
1655
  "epoch": 0.04,
1656
+ "learning_rate": 1.810072119441103e-06,
1657
+ "loss": 0.8682,
1658
  "step": 275
1659
  },
1660
  {
1661
  "epoch": 0.04,
1662
+ "learning_rate": 1.6689574843694433e-06,
1663
+ "loss": 0.9593,
1664
  "step": 276
1665
  },
1666
  {
1667
  "epoch": 0.04,
1668
+ "learning_rate": 1.53347579691272e-06,
1669
+ "loss": 0.7004,
1670
  "step": 277
1671
  },
1672
  {
1673
  "epoch": 0.04,
1674
+ "learning_rate": 1.4036428473624019e-06,
1675
+ "loss": 0.9315,
1676
  "step": 278
1677
  },
1678
  {
1679
  "epoch": 0.04,
1680
+ "learning_rate": 1.2794737676536994e-06,
1681
+ "loss": 0.8717,
1682
  "step": 279
1683
  },
1684
  {
1685
  "epoch": 0.04,
1686
+ "learning_rate": 1.1609830296019143e-06,
1687
+ "loss": 0.7959,
1688
  "step": 280
1689
  },
1690
  {
1691
  "epoch": 0.04,
1692
+ "learning_rate": 1.0481844432158161e-06,
1693
+ "loss": 0.8804,
1694
  "step": 281
1695
  },
1696
  {
1697
  "epoch": 0.04,
1698
+ "learning_rate": 9.410911550880475e-07,
1699
+ "loss": 0.7299,
1700
  "step": 282
1701
  },
1702
  {
1703
  "epoch": 0.04,
1704
+ "learning_rate": 8.397156468629208e-07,
1705
+ "loss": 0.733,
1706
  "step": 283
1707
  },
1708
  {
1709
  "epoch": 0.04,
1710
+ "learning_rate": 7.44069733781677e-07,
1711
+ "loss": 0.7353,
1712
  "step": 284
1713
  },
1714
  {
1715
  "epoch": 0.04,
1716
+ "learning_rate": 6.54164563305465e-07,
1717
+ "loss": 0.7066,
1718
  "step": 285
1719
  },
1720
  {
1721
  "epoch": 0.04,
1722
+ "learning_rate": 5.700106138160688e-07,
1723
+ "loss": 0.7054,
1724
  "step": 286
1725
  },
1726
  {
1727
  "epoch": 0.04,
1728
+ "learning_rate": 4.916176933946693e-07,
1729
+ "loss": 0.8131,
1730
  "step": 287
1731
  },
1732
  {
1733
  "epoch": 0.04,
1734
+ "learning_rate": 4.189949386787462e-07,
1735
+ "loss": 0.6315,
1736
  "step": 288
1737
  },
1738
  {
1739
  "epoch": 0.04,
1740
+ "learning_rate": 3.5215081379718074e-07,
1741
+ "loss": 0.6279,
1742
  "step": 289
1743
  },
1744
  {
1745
  "epoch": 0.04,
1746
+ "learning_rate": 2.9109310938378877e-07,
1747
+ "loss": 0.6736,
1748
  "step": 290
1749
  },
1750
  {
1751
  "epoch": 0.04,
1752
+ "learning_rate": 2.3582894166930268e-07,
1753
+ "loss": 0.6892,
1754
  "step": 291
1755
  },
1756
  {
1757
  "epoch": 0.05,
1758
+ "learning_rate": 1.8636475165200174e-07,
1759
+ "loss": 0.6936,
1760
  "step": 292
1761
  },
1762
  {
1763
  "epoch": 0.05,
1764
+ "learning_rate": 1.427063043470178e-07,
1765
+ "loss": 0.6369,
1766
  "step": 293
1767
  },
1768
  {
1769
  "epoch": 0.05,
1770
+ "learning_rate": 1.0485868811441757e-07,
1771
+ "loss": 0.6448,
1772
  "step": 294
1773
  },
1774
  {
1775
  "epoch": 0.05,
1776
+ "learning_rate": 7.282631406615447e-08,
1777
+ "loss": 0.6009,
1778
  "step": 295
1779
  },
1780
  {
1781
  "epoch": 0.05,
1782
+ "learning_rate": 4.661291555196345e-08,
1783
+ "loss": 0.5067,
1784
  "step": 296
1785
  },
1786
  {
1787
  "epoch": 0.05,
1788
+ "learning_rate": 2.6221547724253337e-08,
1789
+ "loss": 0.5489,
1790
  "step": 297
1791
  },
1792
  {
1793
  "epoch": 0.05,
1794
+ "learning_rate": 1.1654587182013953e-08,
1795
+ "loss": 0.5175,
1796
  "step": 298
1797
  },
1798
  {
1799
  "epoch": 0.05,
1800
+ "learning_rate": 2.9137316938265825e-09,
1801
+ "loss": 0.5188,
1802
  "step": 299
1803
  },
1804
  {
1805
  "epoch": 0.05,
1806
  "learning_rate": 0.0,
1807
+ "loss": 0.4117,
1808
  "step": 300
1809
  }
1810
  ],
1811
  "max_steps": 300,
1812
  "num_train_epochs": 1,
1813
+ "total_flos": 2.3308777993273344e+16,
1814
  "trial_name": null,
1815
  "trial_params": null
1816
  }
checkpoint-300/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4373b7353f7618ba071e8149528a0ae191779985a891ab40089b4f1cda12cac9
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26d05e06b74b598e07338ab9a422f3e15db4fef4b5c96eb25a8e5404684e051b
3
  size 4027
flyte_training_config.json CHANGED
@@ -1 +1 @@
1
- {"base_model": "meta-llama/Llama-2-7b-hf", "data_path": "yahma/alpaca-cleaned", "instruction_key": "instruction", "input_key": "input", "output_key": "output", "output_dir": "./output", "device_map": "auto", "batch_size": 8, "micro_batch_size": 8, "num_epochs": 1, "max_steps": 300, "eval_steps": 200, "save_steps": 50, "learning_rate": 3e-06, "cutoff_len": 512, "val_set_size": 0, "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "weight_decay": 0.02, "warmup_ratio": 0.03, "lr_scheduler_type": "cosine", "lora_target_modules": ["q_proj", "k_proj", "v_proj"], "train_on_inputs": true, "add_eos_token": true, "group_by_length": true, "resume_from_checkpoint": null, "wandb_project": "unionai-llm-fine-tuning", "wandb_run_name": "", "wandb_watch": "", "wandb_log_model": "", "debug_mode": false, "debug_train_data_size": 1024}
 
1
+ {"base_model": "meta-llama/Llama-2-7b-hf", "data_path": "yahma/alpaca-cleaned", "instruction_key": "instruction", "input_key": "input", "output_key": "output", "output_dir": "./output", "device_map": "auto", "batch_size": 8, "micro_batch_size": 8, "num_epochs": 1, "max_steps": 300, "eval_steps": 200, "save_steps": 50, "learning_rate": 0.0001, "cutoff_len": 512, "val_set_size": 0, "lora_r": 8, "lora_alpha": 16, "lora_dropout": 0.05, "weight_decay": 0.02, "warmup_ratio": 0.03, "lr_scheduler_type": "cosine", "lora_target_modules": ["q_proj", "k_proj", "v_proj"], "train_on_inputs": true, "add_eos_token": true, "group_by_length": true, "resume_from_checkpoint": null, "wandb_project": "unionai-llm-fine-tuning", "wandb_run_name": "", "wandb_watch": "", "wandb_log_model": "", "debug_mode": false, "debug_train_data_size": 1024}
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4373b7353f7618ba071e8149528a0ae191779985a891ab40089b4f1cda12cac9
3
  size 4027
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26d05e06b74b598e07338ab9a422f3e15db4fef4b5c96eb25a8e5404684e051b
3
  size 4027