mlabonne committed on
Commit
6f97173
1 Parent(s): 04a9c72

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -95,7 +95,7 @@ special_tokens:
95
 
96
  This model is a fine-tuned version of [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1) on the None dataset.
97
  It achieves the following results on the evaluation set:
98
- - Loss: 0.9651
99
 
100
  ## Model description
101
 
@@ -133,10 +133,10 @@ The following hyperparameters were used during training:
133
  | Training Loss | Epoch | Step | Validation Loss |
134
  |:-------------:|:-----:|:----:|:---------------:|
135
  | 0.6274 | 0.01 | 1 | 1.0298 |
136
- | 0.4403 | 0.25 | 42 | 0.9768 |
137
- | 0.4417 | 0.5 | 84 | 0.9675 |
138
- | 0.4451 | 0.75 | 126 | 0.9652 |
139
- | 0.4616 | 1.0 | 168 | 0.9651 |
140
 
141
 
142
  ### Framework versions
 
95
 
96
  This model is a fine-tuned version of [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1) on the None dataset.
97
  It achieves the following results on the evaluation set:
98
+ - Loss: 0.9641
99
 
100
  ## Model description
101
 
 
133
  | Training Loss | Epoch | Step | Validation Loss |
134
  |:-------------:|:-----:|:----:|:---------------:|
135
  | 0.6274 | 0.01 | 1 | 1.0298 |
136
+ | 0.44 | 0.25 | 42 | 0.9770 |
137
+ | 0.4406 | 0.5 | 84 | 0.9653 |
138
+ | 0.4445 | 0.75 | 126 | 0.9645 |
139
+ | 0.4609 | 1.0 | 168 | 0.9641 |
140
 
141
 
142
  ### Framework versions
adapter_config.json CHANGED
@@ -20,17 +20,17 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "x_proj",
24
- "o_proj",
25
- "up_proj",
26
  "v_proj",
27
- "down_proj",
28
  "dt_proj",
29
- "k_proj",
30
- "q_proj",
31
  "in_proj",
 
 
 
32
  "gate_proj",
33
- "out_proj",
 
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
 
 
24
  "v_proj",
 
25
  "dt_proj",
26
+ "out_proj",
 
27
  "in_proj",
28
+ "k_proj",
29
+ "x_proj",
30
+ "o_proj",
31
  "gate_proj",
32
+ "down_proj",
33
+ "up_proj",
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:485eddbdd38763c9850251f824a3877a9ddc0db9c893fae77c6ad1f9eb0d379c
3
  size 531653306
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5666b45b26616be53d976f599b2d2d6262c975237daca20b37002cb90b38e7c
3
  size 531653306
checkpoint-126/adapter_config.json CHANGED
@@ -20,17 +20,17 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "x_proj",
24
- "o_proj",
25
- "up_proj",
26
  "v_proj",
27
- "down_proj",
28
  "dt_proj",
29
- "k_proj",
30
- "q_proj",
31
  "in_proj",
 
 
 
32
  "gate_proj",
33
- "out_proj",
 
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
 
 
24
  "v_proj",
 
25
  "dt_proj",
26
+ "out_proj",
 
27
  "in_proj",
28
+ "k_proj",
29
+ "x_proj",
30
+ "o_proj",
31
  "gate_proj",
32
+ "down_proj",
33
+ "up_proj",
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
checkpoint-126/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be949fb50dbe53d64e9f4375ed327e36ae2368bf683cfdaf813667de418fcbb8
3
  size 531611600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e5244d4a10dbd753d970890412e3ca9603e7dcf433636b5774a56cd24230ca1
3
  size 531611600
checkpoint-126/global_step126/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6998ecb2f6d60f27d30371a8ce207805cf8a48dd8f296036fff813d22b65593
3
  size 797643792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30e35823423f4d40789f18071347d83fb7bfc4ac817c60dd4a4a37af4433ed7a
3
  size 797643792
checkpoint-126/global_step126/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f504e0638f11f71fa63e7587596b481c876e9ec17a2e4703833ab1713a855b1
3
  size 797644432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bb2c01e320ef6c12ebd1a5ff11884d226fce0fc7a7455216a45c07ed35197d8
3
  size 797644432
checkpoint-126/global_step126/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:271bfc43b751ef756538a0f6b19ee11530e5d5c9f68f5e3c3b086877dc1d3a6a
3
  size 1345660121
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa91f679f40d2c52f881c918ddc04e8bf7639c32c1ebb9c47161e35088f9ff18
3
  size 1345660121
checkpoint-126/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:601237d8f7ccca9157ec802c2d8dbf105a33ca51949abaf17e8caac019fa582f
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:136c534e0ab55c33db36cd2e9ff890cf7a74c156913de6ef32dd097847df2202
3
  size 14512
checkpoint-126/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b453660e9304c04b047fceaa4cad80dfb1b50b01880351c794e0268dc4d5d75
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7e5b7eb90da0157ab2ee2a702d0793eaa00e971e5110cd0820814aa3fa7d061
3
  size 14512
checkpoint-126/trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 0.30475762367257353,
14
  "learning_rate": 2e-05,
15
  "loss": 0.6274,
16
  "step": 1
@@ -18,908 +18,908 @@
18
  {
19
  "epoch": 0.01,
20
  "eval_loss": 1.0297596454620361,
21
- "eval_runtime": 150.6014,
22
- "eval_samples_per_second": 1.653,
23
- "eval_steps_per_second": 0.83,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.01,
28
- "grad_norm": 0.2798137150913395,
29
  "learning_rate": 4e-05,
30
  "loss": 0.6362,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.02,
35
- "grad_norm": 0.3178684451319545,
36
  "learning_rate": 6e-05,
37
  "loss": 0.6299,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.02,
42
- "grad_norm": 0.33651284916847835,
43
  "learning_rate": 8e-05,
44
- "loss": 0.6391,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.03,
49
- "grad_norm": 0.23984915388648712,
50
  "learning_rate": 0.0001,
51
- "loss": 0.6071,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.04,
56
- "grad_norm": 0.20514040410017348,
57
  "learning_rate": 0.00012,
58
- "loss": 0.5996,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.04,
63
- "grad_norm": 0.1950018128286362,
64
  "learning_rate": 0.00014,
65
- "loss": 0.6298,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.05,
70
- "grad_norm": 0.14246019947393238,
71
  "learning_rate": 0.00016,
72
- "loss": 0.5108,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.05,
77
- "grad_norm": 0.15792014279750227,
78
  "learning_rate": 0.00018,
79
- "loss": 0.5529,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.06,
84
- "grad_norm": 0.1517889177511264,
85
  "learning_rate": 0.0002,
86
- "loss": 0.5433,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.07,
91
- "grad_norm": 0.1372121219437277,
92
  "learning_rate": 0.00019998023297700658,
93
- "loss": 0.5856,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.07,
98
- "grad_norm": 0.16740807394942855,
99
  "learning_rate": 0.00019992093972273018,
100
- "loss": 0.5546,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.08,
105
- "grad_norm": 0.13512320693394078,
106
  "learning_rate": 0.00019982214367819328,
107
- "loss": 0.6193,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.08,
112
- "grad_norm": 0.16169796294070152,
113
  "learning_rate": 0.0001996838839014696,
114
  "loss": 0.5495,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.09,
119
- "grad_norm": 0.16796913812281988,
120
  "learning_rate": 0.00019950621505224273,
121
- "loss": 0.5035,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.1,
126
- "grad_norm": 0.1800514764162192,
127
  "learning_rate": 0.00019928920737019733,
128
  "loss": 0.5083,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.1,
133
- "grad_norm": 0.170432124866908,
134
  "learning_rate": 0.0001990329466472502,
135
- "loss": 0.632,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.11,
140
- "grad_norm": 0.19129325489749488,
141
  "learning_rate": 0.00019873753419363336,
142
- "loss": 0.4813,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.11,
147
- "grad_norm": 0.1459357988760762,
148
  "learning_rate": 0.00019840308679784207,
149
- "loss": 0.4973,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.12,
154
- "grad_norm": 0.192594730984382,
155
  "learning_rate": 0.00019802973668046363,
156
- "loss": 0.5291,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.12,
161
- "grad_norm": 0.859025467969139,
162
  "learning_rate": 0.0001976176314419051,
163
- "loss": 0.5296,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.13,
168
- "grad_norm": 0.13366297885670222,
169
  "learning_rate": 0.000197166934004041,
170
  "loss": 0.4819,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.14,
175
- "grad_norm": 0.15698714419747645,
176
  "learning_rate": 0.00019667782254580374,
177
- "loss": 0.5409,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.14,
182
- "grad_norm": 0.10995943735837355,
183
  "learning_rate": 0.00019615049043274205,
184
- "loss": 0.5108,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.15,
189
- "grad_norm": 0.10796742192788925,
190
  "learning_rate": 0.00019558514614057609,
191
- "loss": 0.5215,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.15,
196
- "grad_norm": 0.11641740089490231,
197
  "learning_rate": 0.00019498201317277828,
198
- "loss": 0.5012,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.16,
203
- "grad_norm": 0.1120175962893241,
204
  "learning_rate": 0.00019434132997221345,
205
- "loss": 0.474,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.17,
210
- "grad_norm": 0.1218171278782483,
211
  "learning_rate": 0.0001936633498268728,
212
- "loss": 0.5216,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.17,
217
- "grad_norm": 0.11718521115928844,
218
  "learning_rate": 0.0001929483407697387,
219
- "loss": 0.4856,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.18,
224
- "grad_norm": 0.12611471038571026,
225
  "learning_rate": 0.00019219658547282067,
226
- "loss": 0.4823,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.18,
231
- "grad_norm": 0.11106871615269753,
232
  "learning_rate": 0.00019140838113540346,
233
- "loss": 0.4869,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.19,
238
- "grad_norm": 0.1416503230360699,
239
  "learning_rate": 0.00019058403936655233,
240
- "loss": 0.5341,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.2,
245
- "grad_norm": 0.10761396399791698,
246
  "learning_rate": 0.00018972388606192125,
247
- "loss": 0.4304,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.2,
252
- "grad_norm": 0.10975376180434356,
253
  "learning_rate": 0.0001888282612749132,
254
- "loss": 0.4646,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.21,
259
- "grad_norm": 0.12848879670359908,
260
  "learning_rate": 0.00018789751908224338,
261
- "loss": 0.4972,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.21,
266
- "grad_norm": 0.11904721819683833,
267
  "learning_rate": 0.00018693202744395827,
268
- "loss": 0.505,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.22,
273
- "grad_norm": 0.12249852034224981,
274
  "learning_rate": 0.00018593216805796612,
275
  "loss": 0.5396,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.23,
280
- "grad_norm": 0.12453395046646995,
281
  "learning_rate": 0.00018489833620913642,
282
- "loss": 0.4917,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.23,
287
- "grad_norm": 0.12585770374422164,
288
  "learning_rate": 0.00018383094061302766,
289
- "loss": 0.5079,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.24,
294
- "grad_norm": 0.11095271476322731,
295
  "learning_rate": 0.00018273040325430574,
296
- "loss": 0.4812,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.24,
301
- "grad_norm": 0.12968115101635422,
302
  "learning_rate": 0.00018159715921991612,
303
- "loss": 0.5106,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.25,
308
- "grad_norm": 0.10933018515590627,
309
  "learning_rate": 0.00018043165652707649,
310
- "loss": 0.4403,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.25,
315
- "eval_loss": 0.9767947196960449,
316
- "eval_runtime": 152.0343,
317
- "eval_samples_per_second": 1.638,
318
- "eval_steps_per_second": 0.822,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.26,
323
- "grad_norm": 0.11525719626792096,
324
  "learning_rate": 0.00017923435594615744,
325
- "loss": 0.482,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.26,
330
- "grad_norm": 0.12962154411778218,
331
  "learning_rate": 0.00017800573081852122,
332
- "loss": 0.5452,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.27,
337
- "grad_norm": 0.12555700120045588,
338
  "learning_rate": 0.0001767462668693908,
339
- "loss": 0.5084,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.27,
344
- "grad_norm": 0.11427565378293324,
345
  "learning_rate": 0.00017545646201582303,
346
- "loss": 0.5191,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.28,
351
- "grad_norm": 0.10974901402857151,
352
  "learning_rate": 0.00017413682616986185,
353
- "loss": 0.4703,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.29,
358
- "grad_norm": 0.11781465084480325,
359
  "learning_rate": 0.00017278788103694943,
360
- "loss": 0.4548,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.29,
365
- "grad_norm": 0.10781807228559999,
366
  "learning_rate": 0.000171410159909675,
367
- "loss": 0.476,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.3,
372
- "grad_norm": 0.12502639462035098,
373
  "learning_rate": 0.00017000420745694254,
374
- "loss": 0.5084,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.3,
379
- "grad_norm": 0.10718920826593327,
380
  "learning_rate": 0.00016857057950864132,
381
- "loss": 0.5093,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 0.31,
386
- "grad_norm": 0.10040549880547282,
387
  "learning_rate": 0.0001671098428359037,
388
- "loss": 0.4644,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 0.32,
393
- "grad_norm": 0.11778478994740472,
394
  "learning_rate": 0.00016562257492703757,
395
- "loss": 0.4725,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 0.32,
400
- "grad_norm": 0.1008386031049932,
401
  "learning_rate": 0.000164109363759222,
402
- "loss": 0.5121,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 0.33,
407
- "grad_norm": 0.1170302528140235,
408
  "learning_rate": 0.000162570807566056,
409
- "loss": 0.4766,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 0.33,
414
- "grad_norm": 0.1104526773884303,
415
  "learning_rate": 0.00016100751460105243,
416
- "loss": 0.4886,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 0.34,
421
- "grad_norm": 0.10467920768691032,
422
  "learning_rate": 0.00015942010289717105,
423
- "loss": 0.4703,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 0.34,
428
- "grad_norm": 0.11551406829220555,
429
  "learning_rate": 0.00015780920002248484,
430
- "loss": 0.4837,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 0.35,
435
- "grad_norm": 0.11133818831887894,
436
  "learning_rate": 0.0001561754428320771,
437
- "loss": 0.5148,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 0.36,
442
- "grad_norm": 0.11281448423273216,
443
  "learning_rate": 0.00015451947721626676,
444
- "loss": 0.4561,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 0.36,
449
- "grad_norm": 0.13934126997471205,
450
  "learning_rate": 0.00015284195784526195,
451
- "loss": 0.5069,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 0.37,
456
- "grad_norm": 0.11851655387640142,
457
  "learning_rate": 0.00015114354791034225,
458
- "loss": 0.5094,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 0.37,
463
- "grad_norm": 0.12909148374566123,
464
  "learning_rate": 0.0001494249188616723,
465
- "loss": 0.581,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 0.38,
470
- "grad_norm": 0.11070161341925377,
471
  "learning_rate": 0.00014768675014285062,
472
- "loss": 0.4585,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 0.39,
477
- "grad_norm": 0.13308674882888374,
478
  "learning_rate": 0.00014592972892229778,
479
- "loss": 0.4974,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 0.39,
484
- "grad_norm": 0.12124588853708144,
485
  "learning_rate": 0.0001441545498215912,
486
- "loss": 0.4463,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 0.4,
491
- "grad_norm": 0.1183570515369953,
492
  "learning_rate": 0.00014236191464085286,
493
- "loss": 0.447,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 0.4,
498
- "grad_norm": 0.13520024884417237,
499
  "learning_rate": 0.00014055253208129938,
500
- "loss": 0.5309,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 0.41,
505
- "grad_norm": 0.12184981458813801,
506
  "learning_rate": 0.00013872711746506413,
507
- "loss": 0.4532,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 0.42,
512
- "grad_norm": 0.12449299540645078,
513
  "learning_rate": 0.00013688639245240078,
514
- "loss": 0.5198,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 0.42,
519
- "grad_norm": 0.1383134750490429,
520
  "learning_rate": 0.00013503108475638244,
521
- "loss": 0.5629,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 0.43,
526
- "grad_norm": 0.246237001656926,
527
  "learning_rate": 0.0001331619278552068,
528
- "loss": 0.4869,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 0.43,
533
- "grad_norm": 0.13337703940933632,
534
  "learning_rate": 0.00013127966070222274,
535
- "loss": 0.4792,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 0.44,
540
- "grad_norm": 0.12428922033806454,
541
  "learning_rate": 0.00012938502743379212,
542
- "loss": 0.4825,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 0.45,
547
- "grad_norm": 0.13290774912900208,
548
  "learning_rate": 0.00012747877707510252,
549
- "loss": 0.5138,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 0.45,
554
- "grad_norm": 0.11185975046756892,
555
  "learning_rate": 0.0001255616632440475,
556
- "loss": 0.4815,
557
  "step": 76
558
  },
559
  {
560
  "epoch": 0.46,
561
- "grad_norm": 0.1130592868215497,
562
  "learning_rate": 0.0001236344438532905,
563
- "loss": 0.5046,
564
  "step": 77
565
  },
566
  {
567
  "epoch": 0.46,
568
- "grad_norm": 0.12882943465594857,
569
  "learning_rate": 0.0001216978808106318,
570
  "loss": 0.5091,
571
  "step": 78
572
  },
573
  {
574
  "epoch": 0.47,
575
- "grad_norm": 0.14837896297082676,
576
  "learning_rate": 0.00011975273971779528,
577
- "loss": 0.5158,
578
  "step": 79
579
  },
580
  {
581
  "epoch": 0.48,
582
- "grad_norm": 0.1265223309856292,
583
  "learning_rate": 0.00011779978956775506,
584
- "loss": 0.5068,
585
  "step": 80
586
  },
587
  {
588
  "epoch": 0.48,
589
- "grad_norm": 0.14042502330520407,
590
  "learning_rate": 0.0001158398024407215,
591
- "loss": 0.5061,
592
  "step": 81
593
  },
594
  {
595
  "epoch": 0.49,
596
- "grad_norm": 0.1261526695491767,
597
  "learning_rate": 0.00011387355319890685,
598
- "loss": 0.4691,
599
  "step": 82
600
  },
601
  {
602
  "epoch": 0.49,
603
- "grad_norm": 0.12007305451001854,
604
  "learning_rate": 0.00011190181918019049,
605
- "loss": 0.4753,
606
  "step": 83
607
  },
608
  {
609
  "epoch": 0.5,
610
- "grad_norm": 0.12809956897166885,
611
  "learning_rate": 0.00010992537989080618,
612
- "loss": 0.4417,
613
  "step": 84
614
  },
615
  {
616
  "epoch": 0.5,
617
- "eval_loss": 0.9675251841545105,
618
- "eval_runtime": 152.4793,
619
- "eval_samples_per_second": 1.633,
620
- "eval_steps_per_second": 0.82,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 0.51,
625
- "grad_norm": 0.11858329804793687,
626
  "learning_rate": 0.00010794501669717145,
627
- "loss": 0.4868,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 0.51,
632
- "grad_norm": 0.10984649953887334,
633
  "learning_rate": 0.00010596151251698199,
634
- "loss": 0.4598,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 0.52,
639
- "grad_norm": 0.10927203986256682,
640
  "learning_rate": 0.0001039756515096926,
641
- "loss": 0.4693,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 0.52,
646
- "grad_norm": 0.11205046531522328,
647
  "learning_rate": 0.00010198821876650701,
648
- "loss": 0.4921,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 0.53,
653
- "grad_norm": 0.13232347270009215,
654
  "learning_rate": 0.0001,
655
- "loss": 0.4695,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 0.53,
660
- "grad_norm": 0.12136881873560385,
661
  "learning_rate": 9.801178123349298e-05,
662
- "loss": 0.4859,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 0.54,
667
- "grad_norm": 0.14347476421156694,
668
  "learning_rate": 9.602434849030745e-05,
669
- "loss": 0.4796,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 0.55,
674
- "grad_norm": 0.13956845267055204,
675
  "learning_rate": 9.403848748301802e-05,
676
- "loss": 0.5339,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 0.55,
681
- "grad_norm": 0.12814010903196785,
682
  "learning_rate": 9.205498330282856e-05,
683
- "loss": 0.5267,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 0.56,
688
- "grad_norm": 0.12798850330908082,
689
  "learning_rate": 9.007462010919386e-05,
690
- "loss": 0.4604,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 0.56,
695
- "grad_norm": 0.13673366056605873,
696
  "learning_rate": 8.809818081980953e-05,
697
- "loss": 0.49,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 0.57,
702
- "grad_norm": 0.12607483394599764,
703
  "learning_rate": 8.612644680109319e-05,
704
- "loss": 0.4774,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 0.58,
709
- "grad_norm": 0.1365629261848207,
710
  "learning_rate": 8.416019755927851e-05,
711
- "loss": 0.4827,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 0.58,
716
- "grad_norm": 0.12122559291940836,
717
  "learning_rate": 8.2200210432245e-05,
718
- "loss": 0.5044,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 0.59,
723
- "grad_norm": 0.11655390642565265,
724
  "learning_rate": 8.024726028220474e-05,
725
- "loss": 0.503,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 0.59,
730
- "grad_norm": 0.12394574502796742,
731
  "learning_rate": 7.83021191893682e-05,
732
- "loss": 0.491,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 0.6,
737
- "grad_norm": 0.14922841699852962,
738
  "learning_rate": 7.636555614670953e-05,
739
- "loss": 0.457,
740
  "step": 101
741
  },
742
  {
743
  "epoch": 0.61,
744
- "grad_norm": 0.11076924096187928,
745
  "learning_rate": 7.443833675595255e-05,
746
- "loss": 0.4603,
747
  "step": 102
748
  },
749
  {
750
  "epoch": 0.61,
751
- "grad_norm": 0.1263594611752413,
752
  "learning_rate": 7.252122292489747e-05,
753
- "loss": 0.4859,
754
  "step": 103
755
  },
756
  {
757
  "epoch": 0.62,
758
- "grad_norm": 0.11432575178505003,
759
  "learning_rate": 7.061497256620793e-05,
760
- "loss": 0.4627,
761
  "step": 104
762
  },
763
  {
764
  "epoch": 0.62,
765
- "grad_norm": 0.1078119391965793,
766
  "learning_rate": 6.87203392977773e-05,
767
- "loss": 0.4829,
768
  "step": 105
769
  },
770
  {
771
  "epoch": 0.63,
772
- "grad_norm": 0.12752089816514908,
773
  "learning_rate": 6.683807214479323e-05,
774
- "loss": 0.46,
775
  "step": 106
776
  },
777
  {
778
  "epoch": 0.64,
779
- "grad_norm": 0.11421623043902956,
780
  "learning_rate": 6.496891524361757e-05,
781
- "loss": 0.4429,
782
  "step": 107
783
  },
784
  {
785
  "epoch": 0.64,
786
- "grad_norm": 0.10432253193399477,
787
  "learning_rate": 6.311360754759923e-05,
788
- "loss": 0.402,
789
  "step": 108
790
  },
791
  {
792
  "epoch": 0.65,
793
- "grad_norm": 0.12155248673662734,
794
  "learning_rate": 6.127288253493591e-05,
795
- "loss": 0.5126,
796
  "step": 109
797
  },
798
  {
799
  "epoch": 0.65,
800
- "grad_norm": 0.14266947863559803,
801
  "learning_rate": 5.9447467918700614e-05,
802
- "loss": 0.4821,
803
  "step": 110
804
  },
805
  {
806
  "epoch": 0.66,
807
- "grad_norm": 0.14851250761112514,
808
  "learning_rate": 5.763808535914723e-05,
809
- "loss": 0.4891,
810
  "step": 111
811
  },
812
  {
813
  "epoch": 0.67,
814
- "grad_norm": 0.14264023747361737,
815
  "learning_rate": 5.584545017840885e-05,
816
- "loss": 0.5181,
817
  "step": 112
818
  },
819
  {
820
  "epoch": 0.67,
821
- "grad_norm": 0.12837168363458795,
822
  "learning_rate": 5.407027107770219e-05,
823
- "loss": 0.5599,
824
  "step": 113
825
  },
826
  {
827
  "epoch": 0.68,
828
- "grad_norm": 0.11874709251257598,
829
  "learning_rate": 5.2313249857149414e-05,
830
- "loss": 0.4536,
831
  "step": 114
832
  },
833
  {
834
  "epoch": 0.68,
835
- "grad_norm": 0.12010754957532713,
836
  "learning_rate": 5.0575081138327715e-05,
837
- "loss": 0.5004,
838
  "step": 115
839
  },
840
  {
841
  "epoch": 0.69,
842
- "grad_norm": 0.13464124440677885,
843
  "learning_rate": 4.885645208965779e-05,
844
- "loss": 0.4985,
845
  "step": 116
846
  },
847
  {
848
  "epoch": 0.7,
849
- "grad_norm": 0.13701854261941088,
850
  "learning_rate": 4.715804215473809e-05,
851
- "loss": 0.4709,
852
  "step": 117
853
  },
854
  {
855
  "epoch": 0.7,
856
- "grad_norm": 0.1335483738873249,
857
  "learning_rate": 4.548052278373327e-05,
858
- "loss": 0.4735,
859
  "step": 118
860
  },
861
  {
862
  "epoch": 0.71,
863
- "grad_norm": 0.13603172024059101,
864
  "learning_rate": 4.382455716792291e-05,
865
- "loss": 0.4721,
866
  "step": 119
867
  },
868
  {
869
  "epoch": 0.71,
870
- "grad_norm": 0.13843339239058639,
871
  "learning_rate": 4.219079997751515e-05,
872
  "loss": 0.4954,
873
  "step": 120
874
  },
875
  {
876
  "epoch": 0.72,
877
- "grad_norm": 0.15011169526780793,
878
  "learning_rate": 4.0579897102828966e-05,
879
- "loss": 0.4648,
880
  "step": 121
881
  },
882
  {
883
  "epoch": 0.73,
884
- "grad_norm": 0.13061595453081623,
885
  "learning_rate": 3.899248539894757e-05,
886
- "loss": 0.4801,
887
  "step": 122
888
  },
889
  {
890
  "epoch": 0.73,
891
- "grad_norm": 0.14067787924603412,
892
  "learning_rate": 3.7429192433944014e-05,
893
- "loss": 0.4805,
894
  "step": 123
895
  },
896
  {
897
  "epoch": 0.74,
898
- "grad_norm": 0.13420057703295998,
899
  "learning_rate": 3.589063624077802e-05,
900
- "loss": 0.4446,
901
  "step": 124
902
  },
903
  {
904
  "epoch": 0.74,
905
- "grad_norm": 0.14083737654873127,
906
  "learning_rate": 3.4377425072962465e-05,
907
- "loss": 0.46,
908
  "step": 125
909
  },
910
  {
911
  "epoch": 0.75,
912
- "grad_norm": 0.13231889777376862,
913
  "learning_rate": 3.289015716409631e-05,
914
- "loss": 0.4451,
915
  "step": 126
916
  },
917
  {
918
  "epoch": 0.75,
919
- "eval_loss": 0.9651579260826111,
920
- "eval_runtime": 155.5959,
921
- "eval_samples_per_second": 1.6,
922
- "eval_steps_per_second": 0.803,
923
  "step": 126
924
  }
925
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 0.29289755909939047,
14
  "learning_rate": 2e-05,
15
  "loss": 0.6274,
16
  "step": 1
 
18
  {
19
  "epoch": 0.01,
20
  "eval_loss": 1.0297596454620361,
21
+ "eval_runtime": 153.0715,
22
+ "eval_samples_per_second": 1.627,
23
+ "eval_steps_per_second": 0.817,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.01,
28
+ "grad_norm": 0.27956410941469395,
29
  "learning_rate": 4e-05,
30
  "loss": 0.6362,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.02,
35
+ "grad_norm": 0.3009590515152092,
36
  "learning_rate": 6e-05,
37
  "loss": 0.6299,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.02,
42
+ "grad_norm": 0.34354546270515235,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.6395,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.03,
49
+ "grad_norm": 0.2470961998205002,
50
  "learning_rate": 0.0001,
51
+ "loss": 0.6068,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.04,
56
+ "grad_norm": 0.2071993792912813,
57
  "learning_rate": 0.00012,
58
+ "loss": 0.5993,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.04,
63
+ "grad_norm": 0.20237019487582247,
64
  "learning_rate": 0.00014,
65
+ "loss": 0.6293,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.05,
70
+ "grad_norm": 0.13810925455451734,
71
  "learning_rate": 0.00016,
72
+ "loss": 0.5101,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.05,
77
+ "grad_norm": 0.22762469698117493,
78
  "learning_rate": 0.00018,
79
+ "loss": 0.5527,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.06,
84
+ "grad_norm": 0.15718586910417978,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.5437,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.07,
91
+ "grad_norm": 0.13825435646308676,
92
  "learning_rate": 0.00019998023297700658,
93
+ "loss": 0.5858,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.07,
98
+ "grad_norm": 0.15072585803075744,
99
  "learning_rate": 0.00019992093972273018,
100
+ "loss": 0.554,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.08,
105
+ "grad_norm": 0.13616736806741778,
106
  "learning_rate": 0.00019982214367819328,
107
+ "loss": 0.6199,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.08,
112
+ "grad_norm": 0.15841398198140327,
113
  "learning_rate": 0.0001996838839014696,
114
  "loss": 0.5495,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.09,
119
+ "grad_norm": 0.16548492090565414,
120
  "learning_rate": 0.00019950621505224273,
121
+ "loss": 0.5043,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.1,
126
+ "grad_norm": 0.16965923620151113,
127
  "learning_rate": 0.00019928920737019733,
128
  "loss": 0.5083,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.1,
133
+ "grad_norm": 0.1853247768877015,
134
  "learning_rate": 0.0001990329466472502,
135
+ "loss": 0.6318,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.11,
140
+ "grad_norm": 0.1824469971767483,
141
  "learning_rate": 0.00019873753419363336,
142
+ "loss": 0.4809,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.11,
147
+ "grad_norm": 0.13288488939703025,
148
  "learning_rate": 0.00019840308679784207,
149
+ "loss": 0.4974,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.12,
154
+ "grad_norm": 0.18507602579352214,
155
  "learning_rate": 0.00019802973668046363,
156
+ "loss": 0.5288,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.12,
161
+ "grad_norm": 0.13591012925799634,
162
  "learning_rate": 0.0001976176314419051,
163
+ "loss": 0.5291,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.13,
168
+ "grad_norm": 0.13704233024715257,
169
  "learning_rate": 0.000197166934004041,
170
  "loss": 0.4819,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.14,
175
+ "grad_norm": 0.15362008062157553,
176
  "learning_rate": 0.00019667782254580374,
177
+ "loss": 0.5408,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.14,
182
+ "grad_norm": 0.11171032733778614,
183
  "learning_rate": 0.00019615049043274205,
184
+ "loss": 0.5101,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.15,
189
+ "grad_norm": 0.11449051252849428,
190
  "learning_rate": 0.00019558514614057609,
191
+ "loss": 0.5209,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.15,
196
+ "grad_norm": 0.11611407915744347,
197
  "learning_rate": 0.00019498201317277828,
198
+ "loss": 0.5005,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.16,
203
+ "grad_norm": 0.12735633641627706,
204
  "learning_rate": 0.00019434132997221345,
205
+ "loss": 0.4741,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.17,
210
+ "grad_norm": 0.11871518327376328,
211
  "learning_rate": 0.0001936633498268728,
212
+ "loss": 0.5213,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.17,
217
+ "grad_norm": 0.11403376465541806,
218
  "learning_rate": 0.0001929483407697387,
219
+ "loss": 0.4842,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.18,
224
+ "grad_norm": 0.11675155934307391,
225
  "learning_rate": 0.00019219658547282067,
226
+ "loss": 0.4825,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.18,
231
+ "grad_norm": 0.10789338581384152,
232
  "learning_rate": 0.00019140838113540346,
233
+ "loss": 0.4866,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.19,
238
+ "grad_norm": 0.14336837303756964,
239
  "learning_rate": 0.00019058403936655233,
240
+ "loss": 0.5325,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.2,
245
+ "grad_norm": 0.10401694793599091,
246
  "learning_rate": 0.00018972388606192125,
247
+ "loss": 0.4292,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.2,
252
+ "grad_norm": 0.10614245315138679,
253
  "learning_rate": 0.0001888282612749132,
254
+ "loss": 0.4638,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.21,
259
+ "grad_norm": 0.1250509143492961,
260
  "learning_rate": 0.00018789751908224338,
261
+ "loss": 0.4963,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.21,
266
+ "grad_norm": 0.12668831423294083,
267
  "learning_rate": 0.00018693202744395827,
268
+ "loss": 0.5043,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.22,
273
+ "grad_norm": 0.11594924793976216,
274
  "learning_rate": 0.00018593216805796612,
275
  "loss": 0.5396,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.23,
280
+ "grad_norm": 0.12365894749489846,
281
  "learning_rate": 0.00018489833620913642,
282
+ "loss": 0.4899,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.23,
287
+ "grad_norm": 0.13164318970805183,
288
  "learning_rate": 0.00018383094061302766,
289
+ "loss": 0.5065,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.24,
294
+ "grad_norm": 0.10587870431925453,
295
  "learning_rate": 0.00018273040325430574,
296
+ "loss": 0.4805,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.24,
301
+ "grad_norm": 0.12134503625595237,
302
  "learning_rate": 0.00018159715921991612,
303
+ "loss": 0.5103,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.25,
308
+ "grad_norm": 0.10855490758268896,
309
  "learning_rate": 0.00018043165652707649,
310
+ "loss": 0.44,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.25,
315
+ "eval_loss": 0.9770342707633972,
316
+ "eval_runtime": 154.3933,
317
+ "eval_samples_per_second": 1.613,
318
+ "eval_steps_per_second": 0.81,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.26,
323
+ "grad_norm": 0.11753509152196469,
324
  "learning_rate": 0.00017923435594615744,
325
+ "loss": 0.4819,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.26,
330
+ "grad_norm": 0.13774603727779988,
331
  "learning_rate": 0.00017800573081852122,
332
+ "loss": 0.5451,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.27,
337
+ "grad_norm": 0.11994636996852912,
338
  "learning_rate": 0.0001767462668693908,
339
+ "loss": 0.5079,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.27,
344
+ "grad_norm": 0.11803018063108017,
345
  "learning_rate": 0.00017545646201582303,
346
+ "loss": 0.5183,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.28,
351
+ "grad_norm": 0.12122026879022209,
352
  "learning_rate": 0.00017413682616986185,
353
+ "loss": 0.4692,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.29,
358
+ "grad_norm": 0.12840154129375927,
359
  "learning_rate": 0.00017278788103694943,
360
+ "loss": 0.4538,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.29,
365
+ "grad_norm": 0.10754191454075242,
366
  "learning_rate": 0.000171410159909675,
367
+ "loss": 0.4745,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.3,
372
+ "grad_norm": 0.10980392972154758,
373
  "learning_rate": 0.00017000420745694254,
374
+ "loss": 0.5077,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.3,
379
+ "grad_norm": 0.1018314102740997,
380
  "learning_rate": 0.00016857057950864132,
381
+ "loss": 0.5077,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 0.31,
386
+ "grad_norm": 0.10572512690181787,
387
  "learning_rate": 0.0001671098428359037,
388
+ "loss": 0.4637,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 0.32,
393
+ "grad_norm": 0.13106813432864392,
394
  "learning_rate": 0.00016562257492703757,
395
+ "loss": 0.4718,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 0.32,
400
+ "grad_norm": 0.11024484284605006,
401
  "learning_rate": 0.000164109363759222,
402
+ "loss": 0.5115,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 0.33,
407
+ "grad_norm": 0.12388815222110366,
408
  "learning_rate": 0.000162570807566056,
409
+ "loss": 0.4756,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 0.33,
414
+ "grad_norm": 0.12702188830349206,
415
  "learning_rate": 0.00016100751460105243,
416
+ "loss": 0.4881,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 0.34,
421
+ "grad_norm": 0.11086308659932648,
422
  "learning_rate": 0.00015942010289717105,
423
+ "loss": 0.4701,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 0.34,
428
+ "grad_norm": 0.11517890306226485,
429
  "learning_rate": 0.00015780920002248484,
430
+ "loss": 0.4835,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 0.35,
435
+ "grad_norm": 0.1253541013817757,
436
  "learning_rate": 0.0001561754428320771,
437
+ "loss": 0.5147,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 0.36,
442
+ "grad_norm": 0.11139862438389284,
443
  "learning_rate": 0.00015451947721626676,
444
+ "loss": 0.4552,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 0.36,
449
+ "grad_norm": 0.1152466912880715,
450
  "learning_rate": 0.00015284195784526195,
451
+ "loss": 0.5053,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 0.37,
456
+ "grad_norm": 0.11719477948785388,
457
  "learning_rate": 0.00015114354791034225,
458
+ "loss": 0.5079,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 0.37,
463
+ "grad_norm": 0.12730957754598396,
464
  "learning_rate": 0.0001494249188616723,
465
+ "loss": 0.58,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 0.38,
470
+ "grad_norm": 0.11907511614241814,
471
  "learning_rate": 0.00014768675014285062,
472
+ "loss": 0.4579,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 0.39,
477
+ "grad_norm": 0.13613222195950112,
478
  "learning_rate": 0.00014592972892229778,
479
+ "loss": 0.4965,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 0.39,
484
+ "grad_norm": 0.11610538032286319,
485
  "learning_rate": 0.0001441545498215912,
486
+ "loss": 0.4467,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 0.4,
491
+ "grad_norm": 0.1046724680527729,
492
  "learning_rate": 0.00014236191464085286,
493
+ "loss": 0.4463,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 0.4,
498
+ "grad_norm": 0.11631551012538931,
499
  "learning_rate": 0.00014055253208129938,
500
+ "loss": 0.5304,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 0.41,
505
+ "grad_norm": 0.11792788666231199,
506
  "learning_rate": 0.00013872711746506413,
507
+ "loss": 0.453,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 0.42,
512
+ "grad_norm": 0.11566019183159004,
513
  "learning_rate": 0.00013688639245240078,
514
+ "loss": 0.5192,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 0.42,
519
+ "grad_norm": 0.12967661214418025,
520
  "learning_rate": 0.00013503108475638244,
521
+ "loss": 0.5625,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 0.43,
526
+ "grad_norm": 0.10928605499181634,
527
  "learning_rate": 0.0001331619278552068,
528
+ "loss": 0.4861,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 0.43,
533
+ "grad_norm": 0.12485741141890881,
534
  "learning_rate": 0.00013127966070222274,
535
+ "loss": 0.4782,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 0.44,
540
+ "grad_norm": 0.1271574869759,
541
  "learning_rate": 0.00012938502743379212,
542
+ "loss": 0.4819,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 0.45,
547
+ "grad_norm": 0.14528941719728583,
548
  "learning_rate": 0.00012747877707510252,
549
+ "loss": 0.5132,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 0.45,
554
+ "grad_norm": 0.12760881412243183,
555
  "learning_rate": 0.0001255616632440475,
556
+ "loss": 0.4817,
557
  "step": 76
558
  },
559
  {
560
  "epoch": 0.46,
561
+ "grad_norm": 0.1239773484446177,
562
  "learning_rate": 0.0001236344438532905,
563
+ "loss": 0.5045,
564
  "step": 77
565
  },
566
  {
567
  "epoch": 0.46,
568
+ "grad_norm": 0.1307497276864945,
569
  "learning_rate": 0.0001216978808106318,
570
  "loss": 0.5091,
571
  "step": 78
572
  },
573
  {
574
  "epoch": 0.47,
575
+ "grad_norm": 0.1355749101256534,
576
  "learning_rate": 0.00011975273971779528,
577
+ "loss": 0.5145,
578
  "step": 79
579
  },
580
  {
581
  "epoch": 0.48,
582
+ "grad_norm": 0.12394948316350356,
583
  "learning_rate": 0.00011779978956775506,
584
+ "loss": 0.5057,
585
  "step": 80
586
  },
587
  {
588
  "epoch": 0.48,
589
+ "grad_norm": 0.13424789799433426,
590
  "learning_rate": 0.0001158398024407215,
591
+ "loss": 0.5058,
592
  "step": 81
593
  },
594
  {
595
  "epoch": 0.49,
596
+ "grad_norm": 0.13054427599188898,
597
  "learning_rate": 0.00011387355319890685,
598
+ "loss": 0.4683,
599
  "step": 82
600
  },
601
  {
602
  "epoch": 0.49,
603
+ "grad_norm": 0.11996704114604598,
604
  "learning_rate": 0.00011190181918019049,
605
+ "loss": 0.4748,
606
  "step": 83
607
  },
608
  {
609
  "epoch": 0.5,
610
+ "grad_norm": 0.13071208464837492,
611
  "learning_rate": 0.00010992537989080618,
612
+ "loss": 0.4406,
613
  "step": 84
614
  },
615
  {
616
  "epoch": 0.5,
617
+ "eval_loss": 0.9652944803237915,
618
+ "eval_runtime": 153.5377,
619
+ "eval_samples_per_second": 1.622,
620
+ "eval_steps_per_second": 0.814,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 0.51,
625
+ "grad_norm": 0.11393059638442392,
626
  "learning_rate": 0.00010794501669717145,
627
+ "loss": 0.4877,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 0.51,
632
+ "grad_norm": 0.11364636282385192,
633
  "learning_rate": 0.00010596151251698199,
634
+ "loss": 0.4597,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 0.52,
639
+ "grad_norm": 0.12548992883216656,
640
  "learning_rate": 0.0001039756515096926,
641
+ "loss": 0.4685,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 0.52,
646
+ "grad_norm": 0.12825123946992636,
647
  "learning_rate": 0.00010198821876650701,
648
+ "loss": 0.4924,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 0.53,
653
+ "grad_norm": 0.12264617890724591,
654
  "learning_rate": 0.0001,
655
+ "loss": 0.4678,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 0.53,
660
+ "grad_norm": 0.1238990144553216,
661
  "learning_rate": 9.801178123349298e-05,
662
+ "loss": 0.4854,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 0.54,
667
+ "grad_norm": 0.13533454171482565,
668
  "learning_rate": 9.602434849030745e-05,
669
+ "loss": 0.4784,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 0.55,
674
+ "grad_norm": 0.13623990945679543,
675
  "learning_rate": 9.403848748301802e-05,
676
+ "loss": 0.5322,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 0.55,
681
+ "grad_norm": 0.16411366119133766,
682
  "learning_rate": 9.205498330282856e-05,
683
+ "loss": 0.5258,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 0.56,
688
+ "grad_norm": 0.11646811294381437,
689
  "learning_rate": 9.007462010919386e-05,
690
+ "loss": 0.4599,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 0.56,
695
+ "grad_norm": 0.12610649830308363,
696
  "learning_rate": 8.809818081980953e-05,
697
+ "loss": 0.4891,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 0.57,
702
+ "grad_norm": 0.12324596375061997,
703
  "learning_rate": 8.612644680109319e-05,
704
+ "loss": 0.4771,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 0.58,
709
+ "grad_norm": 0.12390720748290898,
710
  "learning_rate": 8.416019755927851e-05,
711
+ "loss": 0.4814,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 0.58,
716
+ "grad_norm": 0.11156155557793686,
717
  "learning_rate": 8.2200210432245e-05,
718
+ "loss": 0.5041,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 0.59,
723
+ "grad_norm": 0.11235071151397323,
724
  "learning_rate": 8.024726028220474e-05,
725
+ "loss": 0.5023,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 0.59,
730
+ "grad_norm": 0.12382934807374943,
731
  "learning_rate": 7.83021191893682e-05,
732
+ "loss": 0.4917,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 0.6,
737
+ "grad_norm": 0.13051565063971357,
738
  "learning_rate": 7.636555614670953e-05,
739
+ "loss": 0.4567,
740
  "step": 101
741
  },
742
  {
743
  "epoch": 0.61,
744
+ "grad_norm": 0.11975246976802223,
745
  "learning_rate": 7.443833675595255e-05,
746
+ "loss": 0.4584,
747
  "step": 102
748
  },
749
  {
750
  "epoch": 0.61,
751
+ "grad_norm": 0.1390325061190883,
752
  "learning_rate": 7.252122292489747e-05,
753
+ "loss": 0.4853,
754
  "step": 103
755
  },
756
  {
757
  "epoch": 0.62,
758
+ "grad_norm": 0.13092284300351015,
759
  "learning_rate": 7.061497256620793e-05,
760
+ "loss": 0.4623,
761
  "step": 104
762
  },
763
  {
764
  "epoch": 0.62,
765
+ "grad_norm": 0.1291139691922856,
766
  "learning_rate": 6.87203392977773e-05,
767
+ "loss": 0.4826,
768
  "step": 105
769
  },
770
  {
771
  "epoch": 0.63,
772
+ "grad_norm": 0.12630345566649634,
773
  "learning_rate": 6.683807214479323e-05,
774
+ "loss": 0.4597,
775
  "step": 106
776
  },
777
  {
778
  "epoch": 0.64,
779
+ "grad_norm": 0.13061642512554117,
780
  "learning_rate": 6.496891524361757e-05,
781
+ "loss": 0.4415,
782
  "step": 107
783
  },
784
  {
785
  "epoch": 0.64,
786
+ "grad_norm": 0.11414947835097713,
787
  "learning_rate": 6.311360754759923e-05,
788
+ "loss": 0.4011,
789
  "step": 108
790
  },
791
  {
792
  "epoch": 0.65,
793
+ "grad_norm": 0.11453014913188615,
794
  "learning_rate": 6.127288253493591e-05,
795
+ "loss": 0.5118,
796
  "step": 109
797
  },
798
  {
799
  "epoch": 0.65,
800
+ "grad_norm": 0.15000233747727326,
801
  "learning_rate": 5.9447467918700614e-05,
802
+ "loss": 0.482,
803
  "step": 110
804
  },
805
  {
806
  "epoch": 0.66,
807
+ "grad_norm": 0.14402898946913723,
808
  "learning_rate": 5.763808535914723e-05,
809
+ "loss": 0.4881,
810
  "step": 111
811
  },
812
  {
813
  "epoch": 0.67,
814
+ "grad_norm": 0.1371953423024685,
815
  "learning_rate": 5.584545017840885e-05,
816
+ "loss": 0.5178,
817
  "step": 112
818
  },
819
  {
820
  "epoch": 0.67,
821
+ "grad_norm": 0.14105581149485616,
822
  "learning_rate": 5.407027107770219e-05,
823
+ "loss": 0.5584,
824
  "step": 113
825
  },
826
  {
827
  "epoch": 0.68,
828
+ "grad_norm": 0.13646035299068404,
829
  "learning_rate": 5.2313249857149414e-05,
830
+ "loss": 0.4535,
831
  "step": 114
832
  },
833
  {
834
  "epoch": 0.68,
835
+ "grad_norm": 0.14741390311838884,
836
  "learning_rate": 5.0575081138327715e-05,
837
+ "loss": 0.5,
838
  "step": 115
839
  },
840
  {
841
  "epoch": 0.69,
842
+ "grad_norm": 0.12313483515418376,
843
  "learning_rate": 4.885645208965779e-05,
844
+ "loss": 0.4982,
845
  "step": 116
846
  },
847
  {
848
  "epoch": 0.7,
849
+ "grad_norm": 0.13192008123654636,
850
  "learning_rate": 4.715804215473809e-05,
851
+ "loss": 0.4698,
852
  "step": 117
853
  },
854
  {
855
  "epoch": 0.7,
856
+ "grad_norm": 0.11197839688620534,
857
  "learning_rate": 4.548052278373327e-05,
858
+ "loss": 0.4737,
859
  "step": 118
860
  },
861
  {
862
  "epoch": 0.71,
863
+ "grad_norm": 0.12574619847347493,
864
  "learning_rate": 4.382455716792291e-05,
865
+ "loss": 0.472,
866
  "step": 119
867
  },
868
  {
869
  "epoch": 0.71,
870
+ "grad_norm": 0.11785384512597769,
871
  "learning_rate": 4.219079997751515e-05,
872
  "loss": 0.4954,
873
  "step": 120
874
  },
875
  {
876
  "epoch": 0.72,
877
+ "grad_norm": 0.1371137498771671,
878
  "learning_rate": 4.0579897102828966e-05,
879
+ "loss": 0.4645,
880
  "step": 121
881
  },
882
  {
883
  "epoch": 0.73,
884
+ "grad_norm": 0.1562115085397725,
885
  "learning_rate": 3.899248539894757e-05,
886
+ "loss": 0.4798,
887
  "step": 122
888
  },
889
  {
890
  "epoch": 0.73,
891
+ "grad_norm": 0.14251919403809987,
892
  "learning_rate": 3.7429192433944014e-05,
893
+ "loss": 0.4794,
894
  "step": 123
895
  },
896
  {
897
  "epoch": 0.74,
898
+ "grad_norm": 0.13567896269328303,
899
  "learning_rate": 3.589063624077802e-05,
900
+ "loss": 0.4441,
901
  "step": 124
902
  },
903
  {
904
  "epoch": 0.74,
905
+ "grad_norm": 0.14154096161651117,
906
  "learning_rate": 3.4377425072962465e-05,
907
+ "loss": 0.4583,
908
  "step": 125
909
  },
910
  {
911
  "epoch": 0.75,
912
+ "grad_norm": 0.12685010104316322,
913
  "learning_rate": 3.289015716409631e-05,
914
+ "loss": 0.4445,
915
  "step": 126
916
  },
917
  {
918
  "epoch": 0.75,
919
+ "eval_loss": 0.9644750952720642,
920
+ "eval_runtime": 155.1633,
921
+ "eval_samples_per_second": 1.605,
922
+ "eval_steps_per_second": 0.806,
923
  "step": 126
924
  }
925
  ],
checkpoint-126/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbfaffe231fe183758cc2353296944fef75710807fa8ea7663978d9e906d00f7
3
  size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3baf97ea9bcc6b73a05f59bedb019ebf1656742a3d0668caac6a61c4da440046
3
  size 6968
checkpoint-168/adapter_config.json CHANGED
@@ -20,17 +20,17 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "x_proj",
24
- "o_proj",
25
- "up_proj",
26
  "v_proj",
27
- "down_proj",
28
  "dt_proj",
29
- "k_proj",
30
- "q_proj",
31
  "in_proj",
 
 
 
32
  "gate_proj",
33
- "out_proj",
 
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "q_proj",
 
 
24
  "v_proj",
 
25
  "dt_proj",
26
+ "out_proj",
 
27
  "in_proj",
28
+ "k_proj",
29
+ "x_proj",
30
+ "o_proj",
31
  "gate_proj",
32
+ "down_proj",
33
+ "up_proj",
34
  "router"
35
  ],
36
  "task_type": "CAUSAL_LM",
checkpoint-168/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e965e9e667252ca3c021de51e081f6a3831a14d282903458d11e44f878a73597
3
  size 531611600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f54d945abe2be9a1a659c9f6a5966e52d7ed377a90ad865b77df2ae56be278d1
3
  size 531611600
checkpoint-168/global_step168/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b66d8b3841b807a595a50e795a52d67de68eb94015081f06580ca93cd533b645
3
  size 797643792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4c09b43bea186099825610cb6037a20b5cdc7616c4b0c0cff35da8a99f88806
3
  size 797643792
checkpoint-168/global_step168/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94caa551e67c01642f6b55c16bf11b71089c2f0fbd09d4215bd26e34ed7a530
3
  size 797644432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fde27562524ca3badefc0ac640cd2761d58191ac2180933190780a60570d85
3
  size 797644432
checkpoint-168/global_step168/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8130cd37e9434fdc6e150b642bbebca3eeebac77da37779b2183486786db25a5
3
  size 1345660121
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f40c593123f55cdf67c443b279b6eac0236c47c74d4fd2671735a1e41a4f57e
3
  size 1345660121
checkpoint-168/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:753bf22647e1e0f236cfa8df696cc134d9e1c0684712d00bf72503af01afa161
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecd14f5486e42b398ac0fe7c317f329cc3f6385858e18676a22a6b18f29c226f
3
  size 14512
checkpoint-168/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c15347a58866f49cdc458b966fb754bbfbe4244548a2a26de0617bb3969cd63
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b1c2e52201fc6231525ffe165db2aee0cbfcfdfcb4e2a61aac311b2649dde0
3
  size 14512
checkpoint-168/trainer_state.json CHANGED
@@ -10,7 +10,7 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "grad_norm": 0.30475762367257353,
14
  "learning_rate": 2e-05,
15
  "loss": 0.6274,
16
  "step": 1
@@ -18,1210 +18,1210 @@
18
  {
19
  "epoch": 0.01,
20
  "eval_loss": 1.0297596454620361,
21
- "eval_runtime": 150.6014,
22
- "eval_samples_per_second": 1.653,
23
- "eval_steps_per_second": 0.83,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.01,
28
- "grad_norm": 0.2798137150913395,
29
  "learning_rate": 4e-05,
30
  "loss": 0.6362,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.02,
35
- "grad_norm": 0.3178684451319545,
36
  "learning_rate": 6e-05,
37
  "loss": 0.6299,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.02,
42
- "grad_norm": 0.33651284916847835,
43
  "learning_rate": 8e-05,
44
- "loss": 0.6391,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.03,
49
- "grad_norm": 0.23984915388648712,
50
  "learning_rate": 0.0001,
51
- "loss": 0.6071,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.04,
56
- "grad_norm": 0.20514040410017348,
57
  "learning_rate": 0.00012,
58
- "loss": 0.5996,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.04,
63
- "grad_norm": 0.1950018128286362,
64
  "learning_rate": 0.00014,
65
- "loss": 0.6298,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.05,
70
- "grad_norm": 0.14246019947393238,
71
  "learning_rate": 0.00016,
72
- "loss": 0.5108,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.05,
77
- "grad_norm": 0.15792014279750227,
78
  "learning_rate": 0.00018,
79
- "loss": 0.5529,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.06,
84
- "grad_norm": 0.1517889177511264,
85
  "learning_rate": 0.0002,
86
- "loss": 0.5433,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.07,
91
- "grad_norm": 0.1372121219437277,
92
  "learning_rate": 0.00019998023297700658,
93
- "loss": 0.5856,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.07,
98
- "grad_norm": 0.16740807394942855,
99
  "learning_rate": 0.00019992093972273018,
100
- "loss": 0.5546,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.08,
105
- "grad_norm": 0.13512320693394078,
106
  "learning_rate": 0.00019982214367819328,
107
- "loss": 0.6193,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.08,
112
- "grad_norm": 0.16169796294070152,
113
  "learning_rate": 0.0001996838839014696,
114
  "loss": 0.5495,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.09,
119
- "grad_norm": 0.16796913812281988,
120
  "learning_rate": 0.00019950621505224273,
121
- "loss": 0.5035,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.1,
126
- "grad_norm": 0.1800514764162192,
127
  "learning_rate": 0.00019928920737019733,
128
  "loss": 0.5083,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.1,
133
- "grad_norm": 0.170432124866908,
134
  "learning_rate": 0.0001990329466472502,
135
- "loss": 0.632,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.11,
140
- "grad_norm": 0.19129325489749488,
141
  "learning_rate": 0.00019873753419363336,
142
- "loss": 0.4813,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.11,
147
- "grad_norm": 0.1459357988760762,
148
  "learning_rate": 0.00019840308679784207,
149
- "loss": 0.4973,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.12,
154
- "grad_norm": 0.192594730984382,
155
  "learning_rate": 0.00019802973668046363,
156
- "loss": 0.5291,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.12,
161
- "grad_norm": 0.859025467969139,
162
  "learning_rate": 0.0001976176314419051,
163
- "loss": 0.5296,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.13,
168
- "grad_norm": 0.13366297885670222,
169
  "learning_rate": 0.000197166934004041,
170
  "loss": 0.4819,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.14,
175
- "grad_norm": 0.15698714419747645,
176
  "learning_rate": 0.00019667782254580374,
177
- "loss": 0.5409,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.14,
182
- "grad_norm": 0.10995943735837355,
183
  "learning_rate": 0.00019615049043274205,
184
- "loss": 0.5108,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.15,
189
- "grad_norm": 0.10796742192788925,
190
  "learning_rate": 0.00019558514614057609,
191
- "loss": 0.5215,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.15,
196
- "grad_norm": 0.11641740089490231,
197
  "learning_rate": 0.00019498201317277828,
198
- "loss": 0.5012,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.16,
203
- "grad_norm": 0.1120175962893241,
204
  "learning_rate": 0.00019434132997221345,
205
- "loss": 0.474,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.17,
210
- "grad_norm": 0.1218171278782483,
211
  "learning_rate": 0.0001936633498268728,
212
- "loss": 0.5216,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.17,
217
- "grad_norm": 0.11718521115928844,
218
  "learning_rate": 0.0001929483407697387,
219
- "loss": 0.4856,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.18,
224
- "grad_norm": 0.12611471038571026,
225
  "learning_rate": 0.00019219658547282067,
226
- "loss": 0.4823,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.18,
231
- "grad_norm": 0.11106871615269753,
232
  "learning_rate": 0.00019140838113540346,
233
- "loss": 0.4869,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.19,
238
- "grad_norm": 0.1416503230360699,
239
  "learning_rate": 0.00019058403936655233,
240
- "loss": 0.5341,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.2,
245
- "grad_norm": 0.10761396399791698,
246
  "learning_rate": 0.00018972388606192125,
247
- "loss": 0.4304,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.2,
252
- "grad_norm": 0.10975376180434356,
253
  "learning_rate": 0.0001888282612749132,
254
- "loss": 0.4646,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.21,
259
- "grad_norm": 0.12848879670359908,
260
  "learning_rate": 0.00018789751908224338,
261
- "loss": 0.4972,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.21,
266
- "grad_norm": 0.11904721819683833,
267
  "learning_rate": 0.00018693202744395827,
268
- "loss": 0.505,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.22,
273
- "grad_norm": 0.12249852034224981,
274
  "learning_rate": 0.00018593216805796612,
275
  "loss": 0.5396,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.23,
280
- "grad_norm": 0.12453395046646995,
281
  "learning_rate": 0.00018489833620913642,
282
- "loss": 0.4917,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.23,
287
- "grad_norm": 0.12585770374422164,
288
  "learning_rate": 0.00018383094061302766,
289
- "loss": 0.5079,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.24,
294
- "grad_norm": 0.11095271476322731,
295
  "learning_rate": 0.00018273040325430574,
296
- "loss": 0.4812,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.24,
301
- "grad_norm": 0.12968115101635422,
302
  "learning_rate": 0.00018159715921991612,
303
- "loss": 0.5106,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.25,
308
- "grad_norm": 0.10933018515590627,
309
  "learning_rate": 0.00018043165652707649,
310
- "loss": 0.4403,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.25,
315
- "eval_loss": 0.9767947196960449,
316
- "eval_runtime": 152.0343,
317
- "eval_samples_per_second": 1.638,
318
- "eval_steps_per_second": 0.822,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.26,
323
- "grad_norm": 0.11525719626792096,
324
  "learning_rate": 0.00017923435594615744,
325
- "loss": 0.482,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.26,
330
- "grad_norm": 0.12962154411778218,
331
  "learning_rate": 0.00017800573081852122,
332
- "loss": 0.5452,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.27,
337
- "grad_norm": 0.12555700120045588,
338
  "learning_rate": 0.0001767462668693908,
339
- "loss": 0.5084,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.27,
344
- "grad_norm": 0.11427565378293324,
345
  "learning_rate": 0.00017545646201582303,
346
- "loss": 0.5191,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.28,
351
- "grad_norm": 0.10974901402857151,
352
  "learning_rate": 0.00017413682616986185,
353
- "loss": 0.4703,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.29,
358
- "grad_norm": 0.11781465084480325,
359
  "learning_rate": 0.00017278788103694943,
360
- "loss": 0.4548,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.29,
365
- "grad_norm": 0.10781807228559999,
366
  "learning_rate": 0.000171410159909675,
367
- "loss": 0.476,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.3,
372
- "grad_norm": 0.12502639462035098,
373
  "learning_rate": 0.00017000420745694254,
374
- "loss": 0.5084,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.3,
379
- "grad_norm": 0.10718920826593327,
380
  "learning_rate": 0.00016857057950864132,
381
- "loss": 0.5093,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 0.31,
386
- "grad_norm": 0.10040549880547282,
387
  "learning_rate": 0.0001671098428359037,
388
- "loss": 0.4644,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 0.32,
393
- "grad_norm": 0.11778478994740472,
394
  "learning_rate": 0.00016562257492703757,
395
- "loss": 0.4725,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 0.32,
400
- "grad_norm": 0.1008386031049932,
401
  "learning_rate": 0.000164109363759222,
402
- "loss": 0.5121,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 0.33,
407
- "grad_norm": 0.1170302528140235,
408
  "learning_rate": 0.000162570807566056,
409
- "loss": 0.4766,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 0.33,
414
- "grad_norm": 0.1104526773884303,
415
  "learning_rate": 0.00016100751460105243,
416
- "loss": 0.4886,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 0.34,
421
- "grad_norm": 0.10467920768691032,
422
  "learning_rate": 0.00015942010289717105,
423
- "loss": 0.4703,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 0.34,
428
- "grad_norm": 0.11551406829220555,
429
  "learning_rate": 0.00015780920002248484,
430
- "loss": 0.4837,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 0.35,
435
- "grad_norm": 0.11133818831887894,
436
  "learning_rate": 0.0001561754428320771,
437
- "loss": 0.5148,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 0.36,
442
- "grad_norm": 0.11281448423273216,
443
  "learning_rate": 0.00015451947721626676,
444
- "loss": 0.4561,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 0.36,
449
- "grad_norm": 0.13934126997471205,
450
  "learning_rate": 0.00015284195784526195,
451
- "loss": 0.5069,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 0.37,
456
- "grad_norm": 0.11851655387640142,
457
  "learning_rate": 0.00015114354791034225,
458
- "loss": 0.5094,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 0.37,
463
- "grad_norm": 0.12909148374566123,
464
  "learning_rate": 0.0001494249188616723,
465
- "loss": 0.581,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 0.38,
470
- "grad_norm": 0.11070161341925377,
471
  "learning_rate": 0.00014768675014285062,
472
- "loss": 0.4585,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 0.39,
477
- "grad_norm": 0.13308674882888374,
478
  "learning_rate": 0.00014592972892229778,
479
- "loss": 0.4974,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 0.39,
484
- "grad_norm": 0.12124588853708144,
485
  "learning_rate": 0.0001441545498215912,
486
- "loss": 0.4463,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 0.4,
491
- "grad_norm": 0.1183570515369953,
492
  "learning_rate": 0.00014236191464085286,
493
- "loss": 0.447,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 0.4,
498
- "grad_norm": 0.13520024884417237,
499
  "learning_rate": 0.00014055253208129938,
500
- "loss": 0.5309,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 0.41,
505
- "grad_norm": 0.12184981458813801,
506
  "learning_rate": 0.00013872711746506413,
507
- "loss": 0.4532,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 0.42,
512
- "grad_norm": 0.12449299540645078,
513
  "learning_rate": 0.00013688639245240078,
514
- "loss": 0.5198,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 0.42,
519
- "grad_norm": 0.1383134750490429,
520
  "learning_rate": 0.00013503108475638244,
521
- "loss": 0.5629,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 0.43,
526
- "grad_norm": 0.246237001656926,
527
  "learning_rate": 0.0001331619278552068,
528
- "loss": 0.4869,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 0.43,
533
- "grad_norm": 0.13337703940933632,
534
  "learning_rate": 0.00013127966070222274,
535
- "loss": 0.4792,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 0.44,
540
- "grad_norm": 0.12428922033806454,
541
  "learning_rate": 0.00012938502743379212,
542
- "loss": 0.4825,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 0.45,
547
- "grad_norm": 0.13290774912900208,
548
  "learning_rate": 0.00012747877707510252,
549
- "loss": 0.5138,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 0.45,
554
- "grad_norm": 0.11185975046756892,
555
  "learning_rate": 0.0001255616632440475,
556
- "loss": 0.4815,
557
  "step": 76
558
  },
559
  {
560
  "epoch": 0.46,
561
- "grad_norm": 0.1130592868215497,
562
  "learning_rate": 0.0001236344438532905,
563
- "loss": 0.5046,
564
  "step": 77
565
  },
566
  {
567
  "epoch": 0.46,
568
- "grad_norm": 0.12882943465594857,
569
  "learning_rate": 0.0001216978808106318,
570
  "loss": 0.5091,
571
  "step": 78
572
  },
573
  {
574
  "epoch": 0.47,
575
- "grad_norm": 0.14837896297082676,
576
  "learning_rate": 0.00011975273971779528,
577
- "loss": 0.5158,
578
  "step": 79
579
  },
580
  {
581
  "epoch": 0.48,
582
- "grad_norm": 0.1265223309856292,
583
  "learning_rate": 0.00011779978956775506,
584
- "loss": 0.5068,
585
  "step": 80
586
  },
587
  {
588
  "epoch": 0.48,
589
- "grad_norm": 0.14042502330520407,
590
  "learning_rate": 0.0001158398024407215,
591
- "loss": 0.5061,
592
  "step": 81
593
  },
594
  {
595
  "epoch": 0.49,
596
- "grad_norm": 0.1261526695491767,
597
  "learning_rate": 0.00011387355319890685,
598
- "loss": 0.4691,
599
  "step": 82
600
  },
601
  {
602
  "epoch": 0.49,
603
- "grad_norm": 0.12007305451001854,
604
  "learning_rate": 0.00011190181918019049,
605
- "loss": 0.4753,
606
  "step": 83
607
  },
608
  {
609
  "epoch": 0.5,
610
- "grad_norm": 0.12809956897166885,
611
  "learning_rate": 0.00010992537989080618,
612
- "loss": 0.4417,
613
  "step": 84
614
  },
615
  {
616
  "epoch": 0.5,
617
- "eval_loss": 0.9675251841545105,
618
- "eval_runtime": 152.4793,
619
- "eval_samples_per_second": 1.633,
620
- "eval_steps_per_second": 0.82,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 0.51,
625
- "grad_norm": 0.11858329804793687,
626
  "learning_rate": 0.00010794501669717145,
627
- "loss": 0.4868,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 0.51,
632
- "grad_norm": 0.10984649953887334,
633
  "learning_rate": 0.00010596151251698199,
634
- "loss": 0.4598,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 0.52,
639
- "grad_norm": 0.10927203986256682,
640
  "learning_rate": 0.0001039756515096926,
641
- "loss": 0.4693,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 0.52,
646
- "grad_norm": 0.11205046531522328,
647
  "learning_rate": 0.00010198821876650701,
648
- "loss": 0.4921,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 0.53,
653
- "grad_norm": 0.13232347270009215,
654
  "learning_rate": 0.0001,
655
- "loss": 0.4695,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 0.53,
660
- "grad_norm": 0.12136881873560385,
661
  "learning_rate": 9.801178123349298e-05,
662
- "loss": 0.4859,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 0.54,
667
- "grad_norm": 0.14347476421156694,
668
  "learning_rate": 9.602434849030745e-05,
669
- "loss": 0.4796,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 0.55,
674
- "grad_norm": 0.13956845267055204,
675
  "learning_rate": 9.403848748301802e-05,
676
- "loss": 0.5339,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 0.55,
681
- "grad_norm": 0.12814010903196785,
682
  "learning_rate": 9.205498330282856e-05,
683
- "loss": 0.5267,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 0.56,
688
- "grad_norm": 0.12798850330908082,
689
  "learning_rate": 9.007462010919386e-05,
690
- "loss": 0.4604,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 0.56,
695
- "grad_norm": 0.13673366056605873,
696
  "learning_rate": 8.809818081980953e-05,
697
- "loss": 0.49,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 0.57,
702
- "grad_norm": 0.12607483394599764,
703
  "learning_rate": 8.612644680109319e-05,
704
- "loss": 0.4774,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 0.58,
709
- "grad_norm": 0.1365629261848207,
710
  "learning_rate": 8.416019755927851e-05,
711
- "loss": 0.4827,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 0.58,
716
- "grad_norm": 0.12122559291940836,
717
  "learning_rate": 8.2200210432245e-05,
718
- "loss": 0.5044,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 0.59,
723
- "grad_norm": 0.11655390642565265,
724
  "learning_rate": 8.024726028220474e-05,
725
- "loss": 0.503,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 0.59,
730
- "grad_norm": 0.12394574502796742,
731
  "learning_rate": 7.83021191893682e-05,
732
- "loss": 0.491,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 0.6,
737
- "grad_norm": 0.14922841699852962,
738
  "learning_rate": 7.636555614670953e-05,
739
- "loss": 0.457,
740
  "step": 101
741
  },
742
  {
743
  "epoch": 0.61,
744
- "grad_norm": 0.11076924096187928,
745
  "learning_rate": 7.443833675595255e-05,
746
- "loss": 0.4603,
747
  "step": 102
748
  },
749
  {
750
  "epoch": 0.61,
751
- "grad_norm": 0.1263594611752413,
752
  "learning_rate": 7.252122292489747e-05,
753
- "loss": 0.4859,
754
  "step": 103
755
  },
756
  {
757
  "epoch": 0.62,
758
- "grad_norm": 0.11432575178505003,
759
  "learning_rate": 7.061497256620793e-05,
760
- "loss": 0.4627,
761
  "step": 104
762
  },
763
  {
764
  "epoch": 0.62,
765
- "grad_norm": 0.1078119391965793,
766
  "learning_rate": 6.87203392977773e-05,
767
- "loss": 0.4829,
768
  "step": 105
769
  },
770
  {
771
  "epoch": 0.63,
772
- "grad_norm": 0.12752089816514908,
773
  "learning_rate": 6.683807214479323e-05,
774
- "loss": 0.46,
775
  "step": 106
776
  },
777
  {
778
  "epoch": 0.64,
779
- "grad_norm": 0.11421623043902956,
780
  "learning_rate": 6.496891524361757e-05,
781
- "loss": 0.4429,
782
  "step": 107
783
  },
784
  {
785
  "epoch": 0.64,
786
- "grad_norm": 0.10432253193399477,
787
  "learning_rate": 6.311360754759923e-05,
788
- "loss": 0.402,
789
  "step": 108
790
  },
791
  {
792
  "epoch": 0.65,
793
- "grad_norm": 0.12155248673662734,
794
  "learning_rate": 6.127288253493591e-05,
795
- "loss": 0.5126,
796
  "step": 109
797
  },
798
  {
799
  "epoch": 0.65,
800
- "grad_norm": 0.14266947863559803,
801
  "learning_rate": 5.9447467918700614e-05,
802
- "loss": 0.4821,
803
  "step": 110
804
  },
805
  {
806
  "epoch": 0.66,
807
- "grad_norm": 0.14851250761112514,
808
  "learning_rate": 5.763808535914723e-05,
809
- "loss": 0.4891,
810
  "step": 111
811
  },
812
  {
813
  "epoch": 0.67,
814
- "grad_norm": 0.14264023747361737,
815
  "learning_rate": 5.584545017840885e-05,
816
- "loss": 0.5181,
817
  "step": 112
818
  },
819
  {
820
  "epoch": 0.67,
821
- "grad_norm": 0.12837168363458795,
822
  "learning_rate": 5.407027107770219e-05,
823
- "loss": 0.5599,
824
  "step": 113
825
  },
826
  {
827
  "epoch": 0.68,
828
- "grad_norm": 0.11874709251257598,
829
  "learning_rate": 5.2313249857149414e-05,
830
- "loss": 0.4536,
831
  "step": 114
832
  },
833
  {
834
  "epoch": 0.68,
835
- "grad_norm": 0.12010754957532713,
836
  "learning_rate": 5.0575081138327715e-05,
837
- "loss": 0.5004,
838
  "step": 115
839
  },
840
  {
841
  "epoch": 0.69,
842
- "grad_norm": 0.13464124440677885,
843
  "learning_rate": 4.885645208965779e-05,
844
- "loss": 0.4985,
845
  "step": 116
846
  },
847
  {
848
  "epoch": 0.7,
849
- "grad_norm": 0.13701854261941088,
850
  "learning_rate": 4.715804215473809e-05,
851
- "loss": 0.4709,
852
  "step": 117
853
  },
854
  {
855
  "epoch": 0.7,
856
- "grad_norm": 0.1335483738873249,
857
  "learning_rate": 4.548052278373327e-05,
858
- "loss": 0.4735,
859
  "step": 118
860
  },
861
  {
862
  "epoch": 0.71,
863
- "grad_norm": 0.13603172024059101,
864
  "learning_rate": 4.382455716792291e-05,
865
- "loss": 0.4721,
866
  "step": 119
867
  },
868
  {
869
  "epoch": 0.71,
870
- "grad_norm": 0.13843339239058639,
871
  "learning_rate": 4.219079997751515e-05,
872
  "loss": 0.4954,
873
  "step": 120
874
  },
875
  {
876
  "epoch": 0.72,
877
- "grad_norm": 0.15011169526780793,
878
  "learning_rate": 4.0579897102828966e-05,
879
- "loss": 0.4648,
880
  "step": 121
881
  },
882
  {
883
  "epoch": 0.73,
884
- "grad_norm": 0.13061595453081623,
885
  "learning_rate": 3.899248539894757e-05,
886
- "loss": 0.4801,
887
  "step": 122
888
  },
889
  {
890
  "epoch": 0.73,
891
- "grad_norm": 0.14067787924603412,
892
  "learning_rate": 3.7429192433944014e-05,
893
- "loss": 0.4805,
894
  "step": 123
895
  },
896
  {
897
  "epoch": 0.74,
898
- "grad_norm": 0.13420057703295998,
899
  "learning_rate": 3.589063624077802e-05,
900
- "loss": 0.4446,
901
  "step": 124
902
  },
903
  {
904
  "epoch": 0.74,
905
- "grad_norm": 0.14083737654873127,
906
  "learning_rate": 3.4377425072962465e-05,
907
- "loss": 0.46,
908
  "step": 125
909
  },
910
  {
911
  "epoch": 0.75,
912
- "grad_norm": 0.13231889777376862,
913
  "learning_rate": 3.289015716409631e-05,
914
- "loss": 0.4451,
915
  "step": 126
916
  },
917
  {
918
  "epoch": 0.75,
919
- "eval_loss": 0.9651579260826111,
920
- "eval_runtime": 155.5959,
921
- "eval_samples_per_second": 1.6,
922
- "eval_steps_per_second": 0.803,
923
  "step": 126
924
  },
925
  {
926
  "epoch": 0.75,
927
- "grad_norm": 0.1119782513388846,
928
  "learning_rate": 3.14294204913587e-05,
929
- "loss": 0.4728,
930
  "step": 127
931
  },
932
  {
933
  "epoch": 0.76,
934
- "grad_norm": 0.1258610262132018,
935
  "learning_rate": 2.9995792543057478e-05,
936
- "loss": 0.4793,
937
  "step": 128
938
  },
939
  {
940
  "epoch": 0.77,
941
- "grad_norm": 0.1332956040603169,
942
  "learning_rate": 2.8589840090325027e-05,
943
- "loss": 0.4919,
944
  "step": 129
945
  },
946
  {
947
  "epoch": 0.77,
948
- "grad_norm": 0.13669237836174272,
949
  "learning_rate": 2.7212118963050592e-05,
950
- "loss": 0.4863,
951
  "step": 130
952
  },
953
  {
954
  "epoch": 0.78,
955
- "grad_norm": 0.13718073990096244,
956
  "learning_rate": 2.586317383013821e-05,
957
- "loss": 0.4652,
958
  "step": 131
959
  },
960
  {
961
  "epoch": 0.78,
962
- "grad_norm": 0.1328715725965393,
963
  "learning_rate": 2.4543537984176978e-05,
964
- "loss": 0.4464,
965
  "step": 132
966
  },
967
  {
968
  "epoch": 0.79,
969
- "grad_norm": 0.12474829320490133,
970
  "learning_rate": 2.325373313060919e-05,
971
- "loss": 0.501,
972
  "step": 133
973
  },
974
  {
975
  "epoch": 0.8,
976
- "grad_norm": 0.13832785688421897,
977
  "learning_rate": 2.19942691814788e-05,
978
- "loss": 0.5113,
979
  "step": 134
980
  },
981
  {
982
  "epoch": 0.8,
983
- "grad_norm": 0.13446374612957607,
984
  "learning_rate": 2.076564405384258e-05,
985
- "loss": 0.452,
986
  "step": 135
987
  },
988
  {
989
  "epoch": 0.81,
990
- "grad_norm": 0.15087962227561943,
991
  "learning_rate": 1.9568343472923524e-05,
992
- "loss": 0.5114,
993
  "step": 136
994
  },
995
  {
996
  "epoch": 0.81,
997
- "grad_norm": 0.13471199433579678,
998
  "learning_rate": 1.840284078008393e-05,
999
- "loss": 0.5074,
1000
  "step": 137
1001
  },
1002
  {
1003
  "epoch": 0.82,
1004
- "grad_norm": 0.14471602871304384,
1005
  "learning_rate": 1.7269596745694295e-05,
1006
- "loss": 0.4812,
1007
  "step": 138
1008
  },
1009
  {
1010
  "epoch": 0.83,
1011
- "grad_norm": 0.12791095480012135,
1012
  "learning_rate": 1.616905938697234e-05,
1013
- "loss": 0.439,
1014
  "step": 139
1015
  },
1016
  {
1017
  "epoch": 0.83,
1018
- "grad_norm": 0.13395748535894433,
1019
  "learning_rate": 1.5101663790863596e-05,
1020
- "loss": 0.4942,
1021
  "step": 140
1022
  },
1023
  {
1024
  "epoch": 0.84,
1025
- "grad_norm": 0.1216157843375959,
1026
  "learning_rate": 1.4067831942033904e-05,
1027
- "loss": 0.4904,
1028
  "step": 141
1029
  },
1030
  {
1031
  "epoch": 0.84,
1032
- "grad_norm": 0.12513029490165878,
1033
  "learning_rate": 1.3067972556041752e-05,
1034
- "loss": 0.4583,
1035
  "step": 142
1036
  },
1037
  {
1038
  "epoch": 0.85,
1039
- "grad_norm": 0.11835922329329535,
1040
  "learning_rate": 1.210248091775663e-05,
1041
- "loss": 0.5281,
1042
  "step": 143
1043
  },
1044
  {
1045
  "epoch": 0.86,
1046
- "grad_norm": 0.14513302866752326,
1047
  "learning_rate": 1.1171738725086833e-05,
1048
- "loss": 0.4432,
1049
  "step": 144
1050
  },
1051
  {
1052
  "epoch": 0.86,
1053
- "grad_norm": 0.11934584390756656,
1054
  "learning_rate": 1.0276113938078769e-05,
1055
- "loss": 0.4664,
1056
  "step": 145
1057
  },
1058
  {
1059
  "epoch": 0.87,
1060
- "grad_norm": 0.11746932949614314,
1061
  "learning_rate": 9.415960633447674e-06,
1062
  "loss": 0.4452,
1063
  "step": 146
1064
  },
1065
  {
1066
  "epoch": 0.87,
1067
- "grad_norm": 0.1183312470782195,
1068
  "learning_rate": 8.59161886459654e-06,
1069
- "loss": 0.4858,
1070
  "step": 147
1071
  },
1072
  {
1073
  "epoch": 0.88,
1074
- "grad_norm": 0.1226342260426643,
1075
  "learning_rate": 7.803414527179343e-06,
1076
- "loss": 0.4609,
1077
  "step": 148
1078
  },
1079
  {
1080
  "epoch": 0.89,
1081
- "grad_norm": 0.12849903086517628,
1082
  "learning_rate": 7.051659230261298e-06,
1083
- "loss": 0.4992,
1084
  "step": 149
1085
  },
1086
  {
1087
  "epoch": 0.89,
1088
- "grad_norm": 0.12558798780888547,
1089
  "learning_rate": 6.336650173127223e-06,
1090
- "loss": 0.5055,
1091
  "step": 150
1092
  },
1093
  {
1094
  "epoch": 0.9,
1095
- "grad_norm": 0.11991457404362381,
1096
  "learning_rate": 5.658670027786561e-06,
1097
- "loss": 0.4711,
1098
  "step": 151
1099
  },
1100
  {
1101
  "epoch": 0.9,
1102
- "grad_norm": 0.14486189343828018,
1103
  "learning_rate": 5.017986827221733e-06,
1104
- "loss": 0.4984,
1105
  "step": 152
1106
  },
1107
  {
1108
  "epoch": 0.91,
1109
- "grad_norm": 0.14292272311516058,
1110
  "learning_rate": 4.4148538594239174e-06,
1111
- "loss": 0.518,
1112
  "step": 153
1113
  },
1114
  {
1115
  "epoch": 0.92,
1116
- "grad_norm": 0.12669351883956026,
1117
  "learning_rate": 3.849509567257959e-06,
1118
- "loss": 0.509,
1119
  "step": 154
1120
  },
1121
  {
1122
  "epoch": 0.92,
1123
- "grad_norm": 0.14745027617256945,
1124
  "learning_rate": 3.3221774541962845e-06,
1125
- "loss": 0.4655,
1126
  "step": 155
1127
  },
1128
  {
1129
  "epoch": 0.93,
1130
- "grad_norm": 0.13904171638337365,
1131
  "learning_rate": 2.8330659959589946e-06,
1132
- "loss": 0.445,
1133
  "step": 156
1134
  },
1135
  {
1136
  "epoch": 0.93,
1137
- "grad_norm": 0.12971202836694892,
1138
  "learning_rate": 2.3823685580949273e-06,
1139
- "loss": 0.4534,
1140
  "step": 157
1141
  },
1142
  {
1143
  "epoch": 0.94,
1144
- "grad_norm": 0.13437185949159883,
1145
  "learning_rate": 1.9702633195363917e-06,
1146
- "loss": 0.4785,
1147
  "step": 158
1148
  },
1149
  {
1150
  "epoch": 0.95,
1151
- "grad_norm": 0.12319276879857373,
1152
  "learning_rate": 1.5969132021579347e-06,
1153
- "loss": 0.5011,
1154
  "step": 159
1155
  },
1156
  {
1157
  "epoch": 0.95,
1158
- "grad_norm": 0.12918661435233847,
1159
  "learning_rate": 1.2624658063666639e-06,
1160
- "loss": 0.5236,
1161
  "step": 160
1162
  },
1163
  {
1164
  "epoch": 0.96,
1165
- "grad_norm": 0.12874241011432666,
1166
  "learning_rate": 9.670533527498137e-07,
1167
- "loss": 0.4439,
1168
  "step": 161
1169
  },
1170
  {
1171
  "epoch": 0.96,
1172
- "grad_norm": 0.1315361924610259,
1173
  "learning_rate": 7.10792629802659e-07,
1174
- "loss": 0.4929,
1175
  "step": 162
1176
  },
1177
  {
1178
  "epoch": 0.97,
1179
- "grad_norm": 0.1336404295007477,
1180
  "learning_rate": 4.937849477572587e-07,
1181
- "loss": 0.433,
1182
  "step": 163
1183
  },
1184
  {
1185
  "epoch": 0.97,
1186
- "grad_norm": 0.11865993758025514,
1187
  "learning_rate": 3.161160985304168e-07,
1188
- "loss": 0.4266,
1189
  "step": 164
1190
  },
1191
  {
1192
  "epoch": 0.98,
1193
- "grad_norm": 0.12559739797694514,
1194
  "learning_rate": 1.7785632180670198e-07,
1195
  "loss": 0.436,
1196
  "step": 165
1197
  },
1198
  {
1199
  "epoch": 0.99,
1200
- "grad_norm": 0.12409278477769135,
1201
  "learning_rate": 7.906027726981568e-08,
1202
- "loss": 0.5017,
1203
  "step": 166
1204
  },
1205
  {
1206
  "epoch": 0.99,
1207
- "grad_norm": 0.1398337808177678,
1208
  "learning_rate": 1.976702299344435e-08,
1209
- "loss": 0.5112,
1210
  "step": 167
1211
  },
1212
  {
1213
  "epoch": 1.0,
1214
- "grad_norm": 0.11434132222175684,
1215
  "learning_rate": 0.0,
1216
- "loss": 0.4616,
1217
  "step": 168
1218
  },
1219
  {
1220
  "epoch": 1.0,
1221
- "eval_loss": 0.9651336073875427,
1222
- "eval_runtime": 154.7455,
1223
- "eval_samples_per_second": 1.609,
1224
- "eval_steps_per_second": 0.808,
1225
  "step": 168
1226
  }
1227
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "grad_norm": 0.29289755909939047,
14
  "learning_rate": 2e-05,
15
  "loss": 0.6274,
16
  "step": 1
 
18
  {
19
  "epoch": 0.01,
20
  "eval_loss": 1.0297596454620361,
21
+ "eval_runtime": 153.0715,
22
+ "eval_samples_per_second": 1.627,
23
+ "eval_steps_per_second": 0.817,
24
  "step": 1
25
  },
26
  {
27
  "epoch": 0.01,
28
+ "grad_norm": 0.27956410941469395,
29
  "learning_rate": 4e-05,
30
  "loss": 0.6362,
31
  "step": 2
32
  },
33
  {
34
  "epoch": 0.02,
35
+ "grad_norm": 0.3009590515152092,
36
  "learning_rate": 6e-05,
37
  "loss": 0.6299,
38
  "step": 3
39
  },
40
  {
41
  "epoch": 0.02,
42
+ "grad_norm": 0.34354546270515235,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.6395,
45
  "step": 4
46
  },
47
  {
48
  "epoch": 0.03,
49
+ "grad_norm": 0.2470961998205002,
50
  "learning_rate": 0.0001,
51
+ "loss": 0.6068,
52
  "step": 5
53
  },
54
  {
55
  "epoch": 0.04,
56
+ "grad_norm": 0.2071993792912813,
57
  "learning_rate": 0.00012,
58
+ "loss": 0.5993,
59
  "step": 6
60
  },
61
  {
62
  "epoch": 0.04,
63
+ "grad_norm": 0.20237019487582247,
64
  "learning_rate": 0.00014,
65
+ "loss": 0.6293,
66
  "step": 7
67
  },
68
  {
69
  "epoch": 0.05,
70
+ "grad_norm": 0.13810925455451734,
71
  "learning_rate": 0.00016,
72
+ "loss": 0.5101,
73
  "step": 8
74
  },
75
  {
76
  "epoch": 0.05,
77
+ "grad_norm": 0.22762469698117493,
78
  "learning_rate": 0.00018,
79
+ "loss": 0.5527,
80
  "step": 9
81
  },
82
  {
83
  "epoch": 0.06,
84
+ "grad_norm": 0.15718586910417978,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.5437,
87
  "step": 10
88
  },
89
  {
90
  "epoch": 0.07,
91
+ "grad_norm": 0.13825435646308676,
92
  "learning_rate": 0.00019998023297700658,
93
+ "loss": 0.5858,
94
  "step": 11
95
  },
96
  {
97
  "epoch": 0.07,
98
+ "grad_norm": 0.15072585803075744,
99
  "learning_rate": 0.00019992093972273018,
100
+ "loss": 0.554,
101
  "step": 12
102
  },
103
  {
104
  "epoch": 0.08,
105
+ "grad_norm": 0.13616736806741778,
106
  "learning_rate": 0.00019982214367819328,
107
+ "loss": 0.6199,
108
  "step": 13
109
  },
110
  {
111
  "epoch": 0.08,
112
+ "grad_norm": 0.15841398198140327,
113
  "learning_rate": 0.0001996838839014696,
114
  "loss": 0.5495,
115
  "step": 14
116
  },
117
  {
118
  "epoch": 0.09,
119
+ "grad_norm": 0.16548492090565414,
120
  "learning_rate": 0.00019950621505224273,
121
+ "loss": 0.5043,
122
  "step": 15
123
  },
124
  {
125
  "epoch": 0.1,
126
+ "grad_norm": 0.16965923620151113,
127
  "learning_rate": 0.00019928920737019733,
128
  "loss": 0.5083,
129
  "step": 16
130
  },
131
  {
132
  "epoch": 0.1,
133
+ "grad_norm": 0.1853247768877015,
134
  "learning_rate": 0.0001990329466472502,
135
+ "loss": 0.6318,
136
  "step": 17
137
  },
138
  {
139
  "epoch": 0.11,
140
+ "grad_norm": 0.1824469971767483,
141
  "learning_rate": 0.00019873753419363336,
142
+ "loss": 0.4809,
143
  "step": 18
144
  },
145
  {
146
  "epoch": 0.11,
147
+ "grad_norm": 0.13288488939703025,
148
  "learning_rate": 0.00019840308679784207,
149
+ "loss": 0.4974,
150
  "step": 19
151
  },
152
  {
153
  "epoch": 0.12,
154
+ "grad_norm": 0.18507602579352214,
155
  "learning_rate": 0.00019802973668046363,
156
+ "loss": 0.5288,
157
  "step": 20
158
  },
159
  {
160
  "epoch": 0.12,
161
+ "grad_norm": 0.13591012925799634,
162
  "learning_rate": 0.0001976176314419051,
163
+ "loss": 0.5291,
164
  "step": 21
165
  },
166
  {
167
  "epoch": 0.13,
168
+ "grad_norm": 0.13704233024715257,
169
  "learning_rate": 0.000197166934004041,
170
  "loss": 0.4819,
171
  "step": 22
172
  },
173
  {
174
  "epoch": 0.14,
175
+ "grad_norm": 0.15362008062157553,
176
  "learning_rate": 0.00019667782254580374,
177
+ "loss": 0.5408,
178
  "step": 23
179
  },
180
  {
181
  "epoch": 0.14,
182
+ "grad_norm": 0.11171032733778614,
183
  "learning_rate": 0.00019615049043274205,
184
+ "loss": 0.5101,
185
  "step": 24
186
  },
187
  {
188
  "epoch": 0.15,
189
+ "grad_norm": 0.11449051252849428,
190
  "learning_rate": 0.00019558514614057609,
191
+ "loss": 0.5209,
192
  "step": 25
193
  },
194
  {
195
  "epoch": 0.15,
196
+ "grad_norm": 0.11611407915744347,
197
  "learning_rate": 0.00019498201317277828,
198
+ "loss": 0.5005,
199
  "step": 26
200
  },
201
  {
202
  "epoch": 0.16,
203
+ "grad_norm": 0.12735633641627706,
204
  "learning_rate": 0.00019434132997221345,
205
+ "loss": 0.4741,
206
  "step": 27
207
  },
208
  {
209
  "epoch": 0.17,
210
+ "grad_norm": 0.11871518327376328,
211
  "learning_rate": 0.0001936633498268728,
212
+ "loss": 0.5213,
213
  "step": 28
214
  },
215
  {
216
  "epoch": 0.17,
217
+ "grad_norm": 0.11403376465541806,
218
  "learning_rate": 0.0001929483407697387,
219
+ "loss": 0.4842,
220
  "step": 29
221
  },
222
  {
223
  "epoch": 0.18,
224
+ "grad_norm": 0.11675155934307391,
225
  "learning_rate": 0.00019219658547282067,
226
+ "loss": 0.4825,
227
  "step": 30
228
  },
229
  {
230
  "epoch": 0.18,
231
+ "grad_norm": 0.10789338581384152,
232
  "learning_rate": 0.00019140838113540346,
233
+ "loss": 0.4866,
234
  "step": 31
235
  },
236
  {
237
  "epoch": 0.19,
238
+ "grad_norm": 0.14336837303756964,
239
  "learning_rate": 0.00019058403936655233,
240
+ "loss": 0.5325,
241
  "step": 32
242
  },
243
  {
244
  "epoch": 0.2,
245
+ "grad_norm": 0.10401694793599091,
246
  "learning_rate": 0.00018972388606192125,
247
+ "loss": 0.4292,
248
  "step": 33
249
  },
250
  {
251
  "epoch": 0.2,
252
+ "grad_norm": 0.10614245315138679,
253
  "learning_rate": 0.0001888282612749132,
254
+ "loss": 0.4638,
255
  "step": 34
256
  },
257
  {
258
  "epoch": 0.21,
259
+ "grad_norm": 0.1250509143492961,
260
  "learning_rate": 0.00018789751908224338,
261
+ "loss": 0.4963,
262
  "step": 35
263
  },
264
  {
265
  "epoch": 0.21,
266
+ "grad_norm": 0.12668831423294083,
267
  "learning_rate": 0.00018693202744395827,
268
+ "loss": 0.5043,
269
  "step": 36
270
  },
271
  {
272
  "epoch": 0.22,
273
+ "grad_norm": 0.11594924793976216,
274
  "learning_rate": 0.00018593216805796612,
275
  "loss": 0.5396,
276
  "step": 37
277
  },
278
  {
279
  "epoch": 0.23,
280
+ "grad_norm": 0.12365894749489846,
281
  "learning_rate": 0.00018489833620913642,
282
+ "loss": 0.4899,
283
  "step": 38
284
  },
285
  {
286
  "epoch": 0.23,
287
+ "grad_norm": 0.13164318970805183,
288
  "learning_rate": 0.00018383094061302766,
289
+ "loss": 0.5065,
290
  "step": 39
291
  },
292
  {
293
  "epoch": 0.24,
294
+ "grad_norm": 0.10587870431925453,
295
  "learning_rate": 0.00018273040325430574,
296
+ "loss": 0.4805,
297
  "step": 40
298
  },
299
  {
300
  "epoch": 0.24,
301
+ "grad_norm": 0.12134503625595237,
302
  "learning_rate": 0.00018159715921991612,
303
+ "loss": 0.5103,
304
  "step": 41
305
  },
306
  {
307
  "epoch": 0.25,
308
+ "grad_norm": 0.10855490758268896,
309
  "learning_rate": 0.00018043165652707649,
310
+ "loss": 0.44,
311
  "step": 42
312
  },
313
  {
314
  "epoch": 0.25,
315
+ "eval_loss": 0.9770342707633972,
316
+ "eval_runtime": 154.3933,
317
+ "eval_samples_per_second": 1.613,
318
+ "eval_steps_per_second": 0.81,
319
  "step": 42
320
  },
321
  {
322
  "epoch": 0.26,
323
+ "grad_norm": 0.11753509152196469,
324
  "learning_rate": 0.00017923435594615744,
325
+ "loss": 0.4819,
326
  "step": 43
327
  },
328
  {
329
  "epoch": 0.26,
330
+ "grad_norm": 0.13774603727779988,
331
  "learning_rate": 0.00017800573081852122,
332
+ "loss": 0.5451,
333
  "step": 44
334
  },
335
  {
336
  "epoch": 0.27,
337
+ "grad_norm": 0.11994636996852912,
338
  "learning_rate": 0.0001767462668693908,
339
+ "loss": 0.5079,
340
  "step": 45
341
  },
342
  {
343
  "epoch": 0.27,
344
+ "grad_norm": 0.11803018063108017,
345
  "learning_rate": 0.00017545646201582303,
346
+ "loss": 0.5183,
347
  "step": 46
348
  },
349
  {
350
  "epoch": 0.28,
351
+ "grad_norm": 0.12122026879022209,
352
  "learning_rate": 0.00017413682616986185,
353
+ "loss": 0.4692,
354
  "step": 47
355
  },
356
  {
357
  "epoch": 0.29,
358
+ "grad_norm": 0.12840154129375927,
359
  "learning_rate": 0.00017278788103694943,
360
+ "loss": 0.4538,
361
  "step": 48
362
  },
363
  {
364
  "epoch": 0.29,
365
+ "grad_norm": 0.10754191454075242,
366
  "learning_rate": 0.000171410159909675,
367
+ "loss": 0.4745,
368
  "step": 49
369
  },
370
  {
371
  "epoch": 0.3,
372
+ "grad_norm": 0.10980392972154758,
373
  "learning_rate": 0.00017000420745694254,
374
+ "loss": 0.5077,
375
  "step": 50
376
  },
377
  {
378
  "epoch": 0.3,
379
+ "grad_norm": 0.1018314102740997,
380
  "learning_rate": 0.00016857057950864132,
381
+ "loss": 0.5077,
382
  "step": 51
383
  },
384
  {
385
  "epoch": 0.31,
386
+ "grad_norm": 0.10572512690181787,
387
  "learning_rate": 0.0001671098428359037,
388
+ "loss": 0.4637,
389
  "step": 52
390
  },
391
  {
392
  "epoch": 0.32,
393
+ "grad_norm": 0.13106813432864392,
394
  "learning_rate": 0.00016562257492703757,
395
+ "loss": 0.4718,
396
  "step": 53
397
  },
398
  {
399
  "epoch": 0.32,
400
+ "grad_norm": 0.11024484284605006,
401
  "learning_rate": 0.000164109363759222,
402
+ "loss": 0.5115,
403
  "step": 54
404
  },
405
  {
406
  "epoch": 0.33,
407
+ "grad_norm": 0.12388815222110366,
408
  "learning_rate": 0.000162570807566056,
409
+ "loss": 0.4756,
410
  "step": 55
411
  },
412
  {
413
  "epoch": 0.33,
414
+ "grad_norm": 0.12702188830349206,
415
  "learning_rate": 0.00016100751460105243,
416
+ "loss": 0.4881,
417
  "step": 56
418
  },
419
  {
420
  "epoch": 0.34,
421
+ "grad_norm": 0.11086308659932648,
422
  "learning_rate": 0.00015942010289717105,
423
+ "loss": 0.4701,
424
  "step": 57
425
  },
426
  {
427
  "epoch": 0.34,
428
+ "grad_norm": 0.11517890306226485,
429
  "learning_rate": 0.00015780920002248484,
430
+ "loss": 0.4835,
431
  "step": 58
432
  },
433
  {
434
  "epoch": 0.35,
435
+ "grad_norm": 0.1253541013817757,
436
  "learning_rate": 0.0001561754428320771,
437
+ "loss": 0.5147,
438
  "step": 59
439
  },
440
  {
441
  "epoch": 0.36,
442
+ "grad_norm": 0.11139862438389284,
443
  "learning_rate": 0.00015451947721626676,
444
+ "loss": 0.4552,
445
  "step": 60
446
  },
447
  {
448
  "epoch": 0.36,
449
+ "grad_norm": 0.1152466912880715,
450
  "learning_rate": 0.00015284195784526195,
451
+ "loss": 0.5053,
452
  "step": 61
453
  },
454
  {
455
  "epoch": 0.37,
456
+ "grad_norm": 0.11719477948785388,
457
  "learning_rate": 0.00015114354791034225,
458
+ "loss": 0.5079,
459
  "step": 62
460
  },
461
  {
462
  "epoch": 0.37,
463
+ "grad_norm": 0.12730957754598396,
464
  "learning_rate": 0.0001494249188616723,
465
+ "loss": 0.58,
466
  "step": 63
467
  },
468
  {
469
  "epoch": 0.38,
470
+ "grad_norm": 0.11907511614241814,
471
  "learning_rate": 0.00014768675014285062,
472
+ "loss": 0.4579,
473
  "step": 64
474
  },
475
  {
476
  "epoch": 0.39,
477
+ "grad_norm": 0.13613222195950112,
478
  "learning_rate": 0.00014592972892229778,
479
+ "loss": 0.4965,
480
  "step": 65
481
  },
482
  {
483
  "epoch": 0.39,
484
+ "grad_norm": 0.11610538032286319,
485
  "learning_rate": 0.0001441545498215912,
486
+ "loss": 0.4467,
487
  "step": 66
488
  },
489
  {
490
  "epoch": 0.4,
491
+ "grad_norm": 0.1046724680527729,
492
  "learning_rate": 0.00014236191464085286,
493
+ "loss": 0.4463,
494
  "step": 67
495
  },
496
  {
497
  "epoch": 0.4,
498
+ "grad_norm": 0.11631551012538931,
499
  "learning_rate": 0.00014055253208129938,
500
+ "loss": 0.5304,
501
  "step": 68
502
  },
503
  {
504
  "epoch": 0.41,
505
+ "grad_norm": 0.11792788666231199,
506
  "learning_rate": 0.00013872711746506413,
507
+ "loss": 0.453,
508
  "step": 69
509
  },
510
  {
511
  "epoch": 0.42,
512
+ "grad_norm": 0.11566019183159004,
513
  "learning_rate": 0.00013688639245240078,
514
+ "loss": 0.5192,
515
  "step": 70
516
  },
517
  {
518
  "epoch": 0.42,
519
+ "grad_norm": 0.12967661214418025,
520
  "learning_rate": 0.00013503108475638244,
521
+ "loss": 0.5625,
522
  "step": 71
523
  },
524
  {
525
  "epoch": 0.43,
526
+ "grad_norm": 0.10928605499181634,
527
  "learning_rate": 0.0001331619278552068,
528
+ "loss": 0.4861,
529
  "step": 72
530
  },
531
  {
532
  "epoch": 0.43,
533
+ "grad_norm": 0.12485741141890881,
534
  "learning_rate": 0.00013127966070222274,
535
+ "loss": 0.4782,
536
  "step": 73
537
  },
538
  {
539
  "epoch": 0.44,
540
+ "grad_norm": 0.1271574869759,
541
  "learning_rate": 0.00012938502743379212,
542
+ "loss": 0.4819,
543
  "step": 74
544
  },
545
  {
546
  "epoch": 0.45,
547
+ "grad_norm": 0.14528941719728583,
548
  "learning_rate": 0.00012747877707510252,
549
+ "loss": 0.5132,
550
  "step": 75
551
  },
552
  {
553
  "epoch": 0.45,
554
+ "grad_norm": 0.12760881412243183,
555
  "learning_rate": 0.0001255616632440475,
556
+ "loss": 0.4817,
557
  "step": 76
558
  },
559
  {
560
  "epoch": 0.46,
561
+ "grad_norm": 0.1239773484446177,
562
  "learning_rate": 0.0001236344438532905,
563
+ "loss": 0.5045,
564
  "step": 77
565
  },
566
  {
567
  "epoch": 0.46,
568
+ "grad_norm": 0.1307497276864945,
569
  "learning_rate": 0.0001216978808106318,
570
  "loss": 0.5091,
571
  "step": 78
572
  },
573
  {
574
  "epoch": 0.47,
575
+ "grad_norm": 0.1355749101256534,
576
  "learning_rate": 0.00011975273971779528,
577
+ "loss": 0.5145,
578
  "step": 79
579
  },
580
  {
581
  "epoch": 0.48,
582
+ "grad_norm": 0.12394948316350356,
583
  "learning_rate": 0.00011779978956775506,
584
+ "loss": 0.5057,
585
  "step": 80
586
  },
587
  {
588
  "epoch": 0.48,
589
+ "grad_norm": 0.13424789799433426,
590
  "learning_rate": 0.0001158398024407215,
591
+ "loss": 0.5058,
592
  "step": 81
593
  },
594
  {
595
  "epoch": 0.49,
596
+ "grad_norm": 0.13054427599188898,
597
  "learning_rate": 0.00011387355319890685,
598
+ "loss": 0.4683,
599
  "step": 82
600
  },
601
  {
602
  "epoch": 0.49,
603
+ "grad_norm": 0.11996704114604598,
604
  "learning_rate": 0.00011190181918019049,
605
+ "loss": 0.4748,
606
  "step": 83
607
  },
608
  {
609
  "epoch": 0.5,
610
+ "grad_norm": 0.13071208464837492,
611
  "learning_rate": 0.00010992537989080618,
612
+ "loss": 0.4406,
613
  "step": 84
614
  },
615
  {
616
  "epoch": 0.5,
617
+ "eval_loss": 0.9652944803237915,
618
+ "eval_runtime": 153.5377,
619
+ "eval_samples_per_second": 1.622,
620
+ "eval_steps_per_second": 0.814,
621
  "step": 84
622
  },
623
  {
624
  "epoch": 0.51,
625
+ "grad_norm": 0.11393059638442392,
626
  "learning_rate": 0.00010794501669717145,
627
+ "loss": 0.4877,
628
  "step": 85
629
  },
630
  {
631
  "epoch": 0.51,
632
+ "grad_norm": 0.11364636282385192,
633
  "learning_rate": 0.00010596151251698199,
634
+ "loss": 0.4597,
635
  "step": 86
636
  },
637
  {
638
  "epoch": 0.52,
639
+ "grad_norm": 0.12548992883216656,
640
  "learning_rate": 0.0001039756515096926,
641
+ "loss": 0.4685,
642
  "step": 87
643
  },
644
  {
645
  "epoch": 0.52,
646
+ "grad_norm": 0.12825123946992636,
647
  "learning_rate": 0.00010198821876650701,
648
+ "loss": 0.4924,
649
  "step": 88
650
  },
651
  {
652
  "epoch": 0.53,
653
+ "grad_norm": 0.12264617890724591,
654
  "learning_rate": 0.0001,
655
+ "loss": 0.4678,
656
  "step": 89
657
  },
658
  {
659
  "epoch": 0.53,
660
+ "grad_norm": 0.1238990144553216,
661
  "learning_rate": 9.801178123349298e-05,
662
+ "loss": 0.4854,
663
  "step": 90
664
  },
665
  {
666
  "epoch": 0.54,
667
+ "grad_norm": 0.13533454171482565,
668
  "learning_rate": 9.602434849030745e-05,
669
+ "loss": 0.4784,
670
  "step": 91
671
  },
672
  {
673
  "epoch": 0.55,
674
+ "grad_norm": 0.13623990945679543,
675
  "learning_rate": 9.403848748301802e-05,
676
+ "loss": 0.5322,
677
  "step": 92
678
  },
679
  {
680
  "epoch": 0.55,
681
+ "grad_norm": 0.16411366119133766,
682
  "learning_rate": 9.205498330282856e-05,
683
+ "loss": 0.5258,
684
  "step": 93
685
  },
686
  {
687
  "epoch": 0.56,
688
+ "grad_norm": 0.11646811294381437,
689
  "learning_rate": 9.007462010919386e-05,
690
+ "loss": 0.4599,
691
  "step": 94
692
  },
693
  {
694
  "epoch": 0.56,
695
+ "grad_norm": 0.12610649830308363,
696
  "learning_rate": 8.809818081980953e-05,
697
+ "loss": 0.4891,
698
  "step": 95
699
  },
700
  {
701
  "epoch": 0.57,
702
+ "grad_norm": 0.12324596375061997,
703
  "learning_rate": 8.612644680109319e-05,
704
+ "loss": 0.4771,
705
  "step": 96
706
  },
707
  {
708
  "epoch": 0.58,
709
+ "grad_norm": 0.12390720748290898,
710
  "learning_rate": 8.416019755927851e-05,
711
+ "loss": 0.4814,
712
  "step": 97
713
  },
714
  {
715
  "epoch": 0.58,
716
+ "grad_norm": 0.11156155557793686,
717
  "learning_rate": 8.2200210432245e-05,
718
+ "loss": 0.5041,
719
  "step": 98
720
  },
721
  {
722
  "epoch": 0.59,
723
+ "grad_norm": 0.11235071151397323,
724
  "learning_rate": 8.024726028220474e-05,
725
+ "loss": 0.5023,
726
  "step": 99
727
  },
728
  {
729
  "epoch": 0.59,
730
+ "grad_norm": 0.12382934807374943,
731
  "learning_rate": 7.83021191893682e-05,
732
+ "loss": 0.4917,
733
  "step": 100
734
  },
735
  {
736
  "epoch": 0.6,
737
+ "grad_norm": 0.13051565063971357,
738
  "learning_rate": 7.636555614670953e-05,
739
+ "loss": 0.4567,
740
  "step": 101
741
  },
742
  {
743
  "epoch": 0.61,
744
+ "grad_norm": 0.11975246976802223,
745
  "learning_rate": 7.443833675595255e-05,
746
+ "loss": 0.4584,
747
  "step": 102
748
  },
749
  {
750
  "epoch": 0.61,
751
+ "grad_norm": 0.1390325061190883,
752
  "learning_rate": 7.252122292489747e-05,
753
+ "loss": 0.4853,
754
  "step": 103
755
  },
756
  {
757
  "epoch": 0.62,
758
+ "grad_norm": 0.13092284300351015,
759
  "learning_rate": 7.061497256620793e-05,
760
+ "loss": 0.4623,
761
  "step": 104
762
  },
763
  {
764
  "epoch": 0.62,
765
+ "grad_norm": 0.1291139691922856,
766
  "learning_rate": 6.87203392977773e-05,
767
+ "loss": 0.4826,
768
  "step": 105
769
  },
770
  {
771
  "epoch": 0.63,
772
+ "grad_norm": 0.12630345566649634,
773
  "learning_rate": 6.683807214479323e-05,
774
+ "loss": 0.4597,
775
  "step": 106
776
  },
777
  {
778
  "epoch": 0.64,
779
+ "grad_norm": 0.13061642512554117,
780
  "learning_rate": 6.496891524361757e-05,
781
+ "loss": 0.4415,
782
  "step": 107
783
  },
784
  {
785
  "epoch": 0.64,
786
+ "grad_norm": 0.11414947835097713,
787
  "learning_rate": 6.311360754759923e-05,
788
+ "loss": 0.4011,
789
  "step": 108
790
  },
791
  {
792
  "epoch": 0.65,
793
+ "grad_norm": 0.11453014913188615,
794
  "learning_rate": 6.127288253493591e-05,
795
+ "loss": 0.5118,
796
  "step": 109
797
  },
798
  {
799
  "epoch": 0.65,
800
+ "grad_norm": 0.15000233747727326,
801
  "learning_rate": 5.9447467918700614e-05,
802
+ "loss": 0.482,
803
  "step": 110
804
  },
805
  {
806
  "epoch": 0.66,
807
+ "grad_norm": 0.14402898946913723,
808
  "learning_rate": 5.763808535914723e-05,
809
+ "loss": 0.4881,
810
  "step": 111
811
  },
812
  {
813
  "epoch": 0.67,
814
+ "grad_norm": 0.1371953423024685,
815
  "learning_rate": 5.584545017840885e-05,
816
+ "loss": 0.5178,
817
  "step": 112
818
  },
819
  {
820
  "epoch": 0.67,
821
+ "grad_norm": 0.14105581149485616,
822
  "learning_rate": 5.407027107770219e-05,
823
+ "loss": 0.5584,
824
  "step": 113
825
  },
826
  {
827
  "epoch": 0.68,
828
+ "grad_norm": 0.13646035299068404,
829
  "learning_rate": 5.2313249857149414e-05,
830
+ "loss": 0.4535,
831
  "step": 114
832
  },
833
  {
834
  "epoch": 0.68,
835
+ "grad_norm": 0.14741390311838884,
836
  "learning_rate": 5.0575081138327715e-05,
837
+ "loss": 0.5,
838
  "step": 115
839
  },
840
  {
841
  "epoch": 0.69,
842
+ "grad_norm": 0.12313483515418376,
843
  "learning_rate": 4.885645208965779e-05,
844
+ "loss": 0.4982,
845
  "step": 116
846
  },
847
  {
848
  "epoch": 0.7,
849
+ "grad_norm": 0.13192008123654636,
850
  "learning_rate": 4.715804215473809e-05,
851
+ "loss": 0.4698,
852
  "step": 117
853
  },
854
  {
855
  "epoch": 0.7,
856
+ "grad_norm": 0.11197839688620534,
857
  "learning_rate": 4.548052278373327e-05,
858
+ "loss": 0.4737,
859
  "step": 118
860
  },
861
  {
862
  "epoch": 0.71,
863
+ "grad_norm": 0.12574619847347493,
864
  "learning_rate": 4.382455716792291e-05,
865
+ "loss": 0.472,
866
  "step": 119
867
  },
868
  {
869
  "epoch": 0.71,
870
+ "grad_norm": 0.11785384512597769,
871
  "learning_rate": 4.219079997751515e-05,
872
  "loss": 0.4954,
873
  "step": 120
874
  },
875
  {
876
  "epoch": 0.72,
877
+ "grad_norm": 0.1371137498771671,
878
  "learning_rate": 4.0579897102828966e-05,
879
+ "loss": 0.4645,
880
  "step": 121
881
  },
882
  {
883
  "epoch": 0.73,
884
+ "grad_norm": 0.1562115085397725,
885
  "learning_rate": 3.899248539894757e-05,
886
+ "loss": 0.4798,
887
  "step": 122
888
  },
889
  {
890
  "epoch": 0.73,
891
+ "grad_norm": 0.14251919403809987,
892
  "learning_rate": 3.7429192433944014e-05,
893
+ "loss": 0.4794,
894
  "step": 123
895
  },
896
  {
897
  "epoch": 0.74,
898
+ "grad_norm": 0.13567896269328303,
899
  "learning_rate": 3.589063624077802e-05,
900
+ "loss": 0.4441,
901
  "step": 124
902
  },
903
  {
904
  "epoch": 0.74,
905
+ "grad_norm": 0.14154096161651117,
906
  "learning_rate": 3.4377425072962465e-05,
907
+ "loss": 0.4583,
908
  "step": 125
909
  },
910
  {
911
  "epoch": 0.75,
912
+ "grad_norm": 0.12685010104316322,
913
  "learning_rate": 3.289015716409631e-05,
914
+ "loss": 0.4445,
915
  "step": 126
916
  },
917
  {
918
  "epoch": 0.75,
919
+ "eval_loss": 0.9644750952720642,
920
+ "eval_runtime": 155.1633,
921
+ "eval_samples_per_second": 1.605,
922
+ "eval_steps_per_second": 0.806,
923
  "step": 126
924
  },
925
  {
926
  "epoch": 0.75,
927
+ "grad_norm": 0.127123263865781,
928
  "learning_rate": 3.14294204913587e-05,
929
+ "loss": 0.4723,
930
  "step": 127
931
  },
932
  {
933
  "epoch": 0.76,
934
+ "grad_norm": 0.1220769041189061,
935
  "learning_rate": 2.9995792543057478e-05,
936
+ "loss": 0.478,
937
  "step": 128
938
  },
939
  {
940
  "epoch": 0.77,
941
+ "grad_norm": 0.11180771873074949,
942
  "learning_rate": 2.8589840090325027e-05,
943
+ "loss": 0.4909,
944
  "step": 129
945
  },
946
  {
947
  "epoch": 0.77,
948
+ "grad_norm": 0.13041544813383119,
949
  "learning_rate": 2.7212118963050592e-05,
950
+ "loss": 0.4866,
951
  "step": 130
952
  },
953
  {
954
  "epoch": 0.78,
955
+ "grad_norm": 0.1405254912635375,
956
  "learning_rate": 2.586317383013821e-05,
957
+ "loss": 0.4644,
958
  "step": 131
959
  },
960
  {
961
  "epoch": 0.78,
962
+ "grad_norm": 0.1344866421919383,
963
  "learning_rate": 2.4543537984176978e-05,
964
+ "loss": 0.4456,
965
  "step": 132
966
  },
967
  {
968
  "epoch": 0.79,
969
+ "grad_norm": 0.11186073290835527,
970
  "learning_rate": 2.325373313060919e-05,
971
+ "loss": 0.4994,
972
  "step": 133
973
  },
974
  {
975
  "epoch": 0.8,
976
+ "grad_norm": 0.11996543804410408,
977
  "learning_rate": 2.19942691814788e-05,
978
+ "loss": 0.5109,
979
  "step": 134
980
  },
981
  {
982
  "epoch": 0.8,
983
+ "grad_norm": 0.1213151845276305,
984
  "learning_rate": 2.076564405384258e-05,
985
+ "loss": 0.4519,
986
  "step": 135
987
  },
988
  {
989
  "epoch": 0.81,
990
+ "grad_norm": 0.13448384445509487,
991
  "learning_rate": 1.9568343472923524e-05,
992
+ "loss": 0.5102,
993
  "step": 136
994
  },
995
  {
996
  "epoch": 0.81,
997
+ "grad_norm": 0.12446550745886309,
998
  "learning_rate": 1.840284078008393e-05,
999
+ "loss": 0.5064,
1000
  "step": 137
1001
  },
1002
  {
1003
  "epoch": 0.82,
1004
+ "grad_norm": 0.14018404224721664,
1005
  "learning_rate": 1.7269596745694295e-05,
1006
+ "loss": 0.4805,
1007
  "step": 138
1008
  },
1009
  {
1010
  "epoch": 0.83,
1011
+ "grad_norm": 0.1218286028602253,
1012
  "learning_rate": 1.616905938697234e-05,
1013
+ "loss": 0.4388,
1014
  "step": 139
1015
  },
1016
  {
1017
  "epoch": 0.83,
1018
+ "grad_norm": 0.1293899895212843,
1019
  "learning_rate": 1.5101663790863596e-05,
1020
+ "loss": 0.4935,
1021
  "step": 140
1022
  },
1023
  {
1024
  "epoch": 0.84,
1025
+ "grad_norm": 0.1338746361041889,
1026
  "learning_rate": 1.4067831942033904e-05,
1027
+ "loss": 0.4892,
1028
  "step": 141
1029
  },
1030
  {
1031
  "epoch": 0.84,
1032
+ "grad_norm": 0.12837018186336283,
1033
  "learning_rate": 1.3067972556041752e-05,
1034
+ "loss": 0.4566,
1035
  "step": 142
1036
  },
1037
  {
1038
  "epoch": 0.85,
1039
+ "grad_norm": 0.1359601843202103,
1040
  "learning_rate": 1.210248091775663e-05,
1041
+ "loss": 0.5277,
1042
  "step": 143
1043
  },
1044
  {
1045
  "epoch": 0.86,
1046
+ "grad_norm": 0.14494892289523706,
1047
  "learning_rate": 1.1171738725086833e-05,
1048
+ "loss": 0.4419,
1049
  "step": 144
1050
  },
1051
  {
1052
  "epoch": 0.86,
1053
+ "grad_norm": 0.13277965728610747,
1054
  "learning_rate": 1.0276113938078769e-05,
1055
+ "loss": 0.4662,
1056
  "step": 145
1057
  },
1058
  {
1059
  "epoch": 0.87,
1060
+ "grad_norm": 0.12880953859990588,
1061
  "learning_rate": 9.415960633447674e-06,
1062
  "loss": 0.4452,
1063
  "step": 146
1064
  },
1065
  {
1066
  "epoch": 0.87,
1067
+ "grad_norm": 0.13140413424345554,
1068
  "learning_rate": 8.59161886459654e-06,
1069
+ "loss": 0.485,
1070
  "step": 147
1071
  },
1072
  {
1073
  "epoch": 0.88,
1074
+ "grad_norm": 0.1319983788489183,
1075
  "learning_rate": 7.803414527179343e-06,
1076
+ "loss": 0.4601,
1077
  "step": 148
1078
  },
1079
  {
1080
  "epoch": 0.89,
1081
+ "grad_norm": 0.12708833536742306,
1082
  "learning_rate": 7.051659230261298e-06,
1083
+ "loss": 0.498,
1084
  "step": 149
1085
  },
1086
  {
1087
  "epoch": 0.89,
1088
+ "grad_norm": 0.13606238849102634,
1089
  "learning_rate": 6.336650173127223e-06,
1090
+ "loss": 0.504,
1091
  "step": 150
1092
  },
1093
  {
1094
  "epoch": 0.9,
1095
+ "grad_norm": 0.1295595089005434,
1096
  "learning_rate": 5.658670027786561e-06,
1097
+ "loss": 0.4704,
1098
  "step": 151
1099
  },
1100
  {
1101
  "epoch": 0.9,
1102
+ "grad_norm": 0.1426317101207339,
1103
  "learning_rate": 5.017986827221733e-06,
1104
+ "loss": 0.4979,
1105
  "step": 152
1106
  },
1107
  {
1108
  "epoch": 0.91,
1109
+ "grad_norm": 0.14306220741750347,
1110
  "learning_rate": 4.4148538594239174e-06,
1111
+ "loss": 0.5164,
1112
  "step": 153
1113
  },
1114
  {
1115
  "epoch": 0.92,
1116
+ "grad_norm": 0.11966060296362775,
1117
  "learning_rate": 3.849509567257959e-06,
1118
+ "loss": 0.508,
1119
  "step": 154
1120
  },
1121
  {
1122
  "epoch": 0.92,
1123
+ "grad_norm": 0.13410648920269255,
1124
  "learning_rate": 3.3221774541962845e-06,
1125
+ "loss": 0.4647,
1126
  "step": 155
1127
  },
1128
  {
1129
  "epoch": 0.93,
1130
+ "grad_norm": 0.14279456888939884,
1131
  "learning_rate": 2.8330659959589946e-06,
1132
+ "loss": 0.4446,
1133
  "step": 156
1134
  },
1135
  {
1136
  "epoch": 0.93,
1137
+ "grad_norm": 0.12867690404851015,
1138
  "learning_rate": 2.3823685580949273e-06,
1139
+ "loss": 0.453,
1140
  "step": 157
1141
  },
1142
  {
1143
  "epoch": 0.94,
1144
+ "grad_norm": 0.12934543469906262,
1145
  "learning_rate": 1.9702633195363917e-06,
1146
+ "loss": 0.4782,
1147
  "step": 158
1148
  },
1149
  {
1150
  "epoch": 0.95,
1151
+ "grad_norm": 0.12938497975922164,
1152
  "learning_rate": 1.5969132021579347e-06,
1153
+ "loss": 0.5004,
1154
  "step": 159
1155
  },
1156
  {
1157
  "epoch": 0.95,
1158
+ "grad_norm": 0.12309207540187947,
1159
  "learning_rate": 1.2624658063666639e-06,
1160
+ "loss": 0.5235,
1161
  "step": 160
1162
  },
1163
  {
1164
  "epoch": 0.96,
1165
+ "grad_norm": 0.11548916080113472,
1166
  "learning_rate": 9.670533527498137e-07,
1167
+ "loss": 0.4436,
1168
  "step": 161
1169
  },
1170
  {
1171
  "epoch": 0.96,
1172
+ "grad_norm": 0.12421128547481033,
1173
  "learning_rate": 7.10792629802659e-07,
1174
+ "loss": 0.493,
1175
  "step": 162
1176
  },
1177
  {
1178
  "epoch": 0.97,
1179
+ "grad_norm": 0.12845932182275463,
1180
  "learning_rate": 4.937849477572587e-07,
1181
+ "loss": 0.4322,
1182
  "step": 163
1183
  },
1184
  {
1185
  "epoch": 0.97,
1186
+ "grad_norm": 0.12044022703621939,
1187
  "learning_rate": 3.161160985304168e-07,
1188
+ "loss": 0.4261,
1189
  "step": 164
1190
  },
1191
  {
1192
  "epoch": 0.98,
1193
+ "grad_norm": 0.12854436448690823,
1194
  "learning_rate": 1.7785632180670198e-07,
1195
  "loss": 0.436,
1196
  "step": 165
1197
  },
1198
  {
1199
  "epoch": 0.99,
1200
+ "grad_norm": 0.12592400334168094,
1201
  "learning_rate": 7.906027726981568e-08,
1202
+ "loss": 0.5008,
1203
  "step": 166
1204
  },
1205
  {
1206
  "epoch": 0.99,
1207
+ "grad_norm": 0.13604136642444342,
1208
  "learning_rate": 1.976702299344435e-08,
1209
+ "loss": 0.5117,
1210
  "step": 167
1211
  },
1212
  {
1213
  "epoch": 1.0,
1214
+ "grad_norm": 0.12600479253243235,
1215
  "learning_rate": 0.0,
1216
+ "loss": 0.4609,
1217
  "step": 168
1218
  },
1219
  {
1220
  "epoch": 1.0,
1221
+ "eval_loss": 0.9640664458274841,
1222
+ "eval_runtime": 154.1049,
1223
+ "eval_samples_per_second": 1.616,
1224
+ "eval_steps_per_second": 0.811,
1225
  "step": 168
1226
  }
1227
  ],
checkpoint-168/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbfaffe231fe183758cc2353296944fef75710807fa8ea7663978d9e906d00f7
3
  size 6968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3baf97ea9bcc6b73a05f59bedb019ebf1656742a3d0668caac6a61c4da440046
3
  size 6968
config.json CHANGED
@@ -37,13 +37,28 @@
37
  "num_key_value_heads": 8,
38
  "output_router_logits": false,
39
  "pad_token_id": 0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  "rms_norm_eps": 1e-06,
41
  "router_aux_loss_coef": 0.001,
42
  "sliding_window": null,
43
  "tie_word_embeddings": false,
44
- "torch_dtype": "float16",
45
  "transformers_version": "4.40.0.dev0",
46
- "use_cache": true,
47
  "use_mamba_kernels": true,
48
  "vocab_size": 65536
49
  }
 
37
  "num_key_value_heads": 8,
38
  "output_router_logits": false,
39
  "pad_token_id": 0,
40
+ "quantization_config": {
41
+ "_load_in_4bit": true,
42
+ "_load_in_8bit": false,
43
+ "bnb_4bit_compute_dtype": "bfloat16",
44
+ "bnb_4bit_quant_storage": "bfloat16",
45
+ "bnb_4bit_quant_type": "nf4",
46
+ "bnb_4bit_use_double_quant": true,
47
+ "llm_int8_enable_fp32_cpu_offload": false,
48
+ "llm_int8_has_fp16_weight": false,
49
+ "llm_int8_skip_modules": null,
50
+ "llm_int8_threshold": 6.0,
51
+ "load_in_4bit": true,
52
+ "load_in_8bit": false,
53
+ "quant_method": "bitsandbytes"
54
+ },
55
  "rms_norm_eps": 1e-06,
56
  "router_aux_loss_coef": 0.001,
57
  "sliding_window": null,
58
  "tie_word_embeddings": false,
59
+ "torch_dtype": "bfloat16",
60
  "transformers_version": "4.40.0.dev0",
61
+ "use_cache": false,
62
  "use_mamba_kernels": true,
63
  "vocab_size": 65536
64
  }
tokenizer_config.json CHANGED
@@ -36,6 +36,7 @@
36
  }
37
  },
38
  "bos_token": "<|startoftext|>",
 
39
  "clean_up_tokenization_spaces": false,
40
  "eos_token": "<|endoftext|>",
41
  "model_max_length": 1000000000000000019884624838656,
 
36
  }
37
  },
38
  "bos_token": "<|startoftext|>",
39
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
40
  "clean_up_tokenization_spaces": false,
41
  "eos_token": "<|endoftext|>",
42
  "model_max_length": 1000000000000000019884624838656,