terry69 commited on
Commit
e1a89f2
1 Parent(s): 66e07e7

Model save

Browse files
README.md CHANGED
@@ -4,7 +4,6 @@ library_name: peft
4
  tags:
5
  - trl
6
  - sft
7
- - alignment-handbook
8
  - generated_from_trainer
9
  base_model: mistralai/Mistral-7B-v0.1
10
  model-index:
@@ -56,7 +55,7 @@ The following hyperparameters were used during training:
56
 
57
  | Training Loss | Epoch | Step | Validation Loss |
58
  |:-------------:|:-----:|:----:|:---------------:|
59
- | 0.749 | 1.0 | 325 | 0.7774 |
60
 
61
 
62
  ### Framework versions
 
4
  tags:
5
  - trl
6
  - sft
 
7
  - generated_from_trainer
8
  base_model: mistralai/Mistral-7B-v0.1
9
  model-index:
 
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.7491 | 1.0 | 325 | 0.7774 |
59
 
60
 
61
  ### Framework versions
adapter_config.json CHANGED
@@ -19,13 +19,13 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "v_proj",
23
- "q_proj",
24
- "down_proj",
25
  "gate_proj",
26
  "k_proj",
27
- "up_proj",
28
- "o_proj"
 
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
+ "up_proj",
23
+ "o_proj",
 
24
  "gate_proj",
25
  "k_proj",
26
+ "down_proj",
27
+ "q_proj",
28
+ "v_proj"
29
  ],
30
  "task_type": "CAUSAL_LM"
31
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e25896f5c5bd693bc056a5478634a80a652b7a627e41d61f0fb6ddbe6ea8d5e8
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53af4775901a7163d635d95a3fe57ac3c41ec324e6f8f992e5db1cc53b59f2b4
3
  size 83946192
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_loss": 0.7775599956512451,
4
- "eval_runtime": 45.2826,
5
- "eval_samples": 231,
6
- "eval_samples_per_second": 5.101,
7
- "eval_steps_per_second": 0.177,
8
- "train_loss": 0.057297961895282454,
9
- "train_runtime": 986.3633,
10
  "train_samples": 20787,
11
- "train_samples_per_second": 21.074,
12
- "train_steps_per_second": 0.329
13
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.7601131853690514,
4
+ "train_runtime": 11993.6222,
 
 
 
 
 
5
  "train_samples": 20787,
6
+ "train_samples_per_second": 1.733,
7
+ "train_steps_per_second": 0.027
8
  }
runs/Apr08_20-25-52_ip-172-31-69-60.ec2.internal/events.out.tfevents.1712607974.ip-172-31-69-60.ec2.internal.23138.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272707ee25876d39e2d4b937c0d3e81d55db05ff5b99c8bf6800390d0ee51767
3
+ size 5725
runs/Apr08_22-39-05_ip-172-31-69-60.ec2.internal/events.out.tfevents.1712615965.ip-172-31-69-60.ec2.internal.45971.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9858aeaa7e541e20024a4a3996d8551253eb440245d9f61f0bc5738dfa93b4c6
3
+ size 19137
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.057297961895282454,
4
- "train_runtime": 986.3633,
5
  "train_samples": 20787,
6
- "train_samples_per_second": 21.074,
7
- "train_steps_per_second": 0.329
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.7601131853690514,
4
+ "train_runtime": 11993.6222,
5
  "train_samples": 20787,
6
+ "train_samples_per_second": 1.733,
7
+ "train_steps_per_second": 0.027
8
  }
trainer_state.json CHANGED
@@ -10,482 +10,482 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 0.2686200088875347,
14
  "learning_rate": 6.060606060606061e-06,
15
  "loss": 0.7782,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.02,
20
- "grad_norm": 0.21475453367529,
21
  "learning_rate": 3.0303030303030306e-05,
22
- "loss": 0.7797,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03,
27
- "grad_norm": 0.2247471478195798,
28
  "learning_rate": 6.060606060606061e-05,
29
  "loss": 0.7851,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05,
34
- "grad_norm": 0.20570321309918235,
35
  "learning_rate": 9.090909090909092e-05,
36
- "loss": 0.7854,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.06,
41
- "grad_norm": 0.18031349892436885,
42
  "learning_rate": 0.00012121212121212122,
43
- "loss": 0.7919,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.08,
48
- "grad_norm": 0.24146229252382978,
49
  "learning_rate": 0.00015151515151515152,
50
- "loss": 0.738,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09,
55
- "grad_norm": 0.3320926970306046,
56
  "learning_rate": 0.00018181818181818183,
57
- "loss": 0.7418,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.11,
62
- "grad_norm": 0.19858557544056218,
63
  "learning_rate": 0.00019997685019798912,
64
- "loss": 0.764,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.12,
69
- "grad_norm": 0.19954091433241658,
70
  "learning_rate": 0.0001997165380022878,
71
  "loss": 0.7607,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.14,
76
- "grad_norm": 0.1889847467880647,
77
  "learning_rate": 0.000199167731989929,
78
- "loss": 0.7949,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.15,
83
- "grad_norm": 0.19722641297574658,
84
  "learning_rate": 0.0001983320199330545,
85
- "loss": 0.7642,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.17,
90
- "grad_norm": 0.19369172161622056,
91
  "learning_rate": 0.00019721181966290613,
92
- "loss": 0.7837,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.18,
97
- "grad_norm": 0.18633655144200606,
98
  "learning_rate": 0.00019581037207470382,
99
- "loss": 0.7569,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.2,
104
- "grad_norm": 0.17978045786081515,
105
  "learning_rate": 0.00019413173175128473,
106
- "loss": 0.7615,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.22,
111
- "grad_norm": 0.1946967703536159,
112
  "learning_rate": 0.00019218075523263104,
113
- "loss": 0.7537,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.23,
118
- "grad_norm": 0.1697959010297178,
119
  "learning_rate": 0.00018996308696522433,
120
  "loss": 0.7617,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.25,
125
- "grad_norm": 0.19424776053519632,
126
  "learning_rate": 0.00018748514297187648,
127
  "loss": 0.7995,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.26,
132
- "grad_norm": 0.17591012762345642,
133
  "learning_rate": 0.00018475409228928312,
134
- "loss": 0.754,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.28,
139
- "grad_norm": 0.18668198691171894,
140
  "learning_rate": 0.00018177783622700327,
141
  "loss": 0.7754,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.29,
146
- "grad_norm": 0.1741754965716253,
147
  "learning_rate": 0.00017856498550787144,
148
- "loss": 0.7923,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.31,
153
- "grad_norm": 0.18855993719083636,
154
  "learning_rate": 0.00017512483535597867,
155
- "loss": 0.7546,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.32,
160
- "grad_norm": 0.16677585411695353,
161
  "learning_rate": 0.00017146733860429612,
162
- "loss": 0.7752,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.34,
167
- "grad_norm": 0.2057091607658948,
168
  "learning_rate": 0.0001676030768997445,
169
- "loss": 0.745,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.35,
174
- "grad_norm": 0.18783062116028282,
175
  "learning_rate": 0.00016354323008901776,
176
  "loss": 0.7499,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.37,
181
- "grad_norm": 0.1625675859425024,
182
  "learning_rate": 0.00015929954387373103,
183
- "loss": 0.7347,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.38,
188
- "grad_norm": 0.19803022161416892,
189
  "learning_rate": 0.00015488429582847192,
190
- "loss": 0.791,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4,
195
- "grad_norm": 0.19340769155849927,
196
  "learning_rate": 0.00015031025988006936,
197
- "loss": 0.7844,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.42,
202
- "grad_norm": 0.18701663817803796,
203
  "learning_rate": 0.00014559066935084588,
204
- "loss": 0.7883,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.43,
209
- "grad_norm": 0.16387851112042653,
210
  "learning_rate": 0.00014073917867277557,
211
- "loss": 0.7477,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.45,
216
- "grad_norm": 0.16958293060069174,
217
  "learning_rate": 0.0001357698238833126,
218
- "loss": 0.7577,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.46,
223
- "grad_norm": 0.17304291851159886,
224
  "learning_rate": 0.000130696982017182,
225
  "loss": 0.7349,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.48,
230
- "grad_norm": 0.20233311247327937,
231
  "learning_rate": 0.0001255353295116187,
232
- "loss": 0.7579,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.49,
237
- "grad_norm": 0.18036287897941758,
238
  "learning_rate": 0.00012029979974539234,
239
  "loss": 0.7428,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.51,
244
- "grad_norm": 0.18711131061340178,
245
  "learning_rate": 0.00011500553983446527,
246
- "loss": 0.7913,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.52,
251
- "grad_norm": 0.17301090514375908,
252
  "learning_rate": 0.00010966786680927874,
253
- "loss": 0.7278,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.54,
258
- "grad_norm": 0.19697238676167705,
259
  "learning_rate": 0.00010430222330045304,
260
- "loss": 0.7802,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.55,
265
- "grad_norm": 0.1688341794712563,
266
  "learning_rate": 9.892413286110886e-05,
267
- "loss": 0.7521,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.57,
272
- "grad_norm": 0.1770041663872631,
273
  "learning_rate": 9.354915505506839e-05,
274
- "loss": 0.7652,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.58,
279
- "grad_norm": 0.17975920055364886,
280
  "learning_rate": 8.81928404408726e-05,
281
- "loss": 0.7423,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.6,
286
- "grad_norm": 0.16596750546842118,
287
  "learning_rate": 8.287068558185225e-05,
288
  "loss": 0.785,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.62,
293
- "grad_norm": 0.17922768297335867,
294
  "learning_rate": 7.759808821241406e-05,
295
- "loss": 0.7469,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.63,
300
- "grad_norm": 0.16797159131863615,
301
  "learning_rate": 7.239030269025311e-05,
302
- "loss": 0.7434,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.65,
307
- "grad_norm": 0.17963390706599044,
308
  "learning_rate": 6.726239586337408e-05,
309
  "loss": 0.76,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.66,
314
- "grad_norm": 0.1632820830493435,
315
  "learning_rate": 6.22292034796035e-05,
316
- "loss": 0.753,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.68,
321
- "grad_norm": 0.1826122995064924,
322
  "learning_rate": 5.730528726470792e-05,
323
- "loss": 0.7681,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.69,
328
- "grad_norm": 0.1613160734854796,
329
  "learning_rate": 5.2504892793295e-05,
330
- "loss": 0.743,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.71,
335
- "grad_norm": 0.18621725463530472,
336
  "learning_rate": 4.7841908274384616e-05,
337
- "loss": 0.7717,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.72,
342
- "grad_norm": 0.1744166288344059,
343
  "learning_rate": 4.332982437088825e-05,
344
- "loss": 0.7149,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.74,
349
- "grad_norm": 0.17659716102656,
350
  "learning_rate": 3.898169516924398e-05,
351
- "loss": 0.751,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.75,
356
- "grad_norm": 0.1715227056995353,
357
  "learning_rate": 3.4810100412128747e-05,
358
- "loss": 0.7561,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.77,
363
- "grad_norm": 0.17506626977459022,
364
  "learning_rate": 3.0827109103512643e-05,
365
  "loss": 0.7669,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.78,
370
- "grad_norm": 0.17815639423266896,
371
  "learning_rate": 2.7044244591351232e-05,
372
- "loss": 0.7589,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.8,
377
- "grad_norm": 0.1636846820858173,
378
  "learning_rate": 2.3472451228937253e-05,
379
- "loss": 0.7393,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.82,
384
- "grad_norm": 0.17100053615356214,
385
  "learning_rate": 2.0122062711363532e-05,
386
- "loss": 0.7764,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.83,
391
- "grad_norm": 0.17768328277125636,
392
  "learning_rate": 1.7002772178705716e-05,
393
- "loss": 0.7331,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.85,
398
- "grad_norm": 0.16988419995617995,
399
  "learning_rate": 1.4123604172419713e-05,
400
- "loss": 0.7719,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.86,
405
- "grad_norm": 0.18468128236342862,
406
  "learning_rate": 1.149288852608743e-05,
407
  "loss": 0.7558,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.88,
412
- "grad_norm": 0.16594356186499187,
413
  "learning_rate": 9.118236266049707e-06,
414
- "loss": 0.7309,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.89,
419
- "grad_norm": 0.1726455301629271,
420
  "learning_rate": 7.0065175916482095e-06,
421
- "loss": 0.76,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.91,
426
- "grad_norm": 0.1521864857478625,
427
  "learning_rate": 5.163841998782837e-06,
428
- "loss": 0.7642,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.92,
433
- "grad_norm": 0.1629366183790264,
434
  "learning_rate": 3.595540604290437e-06,
435
- "loss": 0.7578,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.94,
440
- "grad_norm": 0.17093253787814383,
441
  "learning_rate": 2.30615072228183e-06,
442
- "loss": 0.733,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.95,
447
- "grad_norm": 0.16234114073804565,
448
  "learning_rate": 1.2994027370611173e-06,
449
- "loss": 0.7626,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.97,
454
- "grad_norm": 0.15162800717592476,
455
  "learning_rate": 5.782093106048159e-07,
456
  "loss": 0.7366,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.98,
461
- "grad_norm": 0.17565044386960604,
462
  "learning_rate": 1.446569558255395e-07,
463
- "loss": 0.7431,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 1.0,
468
- "grad_norm": 0.1698173841843245,
469
  "learning_rate": 0.0,
470
- "loss": 0.749,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 1.0,
475
- "eval_loss": 0.7774477005004883,
476
- "eval_runtime": 47.2759,
477
- "eval_samples_per_second": 4.886,
478
- "eval_steps_per_second": 0.169,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 1.0,
483
  "step": 325,
484
- "total_flos": 4954791347224576.0,
485
- "train_loss": 0.057297961895282454,
486
- "train_runtime": 986.3633,
487
- "train_samples_per_second": 21.074,
488
- "train_steps_per_second": 0.329
489
  }
490
  ],
491
  "logging_steps": 5,
@@ -493,7 +493,7 @@
493
  "num_input_tokens_seen": 0,
494
  "num_train_epochs": 1,
495
  "save_steps": 100,
496
- "total_flos": 4954791347224576.0,
497
  "train_batch_size": 8,
498
  "trial_name": null,
499
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 0.26868827253641747,
14
  "learning_rate": 6.060606060606061e-06,
15
  "loss": 0.7782,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.02,
20
+ "grad_norm": 0.21382632169384555,
21
  "learning_rate": 3.0303030303030306e-05,
22
+ "loss": 0.7798,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.03,
27
+ "grad_norm": 0.22445606867737244,
28
  "learning_rate": 6.060606060606061e-05,
29
  "loss": 0.7851,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.05,
34
+ "grad_norm": 0.2072105558411773,
35
  "learning_rate": 9.090909090909092e-05,
36
+ "loss": 0.7855,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.06,
41
+ "grad_norm": 0.17995218448427217,
42
  "learning_rate": 0.00012121212121212122,
43
+ "loss": 0.792,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.08,
48
+ "grad_norm": 0.226508952573423,
49
  "learning_rate": 0.00015151515151515152,
50
+ "loss": 0.7382,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.09,
55
+ "grad_norm": 0.18958014075029098,
56
  "learning_rate": 0.00018181818181818183,
57
+ "loss": 0.7419,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.11,
62
+ "grad_norm": 0.19949889749656097,
63
  "learning_rate": 0.00019997685019798912,
64
+ "loss": 0.7642,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.12,
69
+ "grad_norm": 0.19567934772580736,
70
  "learning_rate": 0.0001997165380022878,
71
  "loss": 0.7607,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.14,
76
+ "grad_norm": 0.18969776374927835,
77
  "learning_rate": 0.000199167731989929,
78
+ "loss": 0.795,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.15,
83
+ "grad_norm": 0.1942706799722158,
84
  "learning_rate": 0.0001983320199330545,
85
+ "loss": 0.764,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.17,
90
+ "grad_norm": 0.19464988909031225,
91
  "learning_rate": 0.00019721181966290613,
92
+ "loss": 0.7836,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.18,
97
+ "grad_norm": 0.1849352970071158,
98
  "learning_rate": 0.00019581037207470382,
99
+ "loss": 0.757,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.2,
104
+ "grad_norm": 0.17976128059849958,
105
  "learning_rate": 0.00019413173175128473,
106
+ "loss": 0.7616,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.22,
111
+ "grad_norm": 0.1934718609207969,
112
  "learning_rate": 0.00019218075523263104,
113
+ "loss": 0.7538,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.23,
118
+ "grad_norm": 0.16955946612078765,
119
  "learning_rate": 0.00018996308696522433,
120
  "loss": 0.7617,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.25,
125
+ "grad_norm": 0.19860710500579018,
126
  "learning_rate": 0.00018748514297187648,
127
  "loss": 0.7995,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.26,
132
+ "grad_norm": 0.17834039868116905,
133
  "learning_rate": 0.00018475409228928312,
134
+ "loss": 0.7539,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.28,
139
+ "grad_norm": 0.1889302979453787,
140
  "learning_rate": 0.00018177783622700327,
141
  "loss": 0.7754,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.29,
146
+ "grad_norm": 0.17928302611917235,
147
  "learning_rate": 0.00017856498550787144,
148
+ "loss": 0.7924,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.31,
153
+ "grad_norm": 0.18965987325021239,
154
  "learning_rate": 0.00017512483535597867,
155
+ "loss": 0.7547,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.32,
160
+ "grad_norm": 0.16948470382160882,
161
  "learning_rate": 0.00017146733860429612,
162
+ "loss": 0.7755,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.34,
167
+ "grad_norm": 0.20373009441973453,
168
  "learning_rate": 0.0001676030768997445,
169
+ "loss": 0.7452,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.35,
174
+ "grad_norm": 0.18030063668475196,
175
  "learning_rate": 0.00016354323008901776,
176
  "loss": 0.7499,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.37,
181
+ "grad_norm": 0.16132188005262432,
182
  "learning_rate": 0.00015929954387373103,
183
+ "loss": 0.7344,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.38,
188
+ "grad_norm": 0.2014655387761585,
189
  "learning_rate": 0.00015488429582847192,
190
+ "loss": 0.7909,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.4,
195
+ "grad_norm": 0.19701253442033323,
196
  "learning_rate": 0.00015031025988006936,
197
+ "loss": 0.7846,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.42,
202
+ "grad_norm": 0.18599715688382074,
203
  "learning_rate": 0.00014559066935084588,
204
+ "loss": 0.7884,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.43,
209
+ "grad_norm": 0.15916898081839567,
210
  "learning_rate": 0.00014073917867277557,
211
+ "loss": 0.7478,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.45,
216
+ "grad_norm": 0.1693289847957747,
217
  "learning_rate": 0.0001357698238833126,
218
+ "loss": 0.758,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.46,
223
+ "grad_norm": 0.1724296952005482,
224
  "learning_rate": 0.000130696982017182,
225
  "loss": 0.7349,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.48,
230
+ "grad_norm": 0.20365543265357713,
231
  "learning_rate": 0.0001255353295116187,
232
+ "loss": 0.7581,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.49,
237
+ "grad_norm": 0.17925645984310104,
238
  "learning_rate": 0.00012029979974539234,
239
  "loss": 0.7428,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.51,
244
+ "grad_norm": 0.1849261223570645,
245
  "learning_rate": 0.00011500553983446527,
246
+ "loss": 0.7915,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.52,
251
+ "grad_norm": 0.17147514694965366,
252
  "learning_rate": 0.00010966786680927874,
253
+ "loss": 0.7282,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.54,
258
+ "grad_norm": 0.19718678657625408,
259
  "learning_rate": 0.00010430222330045304,
260
+ "loss": 0.7807,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.55,
265
+ "grad_norm": 0.1697756132509757,
266
  "learning_rate": 9.892413286110886e-05,
267
+ "loss": 0.7522,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.57,
272
+ "grad_norm": 0.1745926097752596,
273
  "learning_rate": 9.354915505506839e-05,
274
+ "loss": 0.7656,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.58,
279
+ "grad_norm": 0.17991554784663008,
280
  "learning_rate": 8.81928404408726e-05,
281
+ "loss": 0.7425,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.6,
286
+ "grad_norm": 0.1638975774298517,
287
  "learning_rate": 8.287068558185225e-05,
288
  "loss": 0.785,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.62,
293
+ "grad_norm": 0.1721376274882642,
294
  "learning_rate": 7.759808821241406e-05,
295
+ "loss": 0.747,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.63,
300
+ "grad_norm": 0.16645909181505114,
301
  "learning_rate": 7.239030269025311e-05,
302
+ "loss": 0.7436,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.65,
307
+ "grad_norm": 0.17668278165163676,
308
  "learning_rate": 6.726239586337408e-05,
309
  "loss": 0.76,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.66,
314
+ "grad_norm": 0.1592821063866204,
315
  "learning_rate": 6.22292034796035e-05,
316
+ "loss": 0.7532,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.68,
321
+ "grad_norm": 0.18211269157973645,
322
  "learning_rate": 5.730528726470792e-05,
323
+ "loss": 0.7684,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.69,
328
+ "grad_norm": 0.1611844005218168,
329
  "learning_rate": 5.2504892793295e-05,
330
+ "loss": 0.7432,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.71,
335
+ "grad_norm": 0.1770721524356504,
336
  "learning_rate": 4.7841908274384616e-05,
337
+ "loss": 0.7719,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.72,
342
+ "grad_norm": 0.17458138059176753,
343
  "learning_rate": 4.332982437088825e-05,
344
+ "loss": 0.7153,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.74,
349
+ "grad_norm": 0.1764916869298734,
350
  "learning_rate": 3.898169516924398e-05,
351
+ "loss": 0.7509,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.75,
356
+ "grad_norm": 0.17474182112370204,
357
  "learning_rate": 3.4810100412128747e-05,
358
+ "loss": 0.7562,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.77,
363
+ "grad_norm": 0.1741562771880736,
364
  "learning_rate": 3.0827109103512643e-05,
365
  "loss": 0.7669,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.78,
370
+ "grad_norm": 0.17549490206623847,
371
  "learning_rate": 2.7044244591351232e-05,
372
+ "loss": 0.7592,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.8,
377
+ "grad_norm": 0.16399003172635285,
378
  "learning_rate": 2.3472451228937253e-05,
379
+ "loss": 0.7396,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.82,
384
+ "grad_norm": 0.1713214529866966,
385
  "learning_rate": 2.0122062711363532e-05,
386
+ "loss": 0.7766,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.83,
391
+ "grad_norm": 0.17574824064745212,
392
  "learning_rate": 1.7002772178705716e-05,
393
+ "loss": 0.7329,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.85,
398
+ "grad_norm": 0.16897840325162736,
399
  "learning_rate": 1.4123604172419713e-05,
400
+ "loss": 0.7723,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.86,
405
+ "grad_norm": 0.18239238197655358,
406
  "learning_rate": 1.149288852608743e-05,
407
  "loss": 0.7558,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.88,
412
+ "grad_norm": 0.16446806582557563,
413
  "learning_rate": 9.118236266049707e-06,
414
+ "loss": 0.731,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.89,
419
+ "grad_norm": 0.17024389183471494,
420
  "learning_rate": 7.0065175916482095e-06,
421
+ "loss": 0.7599,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.91,
426
+ "grad_norm": 0.15264356146830035,
427
  "learning_rate": 5.163841998782837e-06,
428
+ "loss": 0.7644,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.92,
433
+ "grad_norm": 0.16324609453084554,
434
  "learning_rate": 3.595540604290437e-06,
435
+ "loss": 0.7581,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.94,
440
+ "grad_norm": 0.1710138375738759,
441
  "learning_rate": 2.30615072228183e-06,
442
+ "loss": 0.7332,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.95,
447
+ "grad_norm": 0.1611044475082952,
448
  "learning_rate": 1.2994027370611173e-06,
449
+ "loss": 0.7627,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.97,
454
+ "grad_norm": 0.1514646487097857,
455
  "learning_rate": 5.782093106048159e-07,
456
  "loss": 0.7366,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.98,
461
+ "grad_norm": 0.17261842858442877,
462
  "learning_rate": 1.446569558255395e-07,
463
+ "loss": 0.7436,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 1.0,
468
+ "grad_norm": 0.16882560969081586,
469
  "learning_rate": 0.0,
470
+ "loss": 0.7491,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 1.0,
475
+ "eval_loss": 0.7773829698562622,
476
+ "eval_runtime": 46.5077,
477
+ "eval_samples_per_second": 4.967,
478
+ "eval_steps_per_second": 0.172,
479
  "step": 325
480
  },
481
  {
482
  "epoch": 1.0,
483
  "step": 325,
484
+ "total_flos": 4962652747988992.0,
485
+ "train_loss": 0.7601131853690514,
486
+ "train_runtime": 11993.6222,
487
+ "train_samples_per_second": 1.733,
488
+ "train_steps_per_second": 0.027
489
  }
490
  ],
491
  "logging_steps": 5,
 
493
  "num_input_tokens_seen": 0,
494
  "num_train_epochs": 1,
495
  "save_steps": 100,
496
+ "total_flos": 4962652747988992.0,
497
  "train_batch_size": 8,
498
  "trial_name": null,
499
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa7194cfe3338f80c07cc46268e8cc67ed5c2fbad2b4816eb1f296be0fe00d1f
3
  size 6072
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:971a709049ccad23c89c4e6cee01ec3eb6463ee0f5a631966b03929525be0f01
3
  size 6072