humbertonc committed
Commit a35156c · 1 Parent(s): 1415be6

Training in progress, step 25, checkpoint

last-checkpoint/adapter_config.json CHANGED
@@ -16,13 +16,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "k_proj",
     "v_proj",
+    "gate_proj",
     "up_proj",
+    "q_proj",
+    "k_proj",
     "o_proj",
-    "down_proj",
-    "q_proj"
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
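
The adapter_config.json hunk only reorders the entries of target_modules; the same seven projection layers are LoRA-wrapped before and after the change. As a rough illustration (the training script is not part of this commit, and the rank, alpha, and dropout values below are assumptions rather than values taken from this diff), a PEFT configuration that produces such a module list could look like:

# Hedged sketch: a peft.LoraConfig targeting the same projection modules
# listed in last-checkpoint/adapter_config.json. r, lora_alpha, and
# lora_dropout are placeholder assumptions; they are not shown in this hunk.
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,               # assumed rank
    lora_alpha=32,      # assumed scaling factor
    lora_dropout=0.05,  # assumed dropout
    target_modules=[
        "v_proj", "gate_proj", "up_proj",
        "q_proj", "k_proj", "o_proj", "down_proj",
    ],
    task_type="CAUSAL_LM",
)
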
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:661995d6fcb6405e7c8ad56ab888ddeab69ea83d2429fbf08b080ea5756f3eb6
+oid sha256:8a965a0e35eb565282b2b1fcfc122cb90b82d05bbb2e9f0b12b4691a072f4cb0
 size 80013120
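
The binary checkpoint files in this commit are tracked with Git LFS, so each of their diffs only touches the pointer file: the spec version, the object's sha256 oid, and its size. A minimal sketch for checking a locally downloaded copy against the oid recorded above (the local path is illustrative):

# Minimal sketch: verify a downloaded LFS object against the sha256 oid
# from its pointer file. The local path below is illustrative.
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

expected = "8a965a0e35eb565282b2b1fcfc122cb90b82d05bbb2e9f0b12b4691a072f4cb0"
assert sha256_of("last-checkpoint/adapter_model.safetensors") == expected
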
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aea908c6d7c1a8815a3a34246a07b139da722c669d76b45323ce9a8b68a35787
+oid sha256:1edde597d127de07be7f782f6dcaf35d1273cdd6b7b1b022121fc26ce68fd22e
 size 40570324
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7d8f7f8e1a76d94091cc701a4a00cc6fd2a3c1a746e29e297276a305cc6801f
+oid sha256:8d504d22c70365e955869229804fc73b5137014a18fd4465b41a6a55d0b2d969
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2cb9069cac0762cbe9eda0682450e4a822dfd10b17c5f7335626649d1825624d
+oid sha256:dec2630d82b42dc3f2af7f561c89bb49dcba8fb4678d8138315fec1825cabb01
 size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0018656716417910447,
+  "epoch": 0.0006218905472636816,
   "eval_steps": 1000,
-  "global_step": 75,
+  "global_step": 25,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -23,447 +23,147 @@
     {
       "epoch": 0.0,
       "learning_rate": 0.00012,
-      "loss": 2.0252,
+      "loss": 2.0249,
       "step": 3
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00016,
-      "loss": 1.9536,
+      "loss": 1.9519,
       "step": 4
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.0002,
-      "loss": 2.1671,
+      "loss": 2.1586,
       "step": 5
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00019789473684210526,
-      "loss": 1.6695,
+      "loss": 1.6571,
       "step": 6
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00019578947368421054,
-      "loss": 1.7043,
+      "loss": 1.6994,
       "step": 7
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.0001936842105263158,
-      "loss": 1.635,
+      "loss": 1.6263,
       "step": 8
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00019157894736842104,
-      "loss": 1.5576,
+      "loss": 1.5557,
       "step": 9
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00018947368421052632,
-      "loss": 1.5921,
+      "loss": 1.5908,
       "step": 10
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.0001873684210526316,
-      "loss": 1.814,
+      "loss": 1.8123,
       "step": 11
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00018526315789473685,
-      "loss": 1.7531,
+      "loss": 1.7386,
       "step": 12
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.0001831578947368421,
-      "loss": 1.6155,
+      "loss": 1.6149,
       "step": 13
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00018105263157894739,
-      "loss": 1.6424,
+      "loss": 1.6512,
       "step": 14
     },
     {
       "epoch": 0.0,
       "learning_rate": 0.00017894736842105264,
-      "loss": 1.453,
+      "loss": 1.4589,
       "step": 15
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.0001768421052631579,
-      "loss": 1.4445,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 1.4591,
       "step": 16
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00017473684210526317,
-      "loss": 1.441,
+      "learning_rate": 0.0001768421052631579,
+      "loss": 1.4456,
       "step": 17
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00017263157894736842,
-      "loss": 1.5453,
+      "learning_rate": 0.00017473684210526317,
+      "loss": 1.5533,
       "step": 18
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.0001705263157894737,
-      "loss": 1.4167,
+      "learning_rate": 0.00017263157894736842,
+      "loss": 1.424,
       "step": 19
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00016842105263157895,
-      "loss": 1.4823,
+      "learning_rate": 0.0001705263157894737,
+      "loss": 1.4817,
       "step": 20
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00016631578947368423,
-      "loss": 1.6061,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 1.6008,
       "step": 21
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00016421052631578948,
-      "loss": 1.4348,
+      "learning_rate": 0.00016631578947368423,
+      "loss": 1.4342,
       "step": 22
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00016210526315789473,
-      "loss": 1.7074,
+      "learning_rate": 0.00016421052631578948,
+      "loss": 1.7079,
       "step": 23
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00016,
-      "loss": 1.7641,
+      "learning_rate": 0.00016210526315789473,
+      "loss": 1.761,
       "step": 24
     },
     {
       "epoch": 0.0,
-      "learning_rate": 0.00015789473684210527,
-      "loss": 1.694,
+      "learning_rate": 0.00016,
+      "loss": 1.6936,
       "step": 25
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00015578947368421052,
-      "loss": 1.4779,
-      "step": 26
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001536842105263158,
-      "loss": 1.638,
-      "step": 27
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00015157894736842108,
-      "loss": 1.5519,
-      "step": 28
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00014947368421052633,
-      "loss": 1.7623,
-      "step": 29
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00014736842105263158,
-      "loss": 1.2871,
-      "step": 30
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00014526315789473686,
-      "loss": 1.5806,
-      "step": 31
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001431578947368421,
-      "loss": 1.5541,
-      "step": 32
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00014105263157894736,
-      "loss": 1.6991,
-      "step": 33
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00013894736842105264,
-      "loss": 1.5331,
-      "step": 34
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001368421052631579,
-      "loss": 1.9143,
-      "step": 35
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00013473684210526317,
-      "loss": 1.6051,
-      "step": 36
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00013263157894736842,
-      "loss": 1.6311,
-      "step": 37
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001305263157894737,
-      "loss": 1.4711,
-      "step": 38
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00012842105263157895,
-      "loss": 1.4644,
-      "step": 39
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001263157894736842,
-      "loss": 1.7623,
-      "step": 40
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00012421052631578949,
-      "loss": 1.5635,
-      "step": 41
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00012210526315789474,
-      "loss": 1.3194,
-      "step": 42
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00012,
-      "loss": 1.3296,
-      "step": 43
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00011789473684210525,
-      "loss": 1.6656,
-      "step": 44
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00011578947368421053,
-      "loss": 1.5584,
-      "step": 45
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.0001136842105263158,
-      "loss": 1.6557,
-      "step": 46
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00011157894736842105,
-      "loss": 1.4512,
-      "step": 47
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00010947368421052633,
-      "loss": 1.4563,
-      "step": 48
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00010736842105263158,
-      "loss": 1.4214,
-      "step": 49
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00010526315789473685,
-      "loss": 1.6556,
-      "step": 50
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00010315789473684211,
-      "loss": 1.5774,
-      "step": 51
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 0.00010105263157894738,
-      "loss": 1.5624,
-      "step": 52
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 9.894736842105263e-05,
-      "loss": 1.0224,
-      "step": 53
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 9.68421052631579e-05,
-      "loss": 1.5792,
-      "step": 54
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 9.473684210526316e-05,
-      "loss": 1.6203,
-      "step": 55
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 9.263157894736843e-05,
-      "loss": 1.6145,
-      "step": 56
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 9.052631578947369e-05,
-      "loss": 1.5488,
-      "step": 57
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 8.842105263157894e-05,
-      "loss": 1.7138,
-      "step": 58
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 8.631578947368421e-05,
-      "loss": 1.5731,
-      "step": 59
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 8.421052631578948e-05,
-      "loss": 1.2584,
-      "step": 60
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 8.210526315789474e-05,
-      "loss": 1.4644,
-      "step": 61
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 8e-05,
-      "loss": 1.4227,
-      "step": 62
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 7.789473684210526e-05,
-      "loss": 1.4762,
-      "step": 63
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 7.578947368421054e-05,
-      "loss": 1.6393,
-      "step": 64
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 7.368421052631579e-05,
-      "loss": 1.6137,
-      "step": 65
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 7.157894736842105e-05,
-      "loss": 1.4132,
-      "step": 66
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.947368421052632e-05,
-      "loss": 1.9143,
-      "step": 67
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.736842105263159e-05,
-      "loss": 1.2177,
-      "step": 68
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.526315789473685e-05,
-      "loss": 1.6103,
-      "step": 69
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.31578947368421e-05,
-      "loss": 1.3548,
-      "step": 70
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 6.105263157894737e-05,
-      "loss": 1.6381,
-      "step": 71
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 5.894736842105263e-05,
-      "loss": 1.6296,
-      "step": 72
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 5.68421052631579e-05,
-      "loss": 1.6071,
-      "step": 73
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 5.4736842105263165e-05,
-      "loss": 1.7263,
-      "step": 74
-    },
-    {
-      "epoch": 0.0,
-      "learning_rate": 5.2631578947368424e-05,
-      "loss": 1.5891,
-      "step": 75
     }
   ],
   "logging_steps": 1,
   "max_steps": 100,
   "num_train_epochs": 1,
   "save_steps": 25,
-  "total_flos": 4183967298846720.0,
+  "total_flos": 1278571417288704.0,
   "trial_name": null,
   "trial_params": null
 }
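
The learning_rate values logged in trainer_state.json follow a linear schedule: warmup to the 2e-4 peak over the first 5 steps, then linear decay toward 0 at max_steps = 100 (for example, step 6 gives 2e-4 * 94 / 95 ≈ 0.00019789). The updated file logs the rate one step behind for steps 16 through 25, so the checks below follow the pre-change log. A small sketch of that schedule, with the peak, warmup, and total step counts inferred from the logged values rather than read from training_args.bin (a binary blob in this commit):

# Sketch of the linear warmup + decay schedule implied by the logged
# learning rates (peak 2e-4, warmup_steps=5, max_steps=100). These values
# are inferred from trainer_state.json, not read from training_args.bin.
def linear_lr(step, peak=2e-4, warmup=5, total=100):
    if step < warmup:
        return peak * step / warmup
    return peak * max(0, total - step) / (total - warmup)

assert abs(linear_lr(3) - 0.00012) < 1e-9                  # logged at step 3
assert abs(linear_lr(6) - 0.00019789473684210526) < 1e-9   # logged at step 6
assert abs(linear_lr(25) - 0.00015789473684210527) < 1e-9  # step 25, pre-change log
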
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:619cb7fe493d4c12996a60232bbe7210fa65382ffafcccaff8d4e67c6e0757fb
+oid sha256:64bc67c40371d89ad1c0c5799c319b73a6cbfb79d05d7cb27dd81ef6351298a8
 size 4600