alxxtexxr commited on
Commit
da0c8c4
1 Parent(s): 3bf9c64

Upload folder using huggingface_hub

Browse files
checkpoint-168/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "codellama/CodeLlama-7b-hf",
3
+ "bias": "none",
4
+ "enable_lora": null,
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "lora_alpha": 16,
9
+ "lora_dropout": 0.05,
10
+ "merge_weights": false,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 8,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "v_proj"
17
+ ],
18
+ "task_type": "CAUSAL_LM"
19
+ }
checkpoint-168/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02adea0407823018d080f6791b60bf604af6f50b2690f4a09fad4f9f784f7d69
3
+ size 16822989
checkpoint-168/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a86f2b1b0e1227dad795b876da0db2310220fcd7402e8a7a862199363ad0a5ed
3
+ size 33661637
checkpoint-168/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09b2e0d526507d04a43b8ac1262bd47f484a608f8ab4cc658454f2de4b748c5d
3
+ size 14575
checkpoint-168/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e450fbc929f0f23bbec27f12fe032523324b9a551c437a3d580c33d907eb3f3
3
+ size 627
checkpoint-168/trainer_state.json ADDED
@@ -0,0 +1,453 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.13684938848018646,
3
+ "best_model_checkpoint": "./lora-out/checkpoint-168",
4
+ "epoch": 11.893805309734514,
5
+ "eval_steps": 4,
6
+ "global_step": 168,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.28,
13
+ "eval_loss": 0.3006412982940674,
14
+ "eval_runtime": 54.7333,
15
+ "eval_samples_per_second": 0.914,
16
+ "eval_steps_per_second": 0.238,
17
+ "step": 4
18
+ },
19
+ {
20
+ "epoch": 0.57,
21
+ "eval_loss": 0.300335168838501,
22
+ "eval_runtime": 54.8113,
23
+ "eval_samples_per_second": 0.912,
24
+ "eval_steps_per_second": 0.237,
25
+ "step": 8
26
+ },
27
+ {
28
+ "epoch": 0.71,
29
+ "learning_rate": 1e-05,
30
+ "loss": 0.3024,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.85,
35
+ "eval_loss": 0.2993900179862976,
36
+ "eval_runtime": 54.782,
37
+ "eval_samples_per_second": 0.913,
38
+ "eval_steps_per_second": 0.237,
39
+ "step": 12
40
+ },
41
+ {
42
+ "epoch": 1.13,
43
+ "eval_loss": 0.29816487431526184,
44
+ "eval_runtime": 54.8049,
45
+ "eval_samples_per_second": 0.912,
46
+ "eval_steps_per_second": 0.237,
47
+ "step": 16
48
+ },
49
+ {
50
+ "epoch": 1.42,
51
+ "learning_rate": 2e-05,
52
+ "loss": 0.3035,
53
+ "step": 20
54
+ },
55
+ {
56
+ "epoch": 1.42,
57
+ "eval_loss": 0.29595255851745605,
58
+ "eval_runtime": 54.7885,
59
+ "eval_samples_per_second": 0.913,
60
+ "eval_steps_per_second": 0.237,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 1.7,
65
+ "eval_loss": 0.2939557135105133,
66
+ "eval_runtime": 54.7999,
67
+ "eval_samples_per_second": 0.912,
68
+ "eval_steps_per_second": 0.237,
69
+ "step": 24
70
+ },
71
+ {
72
+ "epoch": 1.98,
73
+ "eval_loss": 0.29013773798942566,
74
+ "eval_runtime": 54.7805,
75
+ "eval_samples_per_second": 0.913,
76
+ "eval_steps_per_second": 0.237,
77
+ "step": 28
78
+ },
79
+ {
80
+ "epoch": 2.12,
81
+ "learning_rate": 3e-05,
82
+ "loss": 0.2959,
83
+ "step": 30
84
+ },
85
+ {
86
+ "epoch": 2.27,
87
+ "eval_loss": 0.28251081705093384,
88
+ "eval_runtime": 54.7706,
89
+ "eval_samples_per_second": 0.913,
90
+ "eval_steps_per_second": 0.237,
91
+ "step": 32
92
+ },
93
+ {
94
+ "epoch": 2.55,
95
+ "eval_loss": 0.2771329879760742,
96
+ "eval_runtime": 54.7818,
97
+ "eval_samples_per_second": 0.913,
98
+ "eval_steps_per_second": 0.237,
99
+ "step": 36
100
+ },
101
+ {
102
+ "epoch": 2.83,
103
+ "learning_rate": 4e-05,
104
+ "loss": 0.284,
105
+ "step": 40
106
+ },
107
+ {
108
+ "epoch": 2.83,
109
+ "eval_loss": 0.27146488428115845,
110
+ "eval_runtime": 54.803,
111
+ "eval_samples_per_second": 0.912,
112
+ "eval_steps_per_second": 0.237,
113
+ "step": 40
114
+ },
115
+ {
116
+ "epoch": 3.12,
117
+ "eval_loss": 0.26464152336120605,
118
+ "eval_runtime": 54.8467,
119
+ "eval_samples_per_second": 0.912,
120
+ "eval_steps_per_second": 0.237,
121
+ "step": 44
122
+ },
123
+ {
124
+ "epoch": 3.4,
125
+ "eval_loss": 0.25653430819511414,
126
+ "eval_runtime": 54.8327,
127
+ "eval_samples_per_second": 0.912,
128
+ "eval_steps_per_second": 0.237,
129
+ "step": 48
130
+ },
131
+ {
132
+ "epoch": 3.54,
133
+ "learning_rate": 5e-05,
134
+ "loss": 0.263,
135
+ "step": 50
136
+ },
137
+ {
138
+ "epoch": 3.68,
139
+ "eval_loss": 0.24627122282981873,
140
+ "eval_runtime": 54.813,
141
+ "eval_samples_per_second": 0.912,
142
+ "eval_steps_per_second": 0.237,
143
+ "step": 52
144
+ },
145
+ {
146
+ "epoch": 3.96,
147
+ "eval_loss": 0.23474617302417755,
148
+ "eval_runtime": 54.7901,
149
+ "eval_samples_per_second": 0.913,
150
+ "eval_steps_per_second": 0.237,
151
+ "step": 56
152
+ },
153
+ {
154
+ "epoch": 4.25,
155
+ "learning_rate": 6e-05,
156
+ "loss": 0.241,
157
+ "step": 60
158
+ },
159
+ {
160
+ "epoch": 4.25,
161
+ "eval_loss": 0.2220366895198822,
162
+ "eval_runtime": 54.7983,
163
+ "eval_samples_per_second": 0.912,
164
+ "eval_steps_per_second": 0.237,
165
+ "step": 60
166
+ },
167
+ {
168
+ "epoch": 4.53,
169
+ "eval_loss": 0.20926769077777863,
170
+ "eval_runtime": 54.7403,
171
+ "eval_samples_per_second": 0.913,
172
+ "eval_steps_per_second": 0.237,
173
+ "step": 64
174
+ },
175
+ {
176
+ "epoch": 4.81,
177
+ "eval_loss": 0.19629451632499695,
178
+ "eval_runtime": 54.7525,
179
+ "eval_samples_per_second": 0.913,
180
+ "eval_steps_per_second": 0.237,
181
+ "step": 68
182
+ },
183
+ {
184
+ "epoch": 4.96,
185
+ "learning_rate": 7e-05,
186
+ "loss": 0.2101,
187
+ "step": 70
188
+ },
189
+ {
190
+ "epoch": 5.1,
191
+ "eval_loss": 0.18524658679962158,
192
+ "eval_runtime": 54.7303,
193
+ "eval_samples_per_second": 0.914,
194
+ "eval_steps_per_second": 0.238,
195
+ "step": 72
196
+ },
197
+ {
198
+ "epoch": 5.38,
199
+ "eval_loss": 0.17731742560863495,
200
+ "eval_runtime": 54.7552,
201
+ "eval_samples_per_second": 0.913,
202
+ "eval_steps_per_second": 0.237,
203
+ "step": 76
204
+ },
205
+ {
206
+ "epoch": 5.66,
207
+ "learning_rate": 8e-05,
208
+ "loss": 0.1788,
209
+ "step": 80
210
+ },
211
+ {
212
+ "epoch": 5.66,
213
+ "eval_loss": 0.16993452608585358,
214
+ "eval_runtime": 54.729,
215
+ "eval_samples_per_second": 0.914,
216
+ "eval_steps_per_second": 0.238,
217
+ "step": 80
218
+ },
219
+ {
220
+ "epoch": 5.95,
221
+ "eval_loss": 0.164781853556633,
222
+ "eval_runtime": 54.741,
223
+ "eval_samples_per_second": 0.913,
224
+ "eval_steps_per_second": 0.237,
225
+ "step": 84
226
+ },
227
+ {
228
+ "epoch": 6.23,
229
+ "eval_loss": 0.16103117167949677,
230
+ "eval_runtime": 54.7837,
231
+ "eval_samples_per_second": 0.913,
232
+ "eval_steps_per_second": 0.237,
233
+ "step": 88
234
+ },
235
+ {
236
+ "epoch": 6.37,
237
+ "learning_rate": 9e-05,
238
+ "loss": 0.1615,
239
+ "step": 90
240
+ },
241
+ {
242
+ "epoch": 6.51,
243
+ "eval_loss": 0.15781742334365845,
244
+ "eval_runtime": 54.7138,
245
+ "eval_samples_per_second": 0.914,
246
+ "eval_steps_per_second": 0.238,
247
+ "step": 92
248
+ },
249
+ {
250
+ "epoch": 6.8,
251
+ "eval_loss": 0.15516981482505798,
252
+ "eval_runtime": 54.7516,
253
+ "eval_samples_per_second": 0.913,
254
+ "eval_steps_per_second": 0.237,
255
+ "step": 96
256
+ },
257
+ {
258
+ "epoch": 7.08,
259
+ "learning_rate": 0.0001,
260
+ "loss": 0.1533,
261
+ "step": 100
262
+ },
263
+ {
264
+ "epoch": 7.08,
265
+ "eval_loss": 0.15261690318584442,
266
+ "eval_runtime": 54.6891,
267
+ "eval_samples_per_second": 0.914,
268
+ "eval_steps_per_second": 0.238,
269
+ "step": 100
270
+ },
271
+ {
272
+ "epoch": 7.36,
273
+ "eval_loss": 0.15066812932491302,
274
+ "eval_runtime": 54.6884,
275
+ "eval_samples_per_second": 0.914,
276
+ "eval_steps_per_second": 0.238,
277
+ "step": 104
278
+ },
279
+ {
280
+ "epoch": 7.65,
281
+ "eval_loss": 0.14893724024295807,
282
+ "eval_runtime": 54.6275,
283
+ "eval_samples_per_second": 0.915,
284
+ "eval_steps_per_second": 0.238,
285
+ "step": 108
286
+ },
287
+ {
288
+ "epoch": 7.79,
289
+ "learning_rate": 9.090909090909092e-05,
290
+ "loss": 0.1463,
291
+ "step": 110
292
+ },
293
+ {
294
+ "epoch": 7.93,
295
+ "eval_loss": 0.14742153882980347,
296
+ "eval_runtime": 54.6174,
297
+ "eval_samples_per_second": 0.915,
298
+ "eval_steps_per_second": 0.238,
299
+ "step": 112
300
+ },
301
+ {
302
+ "epoch": 8.21,
303
+ "eval_loss": 0.14575307071208954,
304
+ "eval_runtime": 54.6366,
305
+ "eval_samples_per_second": 0.915,
306
+ "eval_steps_per_second": 0.238,
307
+ "step": 116
308
+ },
309
+ {
310
+ "epoch": 8.5,
311
+ "learning_rate": 8.181818181818183e-05,
312
+ "loss": 0.1399,
313
+ "step": 120
314
+ },
315
+ {
316
+ "epoch": 8.5,
317
+ "eval_loss": 0.14450186491012573,
318
+ "eval_runtime": 54.6303,
319
+ "eval_samples_per_second": 0.915,
320
+ "eval_steps_per_second": 0.238,
321
+ "step": 120
322
+ },
323
+ {
324
+ "epoch": 8.78,
325
+ "eval_loss": 0.1431863009929657,
326
+ "eval_runtime": 54.6358,
327
+ "eval_samples_per_second": 0.915,
328
+ "eval_steps_per_second": 0.238,
329
+ "step": 124
330
+ },
331
+ {
332
+ "epoch": 9.06,
333
+ "eval_loss": 0.1424635797739029,
334
+ "eval_runtime": 54.6449,
335
+ "eval_samples_per_second": 0.915,
336
+ "eval_steps_per_second": 0.238,
337
+ "step": 128
338
+ },
339
+ {
340
+ "epoch": 9.2,
341
+ "learning_rate": 7.272727272727273e-05,
342
+ "loss": 0.1357,
343
+ "step": 130
344
+ },
345
+ {
346
+ "epoch": 9.35,
347
+ "eval_loss": 0.14175941050052643,
348
+ "eval_runtime": 54.6307,
349
+ "eval_samples_per_second": 0.915,
350
+ "eval_steps_per_second": 0.238,
351
+ "step": 132
352
+ },
353
+ {
354
+ "epoch": 9.63,
355
+ "eval_loss": 0.14105737209320068,
356
+ "eval_runtime": 54.6121,
357
+ "eval_samples_per_second": 0.916,
358
+ "eval_steps_per_second": 0.238,
359
+ "step": 136
360
+ },
361
+ {
362
+ "epoch": 9.91,
363
+ "learning_rate": 6.363636363636364e-05,
364
+ "loss": 0.1322,
365
+ "step": 140
366
+ },
367
+ {
368
+ "epoch": 9.91,
369
+ "eval_loss": 0.14027251303195953,
370
+ "eval_runtime": 54.6594,
371
+ "eval_samples_per_second": 0.915,
372
+ "eval_steps_per_second": 0.238,
373
+ "step": 140
374
+ },
375
+ {
376
+ "epoch": 10.19,
377
+ "eval_loss": 0.13963991403579712,
378
+ "eval_runtime": 54.6378,
379
+ "eval_samples_per_second": 0.915,
380
+ "eval_steps_per_second": 0.238,
381
+ "step": 144
382
+ },
383
+ {
384
+ "epoch": 10.48,
385
+ "eval_loss": 0.138994500041008,
386
+ "eval_runtime": 54.6115,
387
+ "eval_samples_per_second": 0.916,
388
+ "eval_steps_per_second": 0.238,
389
+ "step": 148
390
+ },
391
+ {
392
+ "epoch": 10.62,
393
+ "learning_rate": 5.4545454545454546e-05,
394
+ "loss": 0.1355,
395
+ "step": 150
396
+ },
397
+ {
398
+ "epoch": 10.76,
399
+ "eval_loss": 0.13857363164424896,
400
+ "eval_runtime": 54.6622,
401
+ "eval_samples_per_second": 0.915,
402
+ "eval_steps_per_second": 0.238,
403
+ "step": 152
404
+ },
405
+ {
406
+ "epoch": 11.04,
407
+ "eval_loss": 0.13809233903884888,
408
+ "eval_runtime": 54.7824,
409
+ "eval_samples_per_second": 0.913,
410
+ "eval_steps_per_second": 0.237,
411
+ "step": 156
412
+ },
413
+ {
414
+ "epoch": 11.33,
415
+ "learning_rate": 4.545454545454546e-05,
416
+ "loss": 0.1216,
417
+ "step": 160
418
+ },
419
+ {
420
+ "epoch": 11.33,
421
+ "eval_loss": 0.137764573097229,
422
+ "eval_runtime": 54.6049,
423
+ "eval_samples_per_second": 0.916,
424
+ "eval_steps_per_second": 0.238,
425
+ "step": 160
426
+ },
427
+ {
428
+ "epoch": 11.61,
429
+ "eval_loss": 0.1369408369064331,
430
+ "eval_runtime": 54.6188,
431
+ "eval_samples_per_second": 0.915,
432
+ "eval_steps_per_second": 0.238,
433
+ "step": 164
434
+ },
435
+ {
436
+ "epoch": 11.89,
437
+ "eval_loss": 0.13684938848018646,
438
+ "eval_runtime": 54.5949,
439
+ "eval_samples_per_second": 0.916,
440
+ "eval_steps_per_second": 0.238,
441
+ "step": 168
442
+ }
443
+ ],
444
+ "logging_steps": 10,
445
+ "max_steps": 210,
446
+ "num_input_tokens_seen": 0,
447
+ "num_train_epochs": 15,
448
+ "save_steps": 12,
449
+ "total_flos": 5.5312272527405875e+17,
450
+ "train_batch_size": 4,
451
+ "trial_name": null,
452
+ "trial_params": null
453
+ }
checkpoint-168/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85310c54a4f279d40e8badbc8f6f7406b57e15fc0c79500de525827feedf5072
3
+ size 4219
checkpoint-180/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "codellama/CodeLlama-7b-hf",
3
+ "bias": "none",
4
+ "enable_lora": null,
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "lora_alpha": 16,
9
+ "lora_dropout": 0.05,
10
+ "merge_weights": false,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 8,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "v_proj"
17
+ ],
18
+ "task_type": "CAUSAL_LM"
19
+ }
checkpoint-180/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c95dfb105eaac0d6c8bb0b14fd9dec487ef6ae0edc512eb12b252acf23d68f22
3
+ size 16822989
checkpoint-180/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69ba420574b9c6ae2287a0258e5589fab466b8f7216ef5cb88e6fe3cca6fb5f0
3
+ size 33661637
checkpoint-180/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd2b9206f02d01ef941455d7a45d7d9abfec382e1421f948275501d070528ef3
3
+ size 14575
checkpoint-180/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:663213d2cc9264a848db29de55c993220bb66d0dd73cf39dcd0d40d0736d6265
3
+ size 627
checkpoint-180/trainer_state.json ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.13553684949874878,
3
+ "best_model_checkpoint": "./lora-out/checkpoint-180",
4
+ "epoch": 12.743362831858407,
5
+ "eval_steps": 4,
6
+ "global_step": 180,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.28,
13
+ "eval_loss": 0.3006412982940674,
14
+ "eval_runtime": 54.7333,
15
+ "eval_samples_per_second": 0.914,
16
+ "eval_steps_per_second": 0.238,
17
+ "step": 4
18
+ },
19
+ {
20
+ "epoch": 0.57,
21
+ "eval_loss": 0.300335168838501,
22
+ "eval_runtime": 54.8113,
23
+ "eval_samples_per_second": 0.912,
24
+ "eval_steps_per_second": 0.237,
25
+ "step": 8
26
+ },
27
+ {
28
+ "epoch": 0.71,
29
+ "learning_rate": 1e-05,
30
+ "loss": 0.3024,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.85,
35
+ "eval_loss": 0.2993900179862976,
36
+ "eval_runtime": 54.782,
37
+ "eval_samples_per_second": 0.913,
38
+ "eval_steps_per_second": 0.237,
39
+ "step": 12
40
+ },
41
+ {
42
+ "epoch": 1.13,
43
+ "eval_loss": 0.29816487431526184,
44
+ "eval_runtime": 54.8049,
45
+ "eval_samples_per_second": 0.912,
46
+ "eval_steps_per_second": 0.237,
47
+ "step": 16
48
+ },
49
+ {
50
+ "epoch": 1.42,
51
+ "learning_rate": 2e-05,
52
+ "loss": 0.3035,
53
+ "step": 20
54
+ },
55
+ {
56
+ "epoch": 1.42,
57
+ "eval_loss": 0.29595255851745605,
58
+ "eval_runtime": 54.7885,
59
+ "eval_samples_per_second": 0.913,
60
+ "eval_steps_per_second": 0.237,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 1.7,
65
+ "eval_loss": 0.2939557135105133,
66
+ "eval_runtime": 54.7999,
67
+ "eval_samples_per_second": 0.912,
68
+ "eval_steps_per_second": 0.237,
69
+ "step": 24
70
+ },
71
+ {
72
+ "epoch": 1.98,
73
+ "eval_loss": 0.29013773798942566,
74
+ "eval_runtime": 54.7805,
75
+ "eval_samples_per_second": 0.913,
76
+ "eval_steps_per_second": 0.237,
77
+ "step": 28
78
+ },
79
+ {
80
+ "epoch": 2.12,
81
+ "learning_rate": 3e-05,
82
+ "loss": 0.2959,
83
+ "step": 30
84
+ },
85
+ {
86
+ "epoch": 2.27,
87
+ "eval_loss": 0.28251081705093384,
88
+ "eval_runtime": 54.7706,
89
+ "eval_samples_per_second": 0.913,
90
+ "eval_steps_per_second": 0.237,
91
+ "step": 32
92
+ },
93
+ {
94
+ "epoch": 2.55,
95
+ "eval_loss": 0.2771329879760742,
96
+ "eval_runtime": 54.7818,
97
+ "eval_samples_per_second": 0.913,
98
+ "eval_steps_per_second": 0.237,
99
+ "step": 36
100
+ },
101
+ {
102
+ "epoch": 2.83,
103
+ "learning_rate": 4e-05,
104
+ "loss": 0.284,
105
+ "step": 40
106
+ },
107
+ {
108
+ "epoch": 2.83,
109
+ "eval_loss": 0.27146488428115845,
110
+ "eval_runtime": 54.803,
111
+ "eval_samples_per_second": 0.912,
112
+ "eval_steps_per_second": 0.237,
113
+ "step": 40
114
+ },
115
+ {
116
+ "epoch": 3.12,
117
+ "eval_loss": 0.26464152336120605,
118
+ "eval_runtime": 54.8467,
119
+ "eval_samples_per_second": 0.912,
120
+ "eval_steps_per_second": 0.237,
121
+ "step": 44
122
+ },
123
+ {
124
+ "epoch": 3.4,
125
+ "eval_loss": 0.25653430819511414,
126
+ "eval_runtime": 54.8327,
127
+ "eval_samples_per_second": 0.912,
128
+ "eval_steps_per_second": 0.237,
129
+ "step": 48
130
+ },
131
+ {
132
+ "epoch": 3.54,
133
+ "learning_rate": 5e-05,
134
+ "loss": 0.263,
135
+ "step": 50
136
+ },
137
+ {
138
+ "epoch": 3.68,
139
+ "eval_loss": 0.24627122282981873,
140
+ "eval_runtime": 54.813,
141
+ "eval_samples_per_second": 0.912,
142
+ "eval_steps_per_second": 0.237,
143
+ "step": 52
144
+ },
145
+ {
146
+ "epoch": 3.96,
147
+ "eval_loss": 0.23474617302417755,
148
+ "eval_runtime": 54.7901,
149
+ "eval_samples_per_second": 0.913,
150
+ "eval_steps_per_second": 0.237,
151
+ "step": 56
152
+ },
153
+ {
154
+ "epoch": 4.25,
155
+ "learning_rate": 6e-05,
156
+ "loss": 0.241,
157
+ "step": 60
158
+ },
159
+ {
160
+ "epoch": 4.25,
161
+ "eval_loss": 0.2220366895198822,
162
+ "eval_runtime": 54.7983,
163
+ "eval_samples_per_second": 0.912,
164
+ "eval_steps_per_second": 0.237,
165
+ "step": 60
166
+ },
167
+ {
168
+ "epoch": 4.53,
169
+ "eval_loss": 0.20926769077777863,
170
+ "eval_runtime": 54.7403,
171
+ "eval_samples_per_second": 0.913,
172
+ "eval_steps_per_second": 0.237,
173
+ "step": 64
174
+ },
175
+ {
176
+ "epoch": 4.81,
177
+ "eval_loss": 0.19629451632499695,
178
+ "eval_runtime": 54.7525,
179
+ "eval_samples_per_second": 0.913,
180
+ "eval_steps_per_second": 0.237,
181
+ "step": 68
182
+ },
183
+ {
184
+ "epoch": 4.96,
185
+ "learning_rate": 7e-05,
186
+ "loss": 0.2101,
187
+ "step": 70
188
+ },
189
+ {
190
+ "epoch": 5.1,
191
+ "eval_loss": 0.18524658679962158,
192
+ "eval_runtime": 54.7303,
193
+ "eval_samples_per_second": 0.914,
194
+ "eval_steps_per_second": 0.238,
195
+ "step": 72
196
+ },
197
+ {
198
+ "epoch": 5.38,
199
+ "eval_loss": 0.17731742560863495,
200
+ "eval_runtime": 54.7552,
201
+ "eval_samples_per_second": 0.913,
202
+ "eval_steps_per_second": 0.237,
203
+ "step": 76
204
+ },
205
+ {
206
+ "epoch": 5.66,
207
+ "learning_rate": 8e-05,
208
+ "loss": 0.1788,
209
+ "step": 80
210
+ },
211
+ {
212
+ "epoch": 5.66,
213
+ "eval_loss": 0.16993452608585358,
214
+ "eval_runtime": 54.729,
215
+ "eval_samples_per_second": 0.914,
216
+ "eval_steps_per_second": 0.238,
217
+ "step": 80
218
+ },
219
+ {
220
+ "epoch": 5.95,
221
+ "eval_loss": 0.164781853556633,
222
+ "eval_runtime": 54.741,
223
+ "eval_samples_per_second": 0.913,
224
+ "eval_steps_per_second": 0.237,
225
+ "step": 84
226
+ },
227
+ {
228
+ "epoch": 6.23,
229
+ "eval_loss": 0.16103117167949677,
230
+ "eval_runtime": 54.7837,
231
+ "eval_samples_per_second": 0.913,
232
+ "eval_steps_per_second": 0.237,
233
+ "step": 88
234
+ },
235
+ {
236
+ "epoch": 6.37,
237
+ "learning_rate": 9e-05,
238
+ "loss": 0.1615,
239
+ "step": 90
240
+ },
241
+ {
242
+ "epoch": 6.51,
243
+ "eval_loss": 0.15781742334365845,
244
+ "eval_runtime": 54.7138,
245
+ "eval_samples_per_second": 0.914,
246
+ "eval_steps_per_second": 0.238,
247
+ "step": 92
248
+ },
249
+ {
250
+ "epoch": 6.8,
251
+ "eval_loss": 0.15516981482505798,
252
+ "eval_runtime": 54.7516,
253
+ "eval_samples_per_second": 0.913,
254
+ "eval_steps_per_second": 0.237,
255
+ "step": 96
256
+ },
257
+ {
258
+ "epoch": 7.08,
259
+ "learning_rate": 0.0001,
260
+ "loss": 0.1533,
261
+ "step": 100
262
+ },
263
+ {
264
+ "epoch": 7.08,
265
+ "eval_loss": 0.15261690318584442,
266
+ "eval_runtime": 54.6891,
267
+ "eval_samples_per_second": 0.914,
268
+ "eval_steps_per_second": 0.238,
269
+ "step": 100
270
+ },
271
+ {
272
+ "epoch": 7.36,
273
+ "eval_loss": 0.15066812932491302,
274
+ "eval_runtime": 54.6884,
275
+ "eval_samples_per_second": 0.914,
276
+ "eval_steps_per_second": 0.238,
277
+ "step": 104
278
+ },
279
+ {
280
+ "epoch": 7.65,
281
+ "eval_loss": 0.14893724024295807,
282
+ "eval_runtime": 54.6275,
283
+ "eval_samples_per_second": 0.915,
284
+ "eval_steps_per_second": 0.238,
285
+ "step": 108
286
+ },
287
+ {
288
+ "epoch": 7.79,
289
+ "learning_rate": 9.090909090909092e-05,
290
+ "loss": 0.1463,
291
+ "step": 110
292
+ },
293
+ {
294
+ "epoch": 7.93,
295
+ "eval_loss": 0.14742153882980347,
296
+ "eval_runtime": 54.6174,
297
+ "eval_samples_per_second": 0.915,
298
+ "eval_steps_per_second": 0.238,
299
+ "step": 112
300
+ },
301
+ {
302
+ "epoch": 8.21,
303
+ "eval_loss": 0.14575307071208954,
304
+ "eval_runtime": 54.6366,
305
+ "eval_samples_per_second": 0.915,
306
+ "eval_steps_per_second": 0.238,
307
+ "step": 116
308
+ },
309
+ {
310
+ "epoch": 8.5,
311
+ "learning_rate": 8.181818181818183e-05,
312
+ "loss": 0.1399,
313
+ "step": 120
314
+ },
315
+ {
316
+ "epoch": 8.5,
317
+ "eval_loss": 0.14450186491012573,
318
+ "eval_runtime": 54.6303,
319
+ "eval_samples_per_second": 0.915,
320
+ "eval_steps_per_second": 0.238,
321
+ "step": 120
322
+ },
323
+ {
324
+ "epoch": 8.78,
325
+ "eval_loss": 0.1431863009929657,
326
+ "eval_runtime": 54.6358,
327
+ "eval_samples_per_second": 0.915,
328
+ "eval_steps_per_second": 0.238,
329
+ "step": 124
330
+ },
331
+ {
332
+ "epoch": 9.06,
333
+ "eval_loss": 0.1424635797739029,
334
+ "eval_runtime": 54.6449,
335
+ "eval_samples_per_second": 0.915,
336
+ "eval_steps_per_second": 0.238,
337
+ "step": 128
338
+ },
339
+ {
340
+ "epoch": 9.2,
341
+ "learning_rate": 7.272727272727273e-05,
342
+ "loss": 0.1357,
343
+ "step": 130
344
+ },
345
+ {
346
+ "epoch": 9.35,
347
+ "eval_loss": 0.14175941050052643,
348
+ "eval_runtime": 54.6307,
349
+ "eval_samples_per_second": 0.915,
350
+ "eval_steps_per_second": 0.238,
351
+ "step": 132
352
+ },
353
+ {
354
+ "epoch": 9.63,
355
+ "eval_loss": 0.14105737209320068,
356
+ "eval_runtime": 54.6121,
357
+ "eval_samples_per_second": 0.916,
358
+ "eval_steps_per_second": 0.238,
359
+ "step": 136
360
+ },
361
+ {
362
+ "epoch": 9.91,
363
+ "learning_rate": 6.363636363636364e-05,
364
+ "loss": 0.1322,
365
+ "step": 140
366
+ },
367
+ {
368
+ "epoch": 9.91,
369
+ "eval_loss": 0.14027251303195953,
370
+ "eval_runtime": 54.6594,
371
+ "eval_samples_per_second": 0.915,
372
+ "eval_steps_per_second": 0.238,
373
+ "step": 140
374
+ },
375
+ {
376
+ "epoch": 10.19,
377
+ "eval_loss": 0.13963991403579712,
378
+ "eval_runtime": 54.6378,
379
+ "eval_samples_per_second": 0.915,
380
+ "eval_steps_per_second": 0.238,
381
+ "step": 144
382
+ },
383
+ {
384
+ "epoch": 10.48,
385
+ "eval_loss": 0.138994500041008,
386
+ "eval_runtime": 54.6115,
387
+ "eval_samples_per_second": 0.916,
388
+ "eval_steps_per_second": 0.238,
389
+ "step": 148
390
+ },
391
+ {
392
+ "epoch": 10.62,
393
+ "learning_rate": 5.4545454545454546e-05,
394
+ "loss": 0.1355,
395
+ "step": 150
396
+ },
397
+ {
398
+ "epoch": 10.76,
399
+ "eval_loss": 0.13857363164424896,
400
+ "eval_runtime": 54.6622,
401
+ "eval_samples_per_second": 0.915,
402
+ "eval_steps_per_second": 0.238,
403
+ "step": 152
404
+ },
405
+ {
406
+ "epoch": 11.04,
407
+ "eval_loss": 0.13809233903884888,
408
+ "eval_runtime": 54.7824,
409
+ "eval_samples_per_second": 0.913,
410
+ "eval_steps_per_second": 0.237,
411
+ "step": 156
412
+ },
413
+ {
414
+ "epoch": 11.33,
415
+ "learning_rate": 4.545454545454546e-05,
416
+ "loss": 0.1216,
417
+ "step": 160
418
+ },
419
+ {
420
+ "epoch": 11.33,
421
+ "eval_loss": 0.137764573097229,
422
+ "eval_runtime": 54.6049,
423
+ "eval_samples_per_second": 0.916,
424
+ "eval_steps_per_second": 0.238,
425
+ "step": 160
426
+ },
427
+ {
428
+ "epoch": 11.61,
429
+ "eval_loss": 0.1369408369064331,
430
+ "eval_runtime": 54.6188,
431
+ "eval_samples_per_second": 0.915,
432
+ "eval_steps_per_second": 0.238,
433
+ "step": 164
434
+ },
435
+ {
436
+ "epoch": 11.89,
437
+ "eval_loss": 0.13684938848018646,
438
+ "eval_runtime": 54.5949,
439
+ "eval_samples_per_second": 0.916,
440
+ "eval_steps_per_second": 0.238,
441
+ "step": 168
442
+ },
443
+ {
444
+ "epoch": 12.04,
445
+ "learning_rate": 3.6363636363636364e-05,
446
+ "loss": 0.1265,
447
+ "step": 170
448
+ },
449
+ {
450
+ "epoch": 12.18,
451
+ "eval_loss": 0.1366124004125595,
452
+ "eval_runtime": 54.5928,
453
+ "eval_samples_per_second": 0.916,
454
+ "eval_steps_per_second": 0.238,
455
+ "step": 172
456
+ },
457
+ {
458
+ "epoch": 12.46,
459
+ "eval_loss": 0.1361435353755951,
460
+ "eval_runtime": 54.6703,
461
+ "eval_samples_per_second": 0.915,
462
+ "eval_steps_per_second": 0.238,
463
+ "step": 176
464
+ },
465
+ {
466
+ "epoch": 12.74,
467
+ "learning_rate": 2.7272727272727273e-05,
468
+ "loss": 0.127,
469
+ "step": 180
470
+ },
471
+ {
472
+ "epoch": 12.74,
473
+ "eval_loss": 0.13553684949874878,
474
+ "eval_runtime": 54.6232,
475
+ "eval_samples_per_second": 0.915,
476
+ "eval_steps_per_second": 0.238,
477
+ "step": 180
478
+ }
479
+ ],
480
+ "logging_steps": 10,
481
+ "max_steps": 210,
482
+ "num_input_tokens_seen": 0,
483
+ "num_train_epochs": 15,
484
+ "save_steps": 12,
485
+ "total_flos": 5.92723796905427e+17,
486
+ "train_batch_size": 4,
487
+ "trial_name": null,
488
+ "trial_params": null
489
+ }
checkpoint-180/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85310c54a4f279d40e8badbc8f6f7406b57e15fc0c79500de525827feedf5072
3
+ size 4219
checkpoint-192/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_model_name_or_path": "codellama/CodeLlama-7b-hf",
3
+ "bias": "none",
4
+ "enable_lora": null,
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "lora_alpha": 16,
9
+ "lora_dropout": 0.05,
10
+ "merge_weights": false,
11
+ "modules_to_save": null,
12
+ "peft_type": "LORA",
13
+ "r": 8,
14
+ "target_modules": [
15
+ "q_proj",
16
+ "v_proj"
17
+ ],
18
+ "task_type": "CAUSAL_LM"
19
+ }
checkpoint-192/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:186553e3e151750f19415361042f6320798f792ea25170393fb7d9d303db2e9b
3
+ size 16822989
checkpoint-192/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba4baee80fbebd6460e2b608d5943806126c8066db270091f474a8d0c934955
3
+ size 33661637
checkpoint-192/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18cf619a2055041e7682d6cee01f711221e08a105f2a6e3ef7212b90e297fcc9
3
+ size 14575
checkpoint-192/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a09b08b1ed9181d6c79efd221da3dbaf667487db63e0278a59e8107ee4a6b5a
3
+ size 627
checkpoint-192/trainer_state.json ADDED
@@ -0,0 +1,519 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.1348796784877777,
3
+ "best_model_checkpoint": "./lora-out/checkpoint-192",
4
+ "epoch": 13.5929203539823,
5
+ "eval_steps": 4,
6
+ "global_step": 192,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.28,
13
+ "eval_loss": 0.3006412982940674,
14
+ "eval_runtime": 54.7333,
15
+ "eval_samples_per_second": 0.914,
16
+ "eval_steps_per_second": 0.238,
17
+ "step": 4
18
+ },
19
+ {
20
+ "epoch": 0.57,
21
+ "eval_loss": 0.300335168838501,
22
+ "eval_runtime": 54.8113,
23
+ "eval_samples_per_second": 0.912,
24
+ "eval_steps_per_second": 0.237,
25
+ "step": 8
26
+ },
27
+ {
28
+ "epoch": 0.71,
29
+ "learning_rate": 1e-05,
30
+ "loss": 0.3024,
31
+ "step": 10
32
+ },
33
+ {
34
+ "epoch": 0.85,
35
+ "eval_loss": 0.2993900179862976,
36
+ "eval_runtime": 54.782,
37
+ "eval_samples_per_second": 0.913,
38
+ "eval_steps_per_second": 0.237,
39
+ "step": 12
40
+ },
41
+ {
42
+ "epoch": 1.13,
43
+ "eval_loss": 0.29816487431526184,
44
+ "eval_runtime": 54.8049,
45
+ "eval_samples_per_second": 0.912,
46
+ "eval_steps_per_second": 0.237,
47
+ "step": 16
48
+ },
49
+ {
50
+ "epoch": 1.42,
51
+ "learning_rate": 2e-05,
52
+ "loss": 0.3035,
53
+ "step": 20
54
+ },
55
+ {
56
+ "epoch": 1.42,
57
+ "eval_loss": 0.29595255851745605,
58
+ "eval_runtime": 54.7885,
59
+ "eval_samples_per_second": 0.913,
60
+ "eval_steps_per_second": 0.237,
61
+ "step": 20
62
+ },
63
+ {
64
+ "epoch": 1.7,
65
+ "eval_loss": 0.2939557135105133,
66
+ "eval_runtime": 54.7999,
67
+ "eval_samples_per_second": 0.912,
68
+ "eval_steps_per_second": 0.237,
69
+ "step": 24
70
+ },
71
+ {
72
+ "epoch": 1.98,
73
+ "eval_loss": 0.29013773798942566,
74
+ "eval_runtime": 54.7805,
75
+ "eval_samples_per_second": 0.913,
76
+ "eval_steps_per_second": 0.237,
77
+ "step": 28
78
+ },
79
+ {
80
+ "epoch": 2.12,
81
+ "learning_rate": 3e-05,
82
+ "loss": 0.2959,
83
+ "step": 30
84
+ },
85
+ {
86
+ "epoch": 2.27,
87
+ "eval_loss": 0.28251081705093384,
88
+ "eval_runtime": 54.7706,
89
+ "eval_samples_per_second": 0.913,
90
+ "eval_steps_per_second": 0.237,
91
+ "step": 32
92
+ },
93
+ {
94
+ "epoch": 2.55,
95
+ "eval_loss": 0.2771329879760742,
96
+ "eval_runtime": 54.7818,
97
+ "eval_samples_per_second": 0.913,
98
+ "eval_steps_per_second": 0.237,
99
+ "step": 36
100
+ },
101
+ {
102
+ "epoch": 2.83,
103
+ "learning_rate": 4e-05,
104
+ "loss": 0.284,
105
+ "step": 40
106
+ },
107
+ {
108
+ "epoch": 2.83,
109
+ "eval_loss": 0.27146488428115845,
110
+ "eval_runtime": 54.803,
111
+ "eval_samples_per_second": 0.912,
112
+ "eval_steps_per_second": 0.237,
113
+ "step": 40
114
+ },
115
+ {
116
+ "epoch": 3.12,
117
+ "eval_loss": 0.26464152336120605,
118
+ "eval_runtime": 54.8467,
119
+ "eval_samples_per_second": 0.912,
120
+ "eval_steps_per_second": 0.237,
121
+ "step": 44
122
+ },
123
+ {
124
+ "epoch": 3.4,
125
+ "eval_loss": 0.25653430819511414,
126
+ "eval_runtime": 54.8327,
127
+ "eval_samples_per_second": 0.912,
128
+ "eval_steps_per_second": 0.237,
129
+ "step": 48
130
+ },
131
+ {
132
+ "epoch": 3.54,
133
+ "learning_rate": 5e-05,
134
+ "loss": 0.263,
135
+ "step": 50
136
+ },
137
+ {
138
+ "epoch": 3.68,
139
+ "eval_loss": 0.24627122282981873,
140
+ "eval_runtime": 54.813,
141
+ "eval_samples_per_second": 0.912,
142
+ "eval_steps_per_second": 0.237,
143
+ "step": 52
144
+ },
145
+ {
146
+ "epoch": 3.96,
147
+ "eval_loss": 0.23474617302417755,
148
+ "eval_runtime": 54.7901,
149
+ "eval_samples_per_second": 0.913,
150
+ "eval_steps_per_second": 0.237,
151
+ "step": 56
152
+ },
153
+ {
154
+ "epoch": 4.25,
155
+ "learning_rate": 6e-05,
156
+ "loss": 0.241,
157
+ "step": 60
158
+ },
159
+ {
160
+ "epoch": 4.25,
161
+ "eval_loss": 0.2220366895198822,
162
+ "eval_runtime": 54.7983,
163
+ "eval_samples_per_second": 0.912,
164
+ "eval_steps_per_second": 0.237,
165
+ "step": 60
166
+ },
167
+ {
168
+ "epoch": 4.53,
169
+ "eval_loss": 0.20926769077777863,
170
+ "eval_runtime": 54.7403,
171
+ "eval_samples_per_second": 0.913,
172
+ "eval_steps_per_second": 0.237,
173
+ "step": 64
174
+ },
175
+ {
176
+ "epoch": 4.81,
177
+ "eval_loss": 0.19629451632499695,
178
+ "eval_runtime": 54.7525,
179
+ "eval_samples_per_second": 0.913,
180
+ "eval_steps_per_second": 0.237,
181
+ "step": 68
182
+ },
183
+ {
184
+ "epoch": 4.96,
185
+ "learning_rate": 7e-05,
186
+ "loss": 0.2101,
187
+ "step": 70
188
+ },
189
+ {
190
+ "epoch": 5.1,
191
+ "eval_loss": 0.18524658679962158,
192
+ "eval_runtime": 54.7303,
193
+ "eval_samples_per_second": 0.914,
194
+ "eval_steps_per_second": 0.238,
195
+ "step": 72
196
+ },
197
+ {
198
+ "epoch": 5.38,
199
+ "eval_loss": 0.17731742560863495,
200
+ "eval_runtime": 54.7552,
201
+ "eval_samples_per_second": 0.913,
202
+ "eval_steps_per_second": 0.237,
203
+ "step": 76
204
+ },
205
+ {
206
+ "epoch": 5.66,
207
+ "learning_rate": 8e-05,
208
+ "loss": 0.1788,
209
+ "step": 80
210
+ },
211
+ {
212
+ "epoch": 5.66,
213
+ "eval_loss": 0.16993452608585358,
214
+ "eval_runtime": 54.729,
215
+ "eval_samples_per_second": 0.914,
216
+ "eval_steps_per_second": 0.238,
217
+ "step": 80
218
+ },
219
+ {
220
+ "epoch": 5.95,
221
+ "eval_loss": 0.164781853556633,
222
+ "eval_runtime": 54.741,
223
+ "eval_samples_per_second": 0.913,
224
+ "eval_steps_per_second": 0.237,
225
+ "step": 84
226
+ },
227
+ {
228
+ "epoch": 6.23,
229
+ "eval_loss": 0.16103117167949677,
230
+ "eval_runtime": 54.7837,
231
+ "eval_samples_per_second": 0.913,
232
+ "eval_steps_per_second": 0.237,
233
+ "step": 88
234
+ },
235
+ {
236
+ "epoch": 6.37,
237
+ "learning_rate": 9e-05,
238
+ "loss": 0.1615,
239
+ "step": 90
240
+ },
241
+ {
242
+ "epoch": 6.51,
243
+ "eval_loss": 0.15781742334365845,
244
+ "eval_runtime": 54.7138,
245
+ "eval_samples_per_second": 0.914,
246
+ "eval_steps_per_second": 0.238,
247
+ "step": 92
248
+ },
249
+ {
250
+ "epoch": 6.8,
251
+ "eval_loss": 0.15516981482505798,
252
+ "eval_runtime": 54.7516,
253
+ "eval_samples_per_second": 0.913,
254
+ "eval_steps_per_second": 0.237,
255
+ "step": 96
256
+ },
257
+ {
258
+ "epoch": 7.08,
259
+ "learning_rate": 0.0001,
260
+ "loss": 0.1533,
261
+ "step": 100
262
+ },
263
+ {
264
+ "epoch": 7.08,
265
+ "eval_loss": 0.15261690318584442,
266
+ "eval_runtime": 54.6891,
267
+ "eval_samples_per_second": 0.914,
268
+ "eval_steps_per_second": 0.238,
269
+ "step": 100
270
+ },
271
+ {
272
+ "epoch": 7.36,
273
+ "eval_loss": 0.15066812932491302,
274
+ "eval_runtime": 54.6884,
275
+ "eval_samples_per_second": 0.914,
276
+ "eval_steps_per_second": 0.238,
277
+ "step": 104
278
+ },
279
+ {
280
+ "epoch": 7.65,
281
+ "eval_loss": 0.14893724024295807,
282
+ "eval_runtime": 54.6275,
283
+ "eval_samples_per_second": 0.915,
284
+ "eval_steps_per_second": 0.238,
285
+ "step": 108
286
+ },
287
+ {
288
+ "epoch": 7.79,
289
+ "learning_rate": 9.090909090909092e-05,
290
+ "loss": 0.1463,
291
+ "step": 110
292
+ },
293
+ {
294
+ "epoch": 7.93,
295
+ "eval_loss": 0.14742153882980347,
296
+ "eval_runtime": 54.6174,
297
+ "eval_samples_per_second": 0.915,
298
+ "eval_steps_per_second": 0.238,
299
+ "step": 112
300
+ },
301
+ {
302
+ "epoch": 8.21,
303
+ "eval_loss": 0.14575307071208954,
304
+ "eval_runtime": 54.6366,
305
+ "eval_samples_per_second": 0.915,
306
+ "eval_steps_per_second": 0.238,
307
+ "step": 116
308
+ },
309
+ {
310
+ "epoch": 8.5,
311
+ "learning_rate": 8.181818181818183e-05,
312
+ "loss": 0.1399,
313
+ "step": 120
314
+ },
315
+ {
316
+ "epoch": 8.5,
317
+ "eval_loss": 0.14450186491012573,
318
+ "eval_runtime": 54.6303,
319
+ "eval_samples_per_second": 0.915,
320
+ "eval_steps_per_second": 0.238,
321
+ "step": 120
322
+ },
323
+ {
324
+ "epoch": 8.78,
325
+ "eval_loss": 0.1431863009929657,
326
+ "eval_runtime": 54.6358,
327
+ "eval_samples_per_second": 0.915,
328
+ "eval_steps_per_second": 0.238,
329
+ "step": 124
330
+ },
331
+ {
332
+ "epoch": 9.06,
333
+ "eval_loss": 0.1424635797739029,
334
+ "eval_runtime": 54.6449,
335
+ "eval_samples_per_second": 0.915,
336
+ "eval_steps_per_second": 0.238,
337
+ "step": 128
338
+ },
339
+ {
340
+ "epoch": 9.2,
341
+ "learning_rate": 7.272727272727273e-05,
342
+ "loss": 0.1357,
343
+ "step": 130
344
+ },
345
+ {
346
+ "epoch": 9.35,
347
+ "eval_loss": 0.14175941050052643,
348
+ "eval_runtime": 54.6307,
349
+ "eval_samples_per_second": 0.915,
350
+ "eval_steps_per_second": 0.238,
351
+ "step": 132
352
+ },
353
+ {
354
+ "epoch": 9.63,
355
+ "eval_loss": 0.14105737209320068,
356
+ "eval_runtime": 54.6121,
357
+ "eval_samples_per_second": 0.916,
358
+ "eval_steps_per_second": 0.238,
359
+ "step": 136
360
+ },
361
+ {
362
+ "epoch": 9.91,
363
+ "learning_rate": 6.363636363636364e-05,
364
+ "loss": 0.1322,
365
+ "step": 140
366
+ },
367
+ {
368
+ "epoch": 9.91,
369
+ "eval_loss": 0.14027251303195953,
370
+ "eval_runtime": 54.6594,
371
+ "eval_samples_per_second": 0.915,
372
+ "eval_steps_per_second": 0.238,
373
+ "step": 140
374
+ },
375
+ {
376
+ "epoch": 10.19,
377
+ "eval_loss": 0.13963991403579712,
378
+ "eval_runtime": 54.6378,
379
+ "eval_samples_per_second": 0.915,
380
+ "eval_steps_per_second": 0.238,
381
+ "step": 144
382
+ },
383
+ {
384
+ "epoch": 10.48,
385
+ "eval_loss": 0.138994500041008,
386
+ "eval_runtime": 54.6115,
387
+ "eval_samples_per_second": 0.916,
388
+ "eval_steps_per_second": 0.238,
389
+ "step": 148
390
+ },
391
+ {
392
+ "epoch": 10.62,
393
+ "learning_rate": 5.4545454545454546e-05,
394
+ "loss": 0.1355,
395
+ "step": 150
396
+ },
397
+ {
398
+ "epoch": 10.76,
399
+ "eval_loss": 0.13857363164424896,
400
+ "eval_runtime": 54.6622,
401
+ "eval_samples_per_second": 0.915,
402
+ "eval_steps_per_second": 0.238,
403
+ "step": 152
404
+ },
405
+ {
406
+ "epoch": 11.04,
407
+ "eval_loss": 0.13809233903884888,
408
+ "eval_runtime": 54.7824,
409
+ "eval_samples_per_second": 0.913,
410
+ "eval_steps_per_second": 0.237,
411
+ "step": 156
412
+ },
413
+ {
414
+ "epoch": 11.33,
415
+ "learning_rate": 4.545454545454546e-05,
416
+ "loss": 0.1216,
417
+ "step": 160
418
+ },
419
+ {
420
+ "epoch": 11.33,
421
+ "eval_loss": 0.137764573097229,
422
+ "eval_runtime": 54.6049,
423
+ "eval_samples_per_second": 0.916,
424
+ "eval_steps_per_second": 0.238,
425
+ "step": 160
426
+ },
427
+ {
428
+ "epoch": 11.61,
429
+ "eval_loss": 0.1369408369064331,
430
+ "eval_runtime": 54.6188,
431
+ "eval_samples_per_second": 0.915,
432
+ "eval_steps_per_second": 0.238,
433
+ "step": 164
434
+ },
435
+ {
436
+ "epoch": 11.89,
437
+ "eval_loss": 0.13684938848018646,
438
+ "eval_runtime": 54.5949,
439
+ "eval_samples_per_second": 0.916,
440
+ "eval_steps_per_second": 0.238,
441
+ "step": 168
442
+ },
443
+ {
444
+ "epoch": 12.04,
445
+ "learning_rate": 3.6363636363636364e-05,
446
+ "loss": 0.1265,
447
+ "step": 170
448
+ },
449
+ {
450
+ "epoch": 12.18,
451
+ "eval_loss": 0.1366124004125595,
452
+ "eval_runtime": 54.5928,
453
+ "eval_samples_per_second": 0.916,
454
+ "eval_steps_per_second": 0.238,
455
+ "step": 172
456
+ },
457
+ {
458
+ "epoch": 12.46,
459
+ "eval_loss": 0.1361435353755951,
460
+ "eval_runtime": 54.6703,
461
+ "eval_samples_per_second": 0.915,
462
+ "eval_steps_per_second": 0.238,
463
+ "step": 176
464
+ },
465
+ {
466
+ "epoch": 12.74,
467
+ "learning_rate": 2.7272727272727273e-05,
468
+ "loss": 0.127,
469
+ "step": 180
470
+ },
471
+ {
472
+ "epoch": 12.74,
473
+ "eval_loss": 0.13553684949874878,
474
+ "eval_runtime": 54.6232,
475
+ "eval_samples_per_second": 0.915,
476
+ "eval_steps_per_second": 0.238,
477
+ "step": 180
478
+ },
479
+ {
480
+ "epoch": 13.03,
481
+ "eval_loss": 0.13531364500522614,
482
+ "eval_runtime": 54.7712,
483
+ "eval_samples_per_second": 0.913,
484
+ "eval_steps_per_second": 0.237,
485
+ "step": 184
486
+ },
487
+ {
488
+ "epoch": 13.31,
489
+ "eval_loss": 0.1353050172328949,
490
+ "eval_runtime": 54.8052,
491
+ "eval_samples_per_second": 0.912,
492
+ "eval_steps_per_second": 0.237,
493
+ "step": 188
494
+ },
495
+ {
496
+ "epoch": 13.45,
497
+ "learning_rate": 1.8181818181818182e-05,
498
+ "loss": 0.1233,
499
+ "step": 190
500
+ },
501
+ {
502
+ "epoch": 13.59,
503
+ "eval_loss": 0.1348796784877777,
504
+ "eval_runtime": 54.5607,
505
+ "eval_samples_per_second": 0.916,
506
+ "eval_steps_per_second": 0.238,
507
+ "step": 192
508
+ }
509
+ ],
510
+ "logging_steps": 10,
511
+ "max_steps": 210,
512
+ "num_input_tokens_seen": 0,
513
+ "num_train_epochs": 15,
514
+ "save_steps": 12,
515
+ "total_flos": 6.321566693448745e+17,
516
+ "train_batch_size": 4,
517
+ "trial_name": null,
518
+ "trial_params": null
519
+ }
checkpoint-192/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85310c54a4f279d40e8badbc8f6f7406b57e15fc0c79500de525827feedf5072
3
+ size 4219