trapoom555 committed
Commit 8aaf8ba
1 Parent(s): d453b94

Move checkpoints to another repo

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full changeset.
Files changed (50):
  1. .DS_Store +0 -0
  2. checkpoints/.DS_Store +0 -0
  3. checkpoints/checkpoint-1000/adapter_config.json +0 -29
  4. checkpoints/checkpoint-1000/adapter_model.safetensors +0 -3
  5. checkpoints/checkpoint-1000/optimizer.pt +0 -3
  6. checkpoints/checkpoint-1000/rng_state_0.pth +0 -3
  7. checkpoints/checkpoint-1000/rng_state_1.pth +0 -3
  8. checkpoints/checkpoint-1000/rng_state_2.pth +0 -3
  9. checkpoints/checkpoint-1000/rng_state_3.pth +0 -3
  10. checkpoints/checkpoint-1000/scheduler.pt +0 -3
  11. checkpoints/checkpoint-1000/trainer_state.json +0 -721
  12. checkpoints/checkpoint-1000/training_args.bin +0 -3
  13. checkpoints/checkpoint-10000/adapter_config.json +0 -29
  14. checkpoints/checkpoint-10000/adapter_model.safetensors +0 -3
  15. checkpoints/checkpoint-10000/optimizer.pt +0 -3
  16. checkpoints/checkpoint-10000/rng_state_0.pth +0 -3
  17. checkpoints/checkpoint-10000/rng_state_1.pth +0 -3
  18. checkpoints/checkpoint-10000/rng_state_2.pth +0 -3
  19. checkpoints/checkpoint-10000/rng_state_3.pth +0 -3
  20. checkpoints/checkpoint-10000/scheduler.pt +0 -3
  21. checkpoints/checkpoint-10000/trainer_state.json +0 -0
  22. checkpoints/checkpoint-10000/training_args.bin +0 -3
  23. checkpoints/checkpoint-10500/adapter_config.json +0 -29
  24. checkpoints/checkpoint-10500/adapter_model.safetensors +0 -3
  25. checkpoints/checkpoint-10500/optimizer.pt +0 -3
  26. checkpoints/checkpoint-10500/rng_state_0.pth +0 -3
  27. checkpoints/checkpoint-10500/rng_state_1.pth +0 -3
  28. checkpoints/checkpoint-10500/rng_state_2.pth +0 -3
  29. checkpoints/checkpoint-10500/rng_state_3.pth +0 -3
  30. checkpoints/checkpoint-10500/scheduler.pt +0 -3
  31. checkpoints/checkpoint-10500/trainer_state.json +0 -0
  32. checkpoints/checkpoint-10500/training_args.bin +0 -3
  33. checkpoints/checkpoint-11000/adapter_config.json +0 -29
  34. checkpoints/checkpoint-11000/adapter_model.safetensors +0 -3
  35. checkpoints/checkpoint-11000/optimizer.pt +0 -3
  36. checkpoints/checkpoint-11000/rng_state_0.pth +0 -3
  37. checkpoints/checkpoint-11000/rng_state_1.pth +0 -3
  38. checkpoints/checkpoint-11000/rng_state_2.pth +0 -3
  39. checkpoints/checkpoint-11000/rng_state_3.pth +0 -3
  40. checkpoints/checkpoint-11000/scheduler.pt +0 -3
  41. checkpoints/checkpoint-11000/trainer_state.json +0 -0
  42. checkpoints/checkpoint-11000/training_args.bin +0 -3
  43. checkpoints/checkpoint-11500/adapter_config.json +0 -29
  44. checkpoints/checkpoint-11500/adapter_model.safetensors +0 -3
  45. checkpoints/checkpoint-11500/optimizer.pt +0 -3
  46. checkpoints/checkpoint-11500/rng_state_0.pth +0 -3
  47. checkpoints/checkpoint-11500/rng_state_1.pth +0 -3
  48. checkpoints/checkpoint-11500/rng_state_2.pth +0 -3
  49. checkpoints/checkpoint-11500/rng_state_3.pth +0 -3
  50. checkpoints/checkpoint-11500/scheduler.pt +0 -3
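Since the commit message only records that the checkpoints were moved, here is a hedged sketch (not the author's actual procedure) of how such a move could be scripted with huggingface_hub; the destination id "trapoom555/checkpoints" and the source id "source-user/source-repo" are placeholder assumptions, not values from this commit.

```python
# Hypothetical sketch of "move checkpoints to another repo" via huggingface_hub.
# Assumes you are authenticated (huggingface-cli login); repo ids are placeholders.
from huggingface_hub import HfApi

api = HfApi()
dst = "trapoom555/checkpoints"  # placeholder destination

# 1. Create the destination repo if it does not exist yet.
api.create_repo(repo_id=dst, repo_type="model", exist_ok=True)

# 2. Upload the local checkpoints/ directory (LFS payloads are handled transparently).
api.upload_folder(repo_id=dst, folder_path="checkpoints", path_in_repo="checkpoints")

# 3. Remove the directory from the source repo, which is what this commit records.
api.delete_folder(path_in_repo="checkpoints", repo_id="source-user/source-repo",
                  commit_message="Move checkpoints to another repo")
```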
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
checkpoints/.DS_Store DELETED
Binary file (6.15 kB)
 
checkpoints/checkpoint-1000/adapter_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "alpha_pattern": {},
-   "auto_mapping": null,
-   "base_model_name_or_path": "../pretrained/phi-2/",
-   "bias": "none",
-   "fan_in_fan_out": false,
-   "inference_mode": true,
-   "init_lora_weights": "gaussian",
-   "layer_replication": null,
-   "layers_pattern": null,
-   "layers_to_transform": null,
-   "loftq_config": {},
-   "lora_alpha": 32,
-   "lora_dropout": 0.1,
-   "megatron_config": null,
-   "megatron_core": "megatron.core",
-   "modules_to_save": null,
-   "peft_type": "LORA",
-   "r": 8,
-   "rank_pattern": {},
-   "revision": null,
-   "target_modules": [
-     "v_proj",
-     "q_proj"
-   ],
-   "task_type": "CAUSAL_LM",
-   "use_dora": false,
-   "use_rslora": false
- }
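The deleted adapter_config.json describes a rank-8 LoRA adapter (alpha 32, dropout 0.1) on phi-2's q_proj/v_proj projections. A minimal, hedged sketch of loading such a checkpoint with PEFT, assuming the files were restored locally and substituting the public "microsoft/phi-2" weights for the local ../pretrained/phi-2/ path:

```python
# Minimal sketch, assuming the checkpoint files are restored locally and that
# the public "microsoft/phi-2" weights match the local ../pretrained/phi-2/ copy.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float16)
# PeftModel reads adapter_config.json (r=8, lora_alpha=32, q_proj/v_proj targets)
# and adapter_model.safetensors from the checkpoint directory.
model = PeftModel.from_pretrained(base, "checkpoints/checkpoint-1000")
model.eval()  # adapter_config.json was saved with "inference_mode": true
```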
checkpoints/checkpoint-1000/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f024b12e244af3f991354d7d1e38cec702dbef1d850c62fc1be9589e61196cb6
- size 5259848
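What the diff shows for every binary file here is the Git LFS pointer, not the payload: a spec-version line, the payload's SHA-256 (`oid`), and its byte size. A short sketch of verifying a downloaded payload against such a pointer; the path and expected values are taken from this diff, nothing else is assumed:

```python
# Sketch: verify a downloaded LFS payload against the pointer's oid/size.
import hashlib
from pathlib import Path

path = Path("checkpoints/checkpoint-1000/adapter_model.safetensors")
expected_oid = "f024b12e244af3f991354d7d1e38cec702dbef1d850c62fc1be9589e61196cb6"
expected_size = 5259848

data = path.read_bytes()
assert len(data) == expected_size, "size mismatch"
assert hashlib.sha256(data).hexdigest() == expected_oid, "oid mismatch"
print("pointer and payload agree")
```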
 
 
 
 
checkpoints/checkpoint-1000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e6183113abd713d1bbbc9039a3b71bf473ff6a0f0d57cceb6ca63f6ba8cb8b49
- size 10593402
 
 
 
 
checkpoints/checkpoint-1000/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:81f4a320d64085f4beb5d61920a6aa6e80d9f9582a3b5057bcd99e749f436590
- size 15024
 
 
 
 
checkpoints/checkpoint-1000/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e993f6f34789cad8c7afeb088cd5f2880c374834a40598b49660b91f236f67b0
- size 15024
 
 
 
 
checkpoints/checkpoint-1000/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0d213df57201a61076b0a8c4252d3e2b2794b47b3fa0498e97076e9ad6e083aa
- size 15024
 
 
 
 
checkpoints/checkpoint-1000/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c50218a277461f267f2806ed4c286b3c2a5fdbc199e920d2f575b762515be8fe
- size 15024
 
 
 
 
checkpoints/checkpoint-1000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7a909cc86dc2bb25cb47b301da3b3473b0c55ea709e4a406e429c3b785d1ac94
- size 1064
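optimizer.pt, scheduler.pt, and the four per-rank rng_state_*.pth files exist so a four-GPU run can resume deterministically. A hedged sketch of resuming from this checkpoint; everything except the checkpoint path and the values echoed from trainer_state.json below is a placeholder, since the original model and dataset are not part of this diff:

```python
# Hedged sketch of resuming from checkpoint-1000 with the HF Trainer.
# `model` and `train_dataset` are placeholders for the original run's objects;
# Trainer itself restores optimizer.pt, scheduler.pt, and the per-rank RNG states.
from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="checkpoints",
                         per_device_train_batch_size=5,  # echoes trainer_state.json
                         logging_steps=10, save_steps=500)
trainer = Trainer(model=model, args=args, train_dataset=train_dataset)  # placeholders
trainer.train(resume_from_checkpoint="checkpoints/checkpoint-1000")
```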
 
 
 
 
checkpoints/checkpoint-1000/trainer_state.json DELETED
@@ -1,721 +0,0 @@
- {
-   "best_metric": null,
-   "best_model_checkpoint": null,
-   "epoch": 0.07256367462448299,
-   "eval_steps": 500,
-   "global_step": 1000,
-   "is_hyper_param_search": false,
-   "is_local_process_zero": true,
-   "is_world_process_zero": true,
-   "log_history": [
-     {
-       "epoch": 0.0007256367462448298,
-       "grad_norm": 13.1875,
-       "learning_rate": 5e-06,
-       "loss": 3.553,
-       "step": 10
-     },
-     {
-       "epoch": 0.0014512734924896596,
-       "grad_norm": 9.375,
-       "learning_rate": 1e-05,
-       "loss": 3.5658,
-       "step": 20
-     },
-     {
-       "epoch": 0.0021769102387344894,
-       "grad_norm": 8.8125,
-       "learning_rate": 1.5e-05,
-       "loss": 3.4669,
-       "step": 30
-     },
-     {
-       "epoch": 0.0029025469849793192,
-       "grad_norm": 13.25,
-       "learning_rate": 2e-05,
-       "loss": 3.5007,
-       "step": 40
-     },
-     {
-       "epoch": 0.003628183731224149,
-       "grad_norm": 9.3125,
-       "learning_rate": 2.5e-05,
-       "loss": 3.5062,
-       "step": 50
-     },
-     {
-       "epoch": 0.004353820477468979,
-       "grad_norm": 9.9375,
-       "learning_rate": 3e-05,
-       "loss": 3.4178,
-       "step": 60
-     },
-     {
-       "epoch": 0.005079457223713809,
-       "grad_norm": 17.125,
-       "learning_rate": 3.5e-05,
-       "loss": 3.4136,
-       "step": 70
-     },
-     {
-       "epoch": 0.0058050939699586385,
-       "grad_norm": 17.5,
-       "learning_rate": 4e-05,
-       "loss": 3.3792,
-       "step": 80
-     },
-     {
-       "epoch": 0.006530730716203468,
-       "grad_norm": 17.0,
-       "learning_rate": 4.5e-05,
-       "loss": 3.3684,
-       "step": 90
-     },
-     {
-       "epoch": 0.007256367462448298,
-       "grad_norm": 21.875,
-       "learning_rate": 5e-05,
-       "loss": 3.2698,
-       "step": 100
-     },
-     {
-       "epoch": 0.007982004208693128,
-       "grad_norm": 32.5,
-       "learning_rate": 4.9999934086574596e-05,
-       "loss": 3.1523,
-       "step": 110
-     },
-     {
-       "epoch": 0.008707640954937958,
-       "grad_norm": 73.5,
-       "learning_rate": 4.9999736346645943e-05,
-       "loss": 3.0884,
-       "step": 120
-     },
-     {
-       "epoch": 0.009433277701182788,
-       "grad_norm": 34.75,
-       "learning_rate": 4.999940678125673e-05,
-       "loss": 2.9507,
-       "step": 130
-     },
-     {
-       "epoch": 0.010158914447427617,
-       "grad_norm": 47.5,
-       "learning_rate": 4.9998945392144796e-05,
-       "loss": 2.8659,
-       "step": 140
-     },
-     {
-       "epoch": 0.010884551193672447,
-       "grad_norm": 35.25,
-       "learning_rate": 4.999835218174307e-05,
-       "loss": 2.9846,
-       "step": 150
-     },
-     {
-       "epoch": 0.011610187939917277,
-       "grad_norm": 27.5,
-       "learning_rate": 4.99976271531796e-05,
-       "loss": 2.8913,
-       "step": 160
-     },
-     {
-       "epoch": 0.012335824686162107,
-       "grad_norm": 38.5,
-       "learning_rate": 4.9996770310277506e-05,
-       "loss": 2.8982,
-       "step": 170
-     },
-     {
-       "epoch": 0.013061461432406937,
-       "grad_norm": 52.0,
-       "learning_rate": 4.9995781657555e-05,
-       "loss": 2.7165,
-       "step": 180
-     },
-     {
-       "epoch": 0.013787098178651766,
-       "grad_norm": 34.75,
-       "learning_rate": 4.99946612002253e-05,
-       "loss": 2.4166,
-       "step": 190
-     },
-     {
-       "epoch": 0.014512734924896596,
-       "grad_norm": 40.25,
-       "learning_rate": 4.9993408944196676e-05,
-       "loss": 2.2161,
-       "step": 200
-     },
-     {
-       "epoch": 0.015238371671141426,
-       "grad_norm": 39.25,
-       "learning_rate": 4.9992024896072364e-05,
-       "loss": 1.7825,
-       "step": 210
-     },
-     {
-       "epoch": 0.015964008417386256,
-       "grad_norm": 37.25,
-       "learning_rate": 4.999050906315055e-05,
-       "loss": 1.46,
-       "step": 220
-     },
-     {
-       "epoch": 0.016689645163631087,
-       "grad_norm": 13.625,
-       "learning_rate": 4.998886145342434e-05,
-       "loss": 1.3157,
-       "step": 230
-     },
-     {
-       "epoch": 0.017415281909875915,
-       "grad_norm": 10.4375,
-       "learning_rate": 4.9987082075581684e-05,
-       "loss": 1.1408,
-       "step": 240
-     },
-     {
-       "epoch": 0.018140918656120747,
-       "grad_norm": 14.0,
-       "learning_rate": 4.9985170939005386e-05,
-       "loss": 0.8782,
-       "step": 250
-     },
-     {
-       "epoch": 0.018866555402365575,
-       "grad_norm": 22.0,
-       "learning_rate": 4.998312805377302e-05,
-       "loss": 0.9109,
-       "step": 260
-     },
-     {
-       "epoch": 0.019592192148610407,
-       "grad_norm": 12.625,
-       "learning_rate": 4.998095343065685e-05,
-       "loss": 0.7414,
-       "step": 270
-     },
-     {
-       "epoch": 0.020317828894855235,
-       "grad_norm": 15.8125,
-       "learning_rate": 4.997864708112384e-05,
-       "loss": 0.7926,
-       "step": 280
-     },
-     {
-       "epoch": 0.021043465641100066,
-       "grad_norm": 16.25,
-       "learning_rate": 4.997620901733554e-05,
-       "loss": 0.7382,
-       "step": 290
-     },
-     {
-       "epoch": 0.021769102387344894,
-       "grad_norm": 22.0,
-       "learning_rate": 4.997363925214803e-05,
-       "loss": 0.7158,
-       "step": 300
-     },
-     {
-       "epoch": 0.022494739133589726,
-       "grad_norm": 13.375,
-       "learning_rate": 4.9970937799111896e-05,
-       "loss": 0.526,
-       "step": 310
-     },
-     {
-       "epoch": 0.023220375879834554,
-       "grad_norm": 12.9375,
-       "learning_rate": 4.996810467247207e-05,
-       "loss": 0.6137,
-       "step": 320
-     },
-     {
-       "epoch": 0.023946012626079385,
-       "grad_norm": 6.78125,
-       "learning_rate": 4.9965139887167856e-05,
-       "loss": 0.4864,
-       "step": 330
-     },
-     {
-       "epoch": 0.024671649372324213,
-       "grad_norm": 10.25,
-       "learning_rate": 4.996204345883278e-05,
-       "loss": 0.6921,
-       "step": 340
-     },
-     {
-       "epoch": 0.025397286118569045,
-       "grad_norm": 6.3125,
-       "learning_rate": 4.9958815403794546e-05,
-       "loss": 0.5826,
-       "step": 350
-     },
-     {
-       "epoch": 0.026122922864813873,
-       "grad_norm": 10.625,
-       "learning_rate": 4.995545573907492e-05,
-       "loss": 0.5838,
-       "step": 360
-     },
-     {
-       "epoch": 0.026848559611058705,
-       "grad_norm": 17.25,
-       "learning_rate": 4.995196448238966e-05,
-       "loss": 0.5941,
-       "step": 370
-     },
-     {
-       "epoch": 0.027574196357303533,
-       "grad_norm": 9.875,
-       "learning_rate": 4.9948341652148436e-05,
-       "loss": 0.5055,
-       "step": 380
-     },
-     {
-       "epoch": 0.028299833103548364,
-       "grad_norm": 21.25,
-       "learning_rate": 4.994458726745468e-05,
-       "loss": 0.6114,
-       "step": 390
-     },
-     {
-       "epoch": 0.029025469849793192,
-       "grad_norm": 3.375,
-       "learning_rate": 4.9940701348105554e-05,
-       "loss": 0.5372,
-       "step": 400
-     },
-     {
-       "epoch": 0.029751106596038024,
-       "grad_norm": 12.625,
-       "learning_rate": 4.99366839145918e-05,
-       "loss": 0.4815,
-       "step": 410
-     },
-     {
-       "epoch": 0.030476743342282852,
-       "grad_norm": 10.8125,
-       "learning_rate": 4.993253498809762e-05,
-       "loss": 0.4903,
-       "step": 420
-     },
-     {
-       "epoch": 0.031202380088527683,
-       "grad_norm": 9.0,
-       "learning_rate": 4.9928254590500646e-05,
-       "loss": 0.5927,
-       "step": 430
-     },
-     {
-       "epoch": 0.03192801683477251,
-       "grad_norm": 6.71875,
-       "learning_rate": 4.9923842744371707e-05,
-       "loss": 0.4424,
-       "step": 440
-     },
-     {
-       "epoch": 0.03265365358101734,
-       "grad_norm": 5.0,
-       "learning_rate": 4.99192994729748e-05,
-       "loss": 0.362,
-       "step": 450
-     },
-     {
-       "epoch": 0.033379290327262175,
-       "grad_norm": 8.625,
-       "learning_rate": 4.991462480026693e-05,
-       "loss": 0.593,
-       "step": 460
-     },
-     {
-       "epoch": 0.034104927073507,
-       "grad_norm": 4.53125,
-       "learning_rate": 4.9909818750898e-05,
-       "loss": 0.4769,
-       "step": 470
-     },
-     {
-       "epoch": 0.03483056381975183,
-       "grad_norm": 11.125,
-       "learning_rate": 4.990488135021065e-05,
-       "loss": 0.4455,
-       "step": 480
-     },
-     {
-       "epoch": 0.03555620056599666,
-       "grad_norm": 8.375,
-       "learning_rate": 4.989981262424017e-05,
-       "loss": 0.4713,
-       "step": 490
-     },
-     {
-       "epoch": 0.036281837312241494,
-       "grad_norm": 7.75,
-       "learning_rate": 4.989461259971432e-05,
-       "loss": 0.2778,
-       "step": 500
-     },
-     {
-       "epoch": 0.03700747405848632,
-       "grad_norm": 5.65625,
-       "learning_rate": 4.988928130405323e-05,
-       "loss": 0.3604,
-       "step": 510
-     },
-     {
-       "epoch": 0.03773311080473115,
-       "grad_norm": 7.5625,
-       "learning_rate": 4.9883818765369194e-05,
-       "loss": 0.4234,
-       "step": 520
-     },
-     {
-       "epoch": 0.03845874755097598,
-       "grad_norm": 8.9375,
-       "learning_rate": 4.98782250124666e-05,
-       "loss": 0.3531,
-       "step": 530
-     },
-     {
-       "epoch": 0.03918438429722081,
-       "grad_norm": 6.5,
-       "learning_rate": 4.987250007484172e-05,
-       "loss": 0.3281,
-       "step": 540
-     },
-     {
-       "epoch": 0.03991002104346564,
-       "grad_norm": 4.59375,
-       "learning_rate": 4.986664398268256e-05,
-       "loss": 0.3477,
-       "step": 550
-     },
-     {
-       "epoch": 0.04063565778971047,
-       "grad_norm": 6.4375,
-       "learning_rate": 4.986065676686874e-05,
-       "loss": 0.369,
-       "step": 560
-     },
-     {
-       "epoch": 0.0413612945359553,
-       "grad_norm": 8.0625,
-       "learning_rate": 4.98545384589713e-05,
-       "loss": 0.3068,
-       "step": 570
-     },
-     {
-       "epoch": 0.04208693128220013,
-       "grad_norm": 6.28125,
-       "learning_rate": 4.984828909125251e-05,
-       "loss": 0.2926,
-       "step": 580
-     },
-     {
-       "epoch": 0.042812568028444964,
-       "grad_norm": 4.4375,
-       "learning_rate": 4.9841908696665764e-05,
-       "loss": 0.2771,
-       "step": 590
-     },
-     {
-       "epoch": 0.04353820477468979,
-       "grad_norm": 4.15625,
-       "learning_rate": 4.9835397308855344e-05,
-       "loss": 0.3676,
-       "step": 600
-     },
-     {
-       "epoch": 0.04426384152093462,
-       "grad_norm": 4.4375,
-       "learning_rate": 4.9828754962156286e-05,
-       "loss": 0.3307,
-       "step": 610
-     },
-     {
-       "epoch": 0.04498947826717945,
-       "grad_norm": 13.75,
-       "learning_rate": 4.9821981691594175e-05,
-       "loss": 0.3601,
-       "step": 620
-     },
-     {
-       "epoch": 0.04571511501342428,
-       "grad_norm": 4.59375,
-       "learning_rate": 4.981507753288497e-05,
-       "loss": 0.2939,
-       "step": 630
-     },
-     {
-       "epoch": 0.04644075175966911,
-       "grad_norm": 6.1875,
-       "learning_rate": 4.9808042522434814e-05,
-       "loss": 0.293,
-       "step": 640
-     },
-     {
-       "epoch": 0.04716638850591394,
-       "grad_norm": 4.65625,
-       "learning_rate": 4.9800876697339824e-05,
-       "loss": 0.3213,
-       "step": 650
-     },
-     {
-       "epoch": 0.04789202525215877,
-       "grad_norm": 7.65625,
-       "learning_rate": 4.979358009538594e-05,
-       "loss": 0.3903,
-       "step": 660
-     },
-     {
-       "epoch": 0.0486176619984036,
-       "grad_norm": 9.3125,
-       "learning_rate": 4.978615275504869e-05,
-       "loss": 0.2936,
-       "step": 670
-     },
-     {
-       "epoch": 0.04934329874464843,
-       "grad_norm": 8.4375,
-       "learning_rate": 4.977859471549297e-05,
-       "loss": 0.3681,
-       "step": 680
-     },
-     {
-       "epoch": 0.05006893549089326,
-       "grad_norm": 6.28125,
-       "learning_rate": 4.977090601657289e-05,
-       "loss": 0.2591,
-       "step": 690
-     },
-     {
-       "epoch": 0.05079457223713809,
-       "grad_norm": 10.8125,
-       "learning_rate": 4.976308669883153e-05,
-       "loss": 0.3351,
-       "step": 700
-     },
-     {
-       "epoch": 0.05152020898338292,
-       "grad_norm": 6.65625,
-       "learning_rate": 4.975513680350073e-05,
-       "loss": 0.3042,
-       "step": 710
-     },
-     {
-       "epoch": 0.052245845729627746,
-       "grad_norm": 6.65625,
-       "learning_rate": 4.974705637250089e-05,
-       "loss": 0.2396,
-       "step": 720
-     },
-     {
-       "epoch": 0.05297148247587258,
-       "grad_norm": 6.1875,
-       "learning_rate": 4.97388454484407e-05,
-       "loss": 0.2892,
-       "step": 730
-     },
-     {
-       "epoch": 0.05369711922211741,
-       "grad_norm": 3.78125,
-       "learning_rate": 4.973050407461698e-05,
-       "loss": 0.3032,
-       "step": 740
-     },
-     {
-       "epoch": 0.05442275596836224,
-       "grad_norm": 2.65625,
-       "learning_rate": 4.972203229501441e-05,
-       "loss": 0.2522,
-       "step": 750
-     },
-     {
-       "epoch": 0.055148392714607065,
-       "grad_norm": 7.84375,
-       "learning_rate": 4.971343015430532e-05,
-       "loss": 0.2502,
-       "step": 760
-     },
-     {
-       "epoch": 0.0558740294608519,
-       "grad_norm": 10.6875,
-       "learning_rate": 4.970469769784941e-05,
-       "loss": 0.2664,
-       "step": 770
-     },
-     {
-       "epoch": 0.05659966620709673,
-       "grad_norm": 7.90625,
-       "learning_rate": 4.96958349716936e-05,
-       "loss": 0.332,
-       "step": 780
-     },
-     {
-       "epoch": 0.05732530295334156,
-       "grad_norm": 10.75,
-       "learning_rate": 4.968684202257169e-05,
-       "loss": 0.3737,
-       "step": 790
-     },
-     {
-       "epoch": 0.058050939699586385,
-       "grad_norm": 6.5,
-       "learning_rate": 4.967771889790416e-05,
-       "loss": 0.2794,
-       "step": 800
-     },
-     {
-       "epoch": 0.058776576445831216,
-       "grad_norm": 2.921875,
-       "learning_rate": 4.966846564579792e-05,
-       "loss": 0.2375,
-       "step": 810
-     },
-     {
-       "epoch": 0.05950221319207605,
-       "grad_norm": 6.09375,
-       "learning_rate": 4.965908231504607e-05,
-       "loss": 0.365,
-       "step": 820
-     },
-     {
-       "epoch": 0.06022784993832088,
-       "grad_norm": 9.0625,
-       "learning_rate": 4.964956895512759e-05,
-       "loss": 0.3469,
-       "step": 830
-     },
-     {
-       "epoch": 0.060953486684565704,
-       "grad_norm": 5.25,
-       "learning_rate": 4.963992561620714e-05,
-       "loss": 0.2904,
-       "step": 840
-     },
-     {
-       "epoch": 0.061679123430810535,
-       "grad_norm": 5.5,
-       "learning_rate": 4.963015234913475e-05,
-       "loss": 0.2779,
-       "step": 850
-     },
-     {
-       "epoch": 0.06240476017705537,
-       "grad_norm": 5.6875,
-       "learning_rate": 4.96202492054456e-05,
-       "loss": 0.2658,
-       "step": 860
-     },
-     {
-       "epoch": 0.06313039692330019,
-       "grad_norm": 8.8125,
-       "learning_rate": 4.9610216237359684e-05,
-       "loss": 0.3973,
-       "step": 870
-     },
-     {
-       "epoch": 0.06385603366954502,
-       "grad_norm": 10.0,
-       "learning_rate": 4.960005349778159e-05,
-       "loss": 0.3169,
-       "step": 880
-     },
-     {
-       "epoch": 0.06458167041578985,
-       "grad_norm": 11.9375,
-       "learning_rate": 4.95897610403002e-05,
-       "loss": 0.3172,
-       "step": 890
-     },
-     {
-       "epoch": 0.06530730716203469,
-       "grad_norm": 3.0,
-       "learning_rate": 4.95793389191884e-05,
-       "loss": 0.2664,
-       "step": 900
-     },
-     {
-       "epoch": 0.06603294390827952,
-       "grad_norm": 6.90625,
-       "learning_rate": 4.95687871894028e-05,
-       "loss": 0.2507,
-       "step": 910
-     },
-     {
-       "epoch": 0.06675858065452435,
-       "grad_norm": 3.796875,
-       "learning_rate": 4.9558105906583466e-05,
-       "loss": 0.2383,
-       "step": 920
-     },
-     {
-       "epoch": 0.06748421740076918,
-       "grad_norm": 6.28125,
-       "learning_rate": 4.9547295127053586e-05,
-       "loss": 0.3942,
-       "step": 930
-     },
-     {
-       "epoch": 0.068209854147014,
-       "grad_norm": 9.0,
-       "learning_rate": 4.95363549078192e-05,
-       "loss": 0.2427,
-       "step": 940
-     },
-     {
-       "epoch": 0.06893549089325883,
-       "grad_norm": 5.59375,
-       "learning_rate": 4.952528530656889e-05,
-       "loss": 0.2351,
-       "step": 950
-     },
-     {
-       "epoch": 0.06966112763950366,
-       "grad_norm": 9.5625,
-       "learning_rate": 4.9514086381673496e-05,
-       "loss": 0.282,
-       "step": 960
-     },
-     {
-       "epoch": 0.0703867643857485,
-       "grad_norm": 10.0625,
-       "learning_rate": 4.9502758192185774e-05,
-       "loss": 0.2687,
-       "step": 970
-     },
-     {
-       "epoch": 0.07111240113199332,
-       "grad_norm": 9.0625,
-       "learning_rate": 4.949130079784009e-05,
-       "loss": 0.3135,
-       "step": 980
-     },
-     {
-       "epoch": 0.07183803787823816,
-       "grad_norm": 5.59375,
-       "learning_rate": 4.9479714259052143e-05,
-       "loss": 0.3207,
-       "step": 990
-     },
-     {
-       "epoch": 0.07256367462448299,
-       "grad_norm": 8.3125,
-       "learning_rate": 4.946799863691862e-05,
-       "loss": 0.2572,
-       "step": 1000
-     }
-   ],
-   "logging_steps": 10,
-   "max_steps": 13781,
-   "num_input_tokens_seen": 0,
-   "num_train_epochs": 1,
-   "save_steps": 500,
-   "total_flos": 0.0,
-   "train_batch_size": 5,
-   "trial_name": null,
-   "trial_params": null
- }
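The deleted trainer_state.json is plain JSON, so the loss curve it records (3.55 down to roughly 0.26 over the first 1000 steps) is easy to recover. A minimal sketch, assuming the file is restored locally:

```python
# Sketch: extract the (step, loss) curve from a restored trainer_state.json.
import json

with open("checkpoints/checkpoint-1000/trainer_state.json") as f:
    state = json.load(f)

curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
print(curve[0], curve[-1])   # (10, 3.553) and (1000, 0.2572) for this checkpoint
print("global_step:", state["global_step"], "of", state["max_steps"])
```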
checkpoints/checkpoint-1000/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b107e10c3f16daeb50e445875c72c80191fb75f0a018b11b5e6649f12b4edb0e
- size 4984
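training_args.bin carries the same oid (b107e10c…) in every checkpoint of this commit: it is the pickled TrainingArguments object, saved once per run. A hedged sketch of inspecting a restored copy, assuming a transformers version compatible with the one that wrote it:

```python
# Sketch: inspect a restored training_args.bin. torch.load unpickles a
# transformers.TrainingArguments; weights_only=False is needed on recent
# PyTorch because this file is a pickle, not a tensor archive.
import torch

args = torch.load("checkpoints/checkpoint-1000/training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.learning_rate, args.save_steps)
```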
 
 
 
 
checkpoints/checkpoint-10000/adapter_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "alpha_pattern": {},
-   "auto_mapping": null,
-   "base_model_name_or_path": "../pretrained/phi-2/",
-   "bias": "none",
-   "fan_in_fan_out": false,
-   "inference_mode": true,
-   "init_lora_weights": "gaussian",
-   "layer_replication": null,
-   "layers_pattern": null,
-   "layers_to_transform": null,
-   "loftq_config": {},
-   "lora_alpha": 32,
-   "lora_dropout": 0.1,
-   "megatron_config": null,
-   "megatron_core": "megatron.core",
-   "modules_to_save": null,
-   "peft_type": "LORA",
-   "r": 8,
-   "rank_pattern": {},
-   "revision": null,
-   "target_modules": [
-     "v_proj",
-     "q_proj"
-   ],
-   "task_type": "CAUSAL_LM",
-   "use_dora": false,
-   "use_rslora": false
- }
checkpoints/checkpoint-10000/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:05c793a29256972f3adea1e6aa819e60cccd69d3e6eb8b3966a9e6fb432dae57
- size 5259848
 
 
 
 
checkpoints/checkpoint-10000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d085bd8cc113ab477e63cc4fda11b2db9a6dc6afcb5dc85e29b9947f9e1fe17e
- size 10593402
 
 
 
 
checkpoints/checkpoint-10000/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:04202dfaee4d27760ad73dbaf97ba4d5f3d7d9406a16d7200b9e8d29eacbc7a6
- size 15024
 
 
 
 
checkpoints/checkpoint-10000/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:aead9d49cb15c4513f290bc16e8c036c878e75977831a54bc11efd4a6ee31c38
- size 15024
 
 
 
 
checkpoints/checkpoint-10000/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ff930be2c88e34c4fb4b8fa13b8897b8f37f8b7bef2f9a493f4d24cbadc0e256
- size 15024
 
 
 
 
checkpoints/checkpoint-10000/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:94ac6c79200bd2147d77b72110b997a4a55a632e3f253541cadc8917abb514c6
- size 15024
 
 
 
 
checkpoints/checkpoint-10000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a8cf818c444104efe80cf26c7920df0dab123d9b6903f4cd037c26d5d6bf3857
- size 1064
 
 
 
 
checkpoints/checkpoint-10000/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-10000/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b107e10c3f16daeb50e445875c72c80191fb75f0a018b11b5e6649f12b4edb0e
- size 4984
 
 
 
 
checkpoints/checkpoint-10500/adapter_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "alpha_pattern": {},
-   "auto_mapping": null,
-   "base_model_name_or_path": "../pretrained/phi-2/",
-   "bias": "none",
-   "fan_in_fan_out": false,
-   "inference_mode": true,
-   "init_lora_weights": "gaussian",
-   "layer_replication": null,
-   "layers_pattern": null,
-   "layers_to_transform": null,
-   "loftq_config": {},
-   "lora_alpha": 32,
-   "lora_dropout": 0.1,
-   "megatron_config": null,
-   "megatron_core": "megatron.core",
-   "modules_to_save": null,
-   "peft_type": "LORA",
-   "r": 8,
-   "rank_pattern": {},
-   "revision": null,
-   "target_modules": [
-     "v_proj",
-     "q_proj"
-   ],
-   "task_type": "CAUSAL_LM",
-   "use_dora": false,
-   "use_rslora": false
- }
checkpoints/checkpoint-10500/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:de850198a206f5ec9f3a3d203ffe38cbb462903045db60b4ef8f948e890b0b72
- size 5259848
 
 
 
 
checkpoints/checkpoint-10500/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:28341e359381f16db94734801749ec3d29efdefbea37c69120539663044daed7
- size 10593402
 
 
 
 
checkpoints/checkpoint-10500/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b0cd9108ff256b80a4973314bfabc4b67037cbcf88cca8c18d368c48d650d16f
- size 15024
 
 
 
 
checkpoints/checkpoint-10500/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8b738cecfc3fa07068e78a78b3a2890e638fd8d23245931847fe66d4d2acf260
- size 15024
 
 
 
 
checkpoints/checkpoint-10500/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2d163ceee4093354833880c0ce456ca8b17a349bb069b4ef1fa8307f8540ba89
- size 15024
 
 
 
 
checkpoints/checkpoint-10500/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:879fa0717d2797c2875ffc34b7cf03c6b7e01a2443e94f45bb3e041d770e5744
- size 15024
 
 
 
 
checkpoints/checkpoint-10500/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2e416eeaa9dc4cbd14c09317ab177d36bed29ba9a0fc58f3c28a66906f926d68
- size 1064
 
 
 
 
checkpoints/checkpoint-10500/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-10500/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b107e10c3f16daeb50e445875c72c80191fb75f0a018b11b5e6649f12b4edb0e
- size 4984
 
 
 
 
checkpoints/checkpoint-11000/adapter_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "alpha_pattern": {},
-   "auto_mapping": null,
-   "base_model_name_or_path": "../pretrained/phi-2/",
-   "bias": "none",
-   "fan_in_fan_out": false,
-   "inference_mode": true,
-   "init_lora_weights": "gaussian",
-   "layer_replication": null,
-   "layers_pattern": null,
-   "layers_to_transform": null,
-   "loftq_config": {},
-   "lora_alpha": 32,
-   "lora_dropout": 0.1,
-   "megatron_config": null,
-   "megatron_core": "megatron.core",
-   "modules_to_save": null,
-   "peft_type": "LORA",
-   "r": 8,
-   "rank_pattern": {},
-   "revision": null,
-   "target_modules": [
-     "v_proj",
-     "q_proj"
-   ],
-   "task_type": "CAUSAL_LM",
-   "use_dora": false,
-   "use_rslora": false
- }
checkpoints/checkpoint-11000/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ab1168de17ed2366935138ddc72787d3e21545ba4d1a25e83e99c23f24d01820
- size 5259848
 
 
 
 
checkpoints/checkpoint-11000/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fbe51c99adf050f8c3ccdda13803f388708bbc585d1f857843372647e69166c6
- size 10593402
 
 
 
 
checkpoints/checkpoint-11000/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:773775a4eb3370fef3e396ead2e6f3362cd26e4bdf5ca228def261ecab435761
- size 15024
 
 
 
 
checkpoints/checkpoint-11000/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:66afd2775d10732e7e1a4f56dae239a9e973ffbd9cef7584cc45fba2ed58702e
- size 15024
 
 
 
 
checkpoints/checkpoint-11000/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d882df819c79360bf01bdfed79e16fc7d4969bb169ef01eebfc9f1c7ad1d83ca
- size 15024
 
 
 
 
checkpoints/checkpoint-11000/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cbc49018d696a032bf9aafd927341c115d99350917f9e8a8dcf4c044da7beb4f
- size 15024
 
 
 
 
checkpoints/checkpoint-11000/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b0d012a99c88d57e6f359e9b70583cd819536789f1d7ee2b6ff584c2ea8aff39
- size 1064
 
 
 
 
checkpoints/checkpoint-11000/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/checkpoint-11000/training_args.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b107e10c3f16daeb50e445875c72c80191fb75f0a018b11b5e6649f12b4edb0e
- size 4984
 
 
 
 
checkpoints/checkpoint-11500/adapter_config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "alpha_pattern": {},
-   "auto_mapping": null,
-   "base_model_name_or_path": "../pretrained/phi-2/",
-   "bias": "none",
-   "fan_in_fan_out": false,
-   "inference_mode": true,
-   "init_lora_weights": "gaussian",
-   "layer_replication": null,
-   "layers_pattern": null,
-   "layers_to_transform": null,
-   "loftq_config": {},
-   "lora_alpha": 32,
-   "lora_dropout": 0.1,
-   "megatron_config": null,
-   "megatron_core": "megatron.core",
-   "modules_to_save": null,
-   "peft_type": "LORA",
-   "r": 8,
-   "rank_pattern": {},
-   "revision": null,
-   "target_modules": [
-     "v_proj",
-     "q_proj"
-   ],
-   "task_type": "CAUSAL_LM",
-   "use_dora": false,
-   "use_rslora": false
- }
checkpoints/checkpoint-11500/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a9f4fa3e92be343150be854387633b91ccaa350d096ad0dbbb7de3fb3c10a508
- size 5259848
 
 
 
 
checkpoints/checkpoint-11500/optimizer.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:68cb054c2cf108555ca328322c95d12d5d0ec8c7a59fe7bb398d8064c658a4c0
- size 10593402
 
 
 
 
checkpoints/checkpoint-11500/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3d09e22cd2824499f52d6a93863298ebe50730b4e3f90445501ba957303a6a46
- size 15024
 
 
 
 
checkpoints/checkpoint-11500/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cc79068659845f366b2e7088f6511ef596cc3eb64cf58724c9cce3400aea704a
- size 15024
 
 
 
 
checkpoints/checkpoint-11500/rng_state_2.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fa6252e20688a9d26346ba088b4b6f42b1a7fd3d15208d2518f44790e5d65f9a
- size 15024
 
 
 
 
checkpoints/checkpoint-11500/rng_state_3.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1381c7993010df794984f44712495f8261e6a4e067254312f086db1cc46b048b
- size 15024
 
 
 
 
checkpoints/checkpoint-11500/scheduler.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:595403f2a9ea191d8160bcd819a3a7c9243011f6b0c3b03eb8010f0a0b08b239
- size 1064