error577 committed on
Commit
9d588ca
·
verified ·
1 Parent(s): b820c5b

Training in progress, step 32, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -11,7 +11,7 @@
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 32,
14
- "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
@@ -20,13 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "o_proj",
24
  "down_proj",
25
- "up_proj",
26
  "q_proj",
27
  "gate_proj",
 
28
  "v_proj",
29
- "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
  "lora_alpha": 32,
14
+ "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "k_proj",
24
  "down_proj",
 
25
  "q_proj",
26
  "gate_proj",
27
+ "up_proj",
28
  "v_proj",
29
+ "o_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:201e4206f0072c8917512dc8cf146dbefbf2660ac39b28436d9b13f918b3b490
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5eb1cb19fc0c60d664b828e872a3c9fccf47841332e3517fb14668f4ad3e08
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32256b2c4c4a61eddd4aaf2aaba0b5bb581f6fc968419c62f7dff3dd99ec3145
3
  size 43122580
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:732a08b950183eeb5645d88e5c68a1b78b0843a7c42fc65a380ce442c0ef502f
3
  size 43122580
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab0372fdcad347477f1f6e103fe6b994aa44c8a3bc6c0da45db133064a81b26e
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12eead4dacfbb671ef1fd5f888a398102288bc99c781a4c7577bb0acff26e12b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35e92785679980f3fcd23b14ce1acaffcae115e3e9164492d0e4b31775d32447
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c831940ce942c692a0f94542970ff2f02dcc70548c29c8a5b2a0e5efa834004b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,126 +1,379 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0608187134502924,
5
- "eval_steps": 13,
6
- "global_step": 13,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.004678362573099415,
13
- "grad_norm": 5.241299629211426,
14
- "learning_rate": 2e-05,
15
- "loss": 2.985,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.004678362573099415,
20
  "eval_loss": 3.3358547687530518,
21
- "eval_runtime": 13.8778,
22
- "eval_samples_per_second": 6.485,
23
- "eval_steps_per_second": 6.485,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.00935672514619883,
28
- "grad_norm": 6.737635612487793,
29
- "learning_rate": 4e-05,
30
- "loss": 3.569,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.014035087719298246,
35
- "grad_norm": 6.156493663787842,
36
- "learning_rate": 6e-05,
37
- "loss": 3.4481,
 
 
 
 
 
 
 
 
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.01871345029239766,
42
- "grad_norm": 5.9444403648376465,
43
- "learning_rate": 8e-05,
44
- "loss": 3.2638,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.023391812865497075,
49
- "grad_norm": 5.542334079742432,
50
- "learning_rate": 0.0001,
51
- "loss": 3.2655,
 
 
 
 
 
 
 
 
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.028070175438596492,
56
- "grad_norm": 4.943231582641602,
57
- "learning_rate": 0.00012,
58
- "loss": 3.3256,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.03274853801169591,
63
- "grad_norm": 5.165916919708252,
64
- "learning_rate": 0.00014,
65
- "loss": 2.8769,
 
 
 
 
 
 
 
 
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.03742690058479532,
70
- "grad_norm": 4.797478199005127,
71
- "learning_rate": 0.00016,
72
- "loss": 2.8655,
 
 
 
 
 
 
 
 
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.042105263157894736,
77
- "grad_norm": 5.1365180015563965,
78
- "learning_rate": 0.00018,
79
- "loss": 3.0914,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 0.04678362573099415,
84
- "grad_norm": 5.111344337463379,
85
- "learning_rate": 0.0002,
86
- "loss": 3.0234,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 0.05146198830409357,
91
- "grad_norm": 4.561822891235352,
92
- "learning_rate": 0.0001996917333733128,
93
- "loss": 2.4678,
 
 
 
 
 
 
 
 
94
  "step": 11
95
  },
96
  {
97
- "epoch": 0.056140350877192984,
98
- "grad_norm": 4.718181610107422,
99
- "learning_rate": 0.00019876883405951377,
100
- "loss": 3.0406,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 0.0608187134502924,
105
- "grad_norm": 5.097424030303955,
106
- "learning_rate": 0.00019723699203976766,
107
- "loss": 2.6678,
108
- "step": 13
 
109
  },
110
  {
111
- "epoch": 0.0608187134502924,
112
- "eval_loss": 2.732684373855591,
113
- "eval_runtime": 13.866,
114
- "eval_samples_per_second": 6.491,
115
- "eval_steps_per_second": 6.491,
116
  "step": 13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
  ],
119
  "logging_steps": 1,
120
  "max_steps": 50,
121
  "num_input_tokens_seen": 0,
122
  "num_train_epochs": 1,
123
- "save_steps": 13,
124
  "stateful_callbacks": {
125
  "TrainerControl": {
126
  "args": {
@@ -133,7 +386,7 @@
133
  "attributes": {}
134
  }
135
  },
136
- "total_flos": 1108174983659520.0,
137
  "train_batch_size": 1,
138
  "trial_name": null,
139
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.03742690058479532,
5
+ "eval_steps": 2,
6
+ "global_step": 32,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0011695906432748538,
13
+ "grad_norm": 8.476790428161621,
14
+ "learning_rate": 1.0000000000000001e-07,
15
+ "loss": 3.1297,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.0011695906432748538,
20
  "eval_loss": 3.3358547687530518,
21
+ "eval_runtime": 14.0872,
22
+ "eval_samples_per_second": 6.389,
23
+ "eval_steps_per_second": 6.389,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.0023391812865497076,
28
+ "grad_norm": 11.907588958740234,
29
+ "learning_rate": 2.0000000000000002e-07,
30
+ "loss": 3.1984,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.0023391812865497076,
35
+ "eval_loss": 3.336097478866577,
36
+ "eval_runtime": 13.9412,
37
+ "eval_samples_per_second": 6.456,
38
+ "eval_steps_per_second": 6.456,
39
+ "step": 2
40
+ },
41
+ {
42
+ "epoch": 0.0035087719298245615,
43
+ "grad_norm": 9.732194900512695,
44
+ "learning_rate": 3.0000000000000004e-07,
45
+ "loss": 2.9559,
46
  "step": 3
47
  },
48
  {
49
+ "epoch": 0.004678362573099415,
50
+ "grad_norm": 8.285279273986816,
51
+ "learning_rate": 4.0000000000000003e-07,
52
+ "loss": 2.7216,
53
  "step": 4
54
  },
55
  {
56
+ "epoch": 0.004678362573099415,
57
+ "eval_loss": 3.336188793182373,
58
+ "eval_runtime": 13.9965,
59
+ "eval_samples_per_second": 6.43,
60
+ "eval_steps_per_second": 6.43,
61
+ "step": 4
62
+ },
63
+ {
64
+ "epoch": 0.005847953216374269,
65
+ "grad_norm": 11.178544044494629,
66
+ "learning_rate": 5.000000000000001e-07,
67
+ "loss": 3.3045,
68
  "step": 5
69
  },
70
  {
71
+ "epoch": 0.007017543859649123,
72
+ "grad_norm": 31.653640747070312,
73
+ "learning_rate": 6.000000000000001e-07,
74
+ "loss": 5.7524,
75
  "step": 6
76
  },
77
  {
78
+ "epoch": 0.007017543859649123,
79
+ "eval_loss": 3.3365745544433594,
80
+ "eval_runtime": 14.1369,
81
+ "eval_samples_per_second": 6.366,
82
+ "eval_steps_per_second": 6.366,
83
+ "step": 6
84
+ },
85
+ {
86
+ "epoch": 0.008187134502923977,
87
+ "grad_norm": 10.910638809204102,
88
+ "learning_rate": 7.000000000000001e-07,
89
+ "loss": 3.5631,
90
  "step": 7
91
  },
92
  {
93
+ "epoch": 0.00935672514619883,
94
+ "grad_norm": 12.292998313903809,
95
+ "learning_rate": 8.000000000000001e-07,
96
+ "loss": 3.2113,
97
+ "step": 8
98
+ },
99
+ {
100
+ "epoch": 0.00935672514619883,
101
+ "eval_loss": 3.335761785507202,
102
+ "eval_runtime": 14.0918,
103
+ "eval_samples_per_second": 6.387,
104
+ "eval_steps_per_second": 6.387,
105
  "step": 8
106
  },
107
  {
108
+ "epoch": 0.010526315789473684,
109
+ "grad_norm": 10.52215576171875,
110
+ "learning_rate": 9e-07,
111
+ "loss": 3.4784,
112
  "step": 9
113
  },
114
  {
115
+ "epoch": 0.011695906432748537,
116
+ "grad_norm": 10.834870338439941,
117
+ "learning_rate": 1.0000000000000002e-06,
118
+ "loss": 3.5036,
119
  "step": 10
120
  },
121
  {
122
+ "epoch": 0.011695906432748537,
123
+ "eval_loss": 3.335782289505005,
124
+ "eval_runtime": 14.2442,
125
+ "eval_samples_per_second": 6.318,
126
+ "eval_steps_per_second": 6.318,
127
+ "step": 10
128
+ },
129
+ {
130
+ "epoch": 0.012865497076023392,
131
+ "grad_norm": 14.306832313537598,
132
+ "learning_rate": 1.1e-06,
133
+ "loss": 3.7758,
134
  "step": 11
135
  },
136
  {
137
+ "epoch": 0.014035087719298246,
138
+ "grad_norm": 9.94528579711914,
139
+ "learning_rate": 1.2000000000000002e-06,
140
+ "loss": 3.173,
141
  "step": 12
142
  },
143
  {
144
+ "epoch": 0.014035087719298246,
145
+ "eval_loss": 3.33567476272583,
146
+ "eval_runtime": 14.2012,
147
+ "eval_samples_per_second": 6.338,
148
+ "eval_steps_per_second": 6.338,
149
+ "step": 12
150
  },
151
  {
152
+ "epoch": 0.0152046783625731,
153
+ "grad_norm": 12.635165214538574,
154
+ "learning_rate": 1.3e-06,
155
+ "loss": 2.9429,
 
156
  "step": 13
157
+ },
158
+ {
159
+ "epoch": 0.016374269005847954,
160
+ "grad_norm": 10.34188461303711,
161
+ "learning_rate": 1.4000000000000001e-06,
162
+ "loss": 3.5056,
163
+ "step": 14
164
+ },
165
+ {
166
+ "epoch": 0.016374269005847954,
167
+ "eval_loss": 3.33496356010437,
168
+ "eval_runtime": 13.9923,
169
+ "eval_samples_per_second": 6.432,
170
+ "eval_steps_per_second": 6.432,
171
+ "step": 14
172
+ },
173
+ {
174
+ "epoch": 0.017543859649122806,
175
+ "grad_norm": 10.396870613098145,
176
+ "learning_rate": 1.5e-06,
177
+ "loss": 3.1286,
178
+ "step": 15
179
+ },
180
+ {
181
+ "epoch": 0.01871345029239766,
182
+ "grad_norm": 11.446793556213379,
183
+ "learning_rate": 1.6000000000000001e-06,
184
+ "loss": 3.5737,
185
+ "step": 16
186
+ },
187
+ {
188
+ "epoch": 0.01871345029239766,
189
+ "eval_loss": 3.333711862564087,
190
+ "eval_runtime": 13.9756,
191
+ "eval_samples_per_second": 6.44,
192
+ "eval_steps_per_second": 6.44,
193
+ "step": 16
194
+ },
195
+ {
196
+ "epoch": 0.019883040935672516,
197
+ "grad_norm": 8.924163818359375,
198
+ "learning_rate": 1.7000000000000002e-06,
199
+ "loss": 3.246,
200
+ "step": 17
201
+ },
202
+ {
203
+ "epoch": 0.021052631578947368,
204
+ "grad_norm": 12.621112823486328,
205
+ "learning_rate": 1.8e-06,
206
+ "loss": 3.3298,
207
+ "step": 18
208
+ },
209
+ {
210
+ "epoch": 0.021052631578947368,
211
+ "eval_loss": 3.332756996154785,
212
+ "eval_runtime": 13.9715,
213
+ "eval_samples_per_second": 6.442,
214
+ "eval_steps_per_second": 6.442,
215
+ "step": 18
216
+ },
217
+ {
218
+ "epoch": 0.022222222222222223,
219
+ "grad_norm": 16.083580017089844,
220
+ "learning_rate": 1.9e-06,
221
+ "loss": 3.8307,
222
+ "step": 19
223
+ },
224
+ {
225
+ "epoch": 0.023391812865497075,
226
+ "grad_norm": 9.164115905761719,
227
+ "learning_rate": 2.0000000000000003e-06,
228
+ "loss": 3.2996,
229
+ "step": 20
230
+ },
231
+ {
232
+ "epoch": 0.023391812865497075,
233
+ "eval_loss": 3.3321051597595215,
234
+ "eval_runtime": 14.0785,
235
+ "eval_samples_per_second": 6.393,
236
+ "eval_steps_per_second": 6.393,
237
+ "step": 20
238
+ },
239
+ {
240
+ "epoch": 0.02456140350877193,
241
+ "grad_norm": 13.98554801940918,
242
+ "learning_rate": 2.1000000000000002e-06,
243
+ "loss": 3.6964,
244
+ "step": 21
245
+ },
246
+ {
247
+ "epoch": 0.025730994152046785,
248
+ "grad_norm": 9.490047454833984,
249
+ "learning_rate": 2.2e-06,
250
+ "loss": 3.5336,
251
+ "step": 22
252
+ },
253
+ {
254
+ "epoch": 0.025730994152046785,
255
+ "eval_loss": 3.330921173095703,
256
+ "eval_runtime": 13.9892,
257
+ "eval_samples_per_second": 6.434,
258
+ "eval_steps_per_second": 6.434,
259
+ "step": 22
260
+ },
261
+ {
262
+ "epoch": 0.026900584795321637,
263
+ "grad_norm": 11.965221405029297,
264
+ "learning_rate": 2.3e-06,
265
+ "loss": 4.1548,
266
+ "step": 23
267
+ },
268
+ {
269
+ "epoch": 0.028070175438596492,
270
+ "grad_norm": 7.872015953063965,
271
+ "learning_rate": 2.4000000000000003e-06,
272
+ "loss": 2.6803,
273
+ "step": 24
274
+ },
275
+ {
276
+ "epoch": 0.028070175438596492,
277
+ "eval_loss": 3.330366373062134,
278
+ "eval_runtime": 14.0471,
279
+ "eval_samples_per_second": 6.407,
280
+ "eval_steps_per_second": 6.407,
281
+ "step": 24
282
+ },
283
+ {
284
+ "epoch": 0.029239766081871343,
285
+ "grad_norm": 15.018107414245605,
286
+ "learning_rate": 2.5e-06,
287
+ "loss": 3.3743,
288
+ "step": 25
289
+ },
290
+ {
291
+ "epoch": 0.0304093567251462,
292
+ "grad_norm": 8.211061477661133,
293
+ "learning_rate": 2.6e-06,
294
+ "loss": 2.9239,
295
+ "step": 26
296
+ },
297
+ {
298
+ "epoch": 0.0304093567251462,
299
+ "eval_loss": 3.3289644718170166,
300
+ "eval_runtime": 14.0324,
301
+ "eval_samples_per_second": 6.414,
302
+ "eval_steps_per_second": 6.414,
303
+ "step": 26
304
+ },
305
+ {
306
+ "epoch": 0.031578947368421054,
307
+ "grad_norm": 9.60824203491211,
308
+ "learning_rate": 2.7e-06,
309
+ "loss": 2.8382,
310
+ "step": 27
311
+ },
312
+ {
313
+ "epoch": 0.03274853801169591,
314
+ "grad_norm": 16.03299903869629,
315
+ "learning_rate": 2.8000000000000003e-06,
316
+ "loss": 3.9005,
317
+ "step": 28
318
+ },
319
+ {
320
+ "epoch": 0.03274853801169591,
321
+ "eval_loss": 3.3265655040740967,
322
+ "eval_runtime": 13.9247,
323
+ "eval_samples_per_second": 6.463,
324
+ "eval_steps_per_second": 6.463,
325
+ "step": 28
326
+ },
327
+ {
328
+ "epoch": 0.03391812865497076,
329
+ "grad_norm": 7.8519768714904785,
330
+ "learning_rate": 2.9e-06,
331
+ "loss": 2.8852,
332
+ "step": 29
333
+ },
334
+ {
335
+ "epoch": 0.03508771929824561,
336
+ "grad_norm": 11.132136344909668,
337
+ "learning_rate": 3e-06,
338
+ "loss": 2.6383,
339
+ "step": 30
340
+ },
341
+ {
342
+ "epoch": 0.03508771929824561,
343
+ "eval_loss": 3.324815273284912,
344
+ "eval_runtime": 13.9716,
345
+ "eval_samples_per_second": 6.442,
346
+ "eval_steps_per_second": 6.442,
347
+ "step": 30
348
+ },
349
+ {
350
+ "epoch": 0.03625730994152047,
351
+ "grad_norm": 10.680882453918457,
352
+ "learning_rate": 3.1e-06,
353
+ "loss": 3.8337,
354
+ "step": 31
355
+ },
356
+ {
357
+ "epoch": 0.03742690058479532,
358
+ "grad_norm": 10.323698043823242,
359
+ "learning_rate": 3.2000000000000003e-06,
360
+ "loss": 3.2712,
361
+ "step": 32
362
+ },
363
+ {
364
+ "epoch": 0.03742690058479532,
365
+ "eval_loss": 3.3221709728240967,
366
+ "eval_runtime": 14.0736,
367
+ "eval_samples_per_second": 6.395,
368
+ "eval_steps_per_second": 6.395,
369
+ "step": 32
370
  }
371
  ],
372
  "logging_steps": 1,
373
  "max_steps": 50,
374
  "num_input_tokens_seen": 0,
375
  "num_train_epochs": 1,
376
+ "save_steps": 2,
377
  "stateful_callbacks": {
378
  "TrainerControl": {
379
  "args": {
 
386
  "attributes": {}
387
  }
388
  },
389
+ "total_flos": 680655700623360.0,
390
  "train_batch_size": 1,
391
  "trial_name": null,
392
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c7736ce88157ec6f559044bfa1c039268935c73c9b24ebd32fb5c3483c5e3d0
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f0f0c46bb8a8aea130ec5fac61dac3c1395e8dd1d272da8feeb25551e173ef
3
  size 6776