BobaZooba commited on
Commit
483de2d
1 Parent(s): ae38f33

Training in progress, step 50, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -217,4 +217,23 @@ The following `bitsandbytes` quantization config was used during training:
217
  ### Framework versions
218
 
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  - PEFT 0.6.2
 
217
  ### Framework versions
218
 
219
 
220
+ - PEFT 0.6.2
221
+ ## Training procedure
222
+
223
+
224
+ The following `bitsandbytes` quantization config was used during training:
225
+ - quant_method: bitsandbytes
226
+ - load_in_8bit: False
227
+ - load_in_4bit: True
228
+ - llm_int8_threshold: 6.0
229
+ - llm_int8_skip_modules: None
230
+ - llm_int8_enable_fp32_cpu_offload: False
231
+ - llm_int8_has_fp16_weight: True
232
+ - bnb_4bit_quant_type: nf4
233
+ - bnb_4bit_use_double_quant: True
234
+ - bnb_4bit_compute_dtype: float16
235
+
236
+ ### Framework versions
237
+
238
+
239
  - PEFT 0.6.2
last-checkpoint/adapter_config.json CHANGED
@@ -16,13 +16,13 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
 
19
  "o_proj",
20
- "k_proj",
21
  "up_proj",
22
- "v_proj",
23
- "q_proj",
24
  "down_proj",
25
- "gate_proj"
 
 
26
  ],
27
  "task_type": "CAUSAL_LM"
28
  }
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "q_proj",
20
  "o_proj",
 
21
  "up_proj",
 
 
22
  "down_proj",
23
+ "gate_proj",
24
+ "v_proj",
25
+ "k_proj"
26
  ],
27
  "task_type": "CAUSAL_LM"
28
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1d26ae4d91ee5406ecdb04754fcb794c49fdbf6bbb41861d8558309f1105dad
3
  size 42002136
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b49cafa17099fb3f799866f293f74c7421276b1b678b94cf3e64d676ebf640
3
  size 42002136
last-checkpoint/global_step50/mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4361fddc8c415ce1454201b361168cd2d48d0c91ea4864257e518f77d84aca18
3
  size 8182659910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3663cd25d4d871f3f96de9903f8a5a1c5145270aede215a61322c35013b86e01
3
  size 8182659910
last-checkpoint/global_step50/zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98d7331c6fcdd957f25dbd69d15a4939c741d5692b698f8c99287f9ad6fb941b
3
  size 251710893
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b891787fb2dd7b75a0aa4236a035d50c8e084175b628f675ac16bc245b5ee71
3
  size 251710893
last-checkpoint/trainer_state.json CHANGED
@@ -23,294 +23,294 @@
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 0.0,
26
- "loss": 1.664,
27
  "step": 3
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 8.613531161467861e-05,
32
- "loss": 1.7923,
33
  "step": 4
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 0.00013652123889719707,
38
- "loss": 1.8119,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 0.00017227062322935723,
44
- "loss": 1.6156,
45
  "step": 6
46
  },
47
  {
48
  "epoch": 0.0,
49
- "learning_rate": 0.0002,
50
- "loss": 1.4321,
51
  "step": 7
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 0.0002,
56
- "loss": 1.4738,
57
  "step": 8
58
  },
59
  {
60
  "epoch": 0.0,
61
- "learning_rate": 0.00019789473684210526,
62
- "loss": 1.7251,
63
  "step": 9
64
  },
65
  {
66
  "epoch": 0.0,
67
- "learning_rate": 0.00019578947368421054,
68
- "loss": 1.6262,
69
  "step": 10
70
  },
71
  {
72
  "epoch": 0.0,
73
- "learning_rate": 0.0001936842105263158,
74
- "loss": 1.4947,
75
  "step": 11
76
  },
77
  {
78
  "epoch": 0.0,
79
- "learning_rate": 0.00019157894736842104,
80
- "loss": 1.8006,
81
  "step": 12
82
  },
83
  {
84
  "epoch": 0.0,
85
- "learning_rate": 0.00018947368421052632,
86
- "loss": 1.6487,
87
  "step": 13
88
  },
89
  {
90
  "epoch": 0.0,
91
- "learning_rate": 0.0001873684210526316,
92
- "loss": 1.7926,
93
  "step": 14
94
  },
95
  {
96
  "epoch": 0.0,
97
- "learning_rate": 0.00018526315789473685,
98
- "loss": 1.5979,
99
  "step": 15
100
  },
101
  {
102
  "epoch": 0.0,
103
- "learning_rate": 0.0001831578947368421,
104
- "loss": 1.6008,
105
  "step": 16
106
  },
107
  {
108
  "epoch": 0.0,
109
- "learning_rate": 0.00018105263157894739,
110
- "loss": 1.7569,
111
  "step": 17
112
  },
113
  {
114
  "epoch": 0.0,
115
- "learning_rate": 0.00017894736842105264,
116
- "loss": 1.7647,
117
  "step": 18
118
  },
119
  {
120
  "epoch": 0.0,
121
- "learning_rate": 0.0001768421052631579,
122
- "loss": 1.7386,
123
  "step": 19
124
  },
125
  {
126
  "epoch": 0.0,
127
- "learning_rate": 0.00017473684210526317,
128
- "loss": 1.5272,
129
  "step": 20
130
  },
131
  {
132
  "epoch": 0.0,
133
- "learning_rate": 0.00017263157894736842,
134
- "loss": 1.8591,
135
  "step": 21
136
  },
137
  {
138
  "epoch": 0.0,
139
- "learning_rate": 0.0001705263157894737,
140
- "loss": 1.5613,
141
  "step": 22
142
  },
143
  {
144
  "epoch": 0.0,
145
- "learning_rate": 0.00016842105263157895,
146
- "loss": 1.3797,
147
  "step": 23
148
  },
149
  {
150
  "epoch": 0.0,
151
- "learning_rate": 0.00016631578947368423,
152
- "loss": 1.3594,
153
  "step": 24
154
  },
155
  {
156
  "epoch": 0.0,
157
- "learning_rate": 0.00016421052631578948,
158
- "loss": 1.6027,
159
  "step": 25
160
  },
161
  {
162
  "epoch": 0.0,
163
- "learning_rate": 0.00016210526315789473,
164
- "loss": 1.5134,
165
  "step": 26
166
  },
167
  {
168
  "epoch": 0.0,
169
- "learning_rate": 0.00016,
170
- "loss": 1.7149,
171
  "step": 27
172
  },
173
  {
174
  "epoch": 0.0,
175
- "learning_rate": 0.00015789473684210527,
176
- "loss": 1.8614,
177
  "step": 28
178
  },
179
  {
180
  "epoch": 0.0,
181
- "learning_rate": 0.00015578947368421052,
182
- "loss": 1.4819,
183
  "step": 29
184
  },
185
  {
186
  "epoch": 0.0,
187
- "learning_rate": 0.0001536842105263158,
188
- "loss": 1.623,
189
  "step": 30
190
  },
191
  {
192
  "epoch": 0.0,
193
- "learning_rate": 0.00015157894736842108,
194
- "loss": 1.7829,
195
  "step": 31
196
  },
197
  {
198
  "epoch": 0.0,
199
- "learning_rate": 0.00014947368421052633,
200
- "loss": 1.4878,
201
  "step": 32
202
  },
203
  {
204
  "epoch": 0.0,
205
- "learning_rate": 0.00014736842105263158,
206
- "loss": 1.7675,
207
  "step": 33
208
  },
209
  {
210
  "epoch": 0.0,
211
- "learning_rate": 0.00014526315789473686,
212
- "loss": 1.6652,
213
  "step": 34
214
  },
215
  {
216
  "epoch": 0.0,
217
- "learning_rate": 0.0001431578947368421,
218
- "loss": 1.4829,
219
  "step": 35
220
  },
221
  {
222
  "epoch": 0.0,
223
- "learning_rate": 0.00014105263157894736,
224
- "loss": 1.6084,
225
  "step": 36
226
  },
227
  {
228
  "epoch": 0.0,
229
- "learning_rate": 0.00013894736842105264,
230
- "loss": 1.5299,
231
  "step": 37
232
  },
233
  {
234
  "epoch": 0.0,
235
- "learning_rate": 0.0001368421052631579,
236
- "loss": 1.5337,
237
  "step": 38
238
  },
239
  {
240
  "epoch": 0.0,
241
- "learning_rate": 0.00013473684210526317,
242
- "loss": 1.4584,
243
  "step": 39
244
  },
245
  {
246
  "epoch": 0.0,
247
- "learning_rate": 0.00013263157894736842,
248
- "loss": 1.5648,
249
  "step": 40
250
  },
251
  {
252
  "epoch": 0.0,
253
- "learning_rate": 0.0001305263157894737,
254
- "loss": 1.6003,
255
  "step": 41
256
  },
257
  {
258
  "epoch": 0.0,
259
- "learning_rate": 0.00012842105263157895,
260
- "loss": 1.5679,
261
  "step": 42
262
  },
263
  {
264
  "epoch": 0.0,
265
- "learning_rate": 0.0001263157894736842,
266
- "loss": 1.425,
267
  "step": 43
268
  },
269
  {
270
  "epoch": 0.0,
271
- "learning_rate": 0.00012421052631578949,
272
- "loss": 1.2781,
273
  "step": 44
274
  },
275
  {
276
  "epoch": 0.0,
277
- "learning_rate": 0.00012210526315789474,
278
- "loss": 1.7008,
279
  "step": 45
280
  },
281
  {
282
  "epoch": 0.0,
283
- "learning_rate": 0.00012,
284
- "loss": 1.4367,
285
  "step": 46
286
  },
287
  {
288
  "epoch": 0.0,
289
- "learning_rate": 0.00011789473684210525,
290
- "loss": 1.4628,
291
  "step": 47
292
  },
293
  {
294
  "epoch": 0.0,
295
- "learning_rate": 0.00011578947368421053,
296
- "loss": 1.6098,
297
  "step": 48
298
  },
299
  {
300
  "epoch": 0.0,
301
- "learning_rate": 0.0001136842105263158,
302
- "loss": 1.4618,
303
  "step": 49
304
  },
305
  {
306
  "epoch": 0.0,
307
- "learning_rate": 0.00011157894736842105,
308
- "loss": 1.3719,
309
  "step": 50
310
  }
311
  ],
312
  "logging_steps": 1,
313
- "max_steps": 100,
314
  "num_train_epochs": 1,
315
  "save_steps": 50,
316
  "total_flos": 3065802843488256.0,
 
23
  {
24
  "epoch": 0.0,
25
  "learning_rate": 0.0,
26
+ "loss": 1.6536,
27
  "step": 3
28
  },
29
  {
30
  "epoch": 0.0,
31
  "learning_rate": 8.613531161467861e-05,
32
+ "loss": 1.7934,
33
  "step": 4
34
  },
35
  {
36
  "epoch": 0.0,
37
  "learning_rate": 0.00013652123889719707,
38
+ "loss": 1.8117,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.0,
43
  "learning_rate": 0.00017227062322935723,
44
+ "loss": 1.618,
45
  "step": 6
46
  },
47
  {
48
  "epoch": 0.0,
49
+ "learning_rate": 0.00017227062322935723,
50
+ "loss": 1.4355,
51
  "step": 7
52
  },
53
  {
54
  "epoch": 0.0,
55
  "learning_rate": 0.0002,
56
+ "loss": 1.4779,
57
  "step": 8
58
  },
59
  {
60
  "epoch": 0.0,
61
+ "learning_rate": 0.0002,
62
+ "loss": 1.7512,
63
  "step": 9
64
  },
65
  {
66
  "epoch": 0.0,
67
+ "learning_rate": 0.0002,
68
+ "loss": 1.6407,
69
  "step": 10
70
  },
71
  {
72
  "epoch": 0.0,
73
+ "learning_rate": 0.0001979166666666667,
74
+ "loss": 1.4995,
75
  "step": 11
76
  },
77
  {
78
  "epoch": 0.0,
79
+ "learning_rate": 0.00019583333333333334,
80
+ "loss": 1.8377,
81
  "step": 12
82
  },
83
  {
84
  "epoch": 0.0,
85
+ "learning_rate": 0.00019375000000000002,
86
+ "loss": 1.6649,
87
  "step": 13
88
  },
89
  {
90
  "epoch": 0.0,
91
+ "learning_rate": 0.00019166666666666667,
92
+ "loss": 1.8315,
93
  "step": 14
94
  },
95
  {
96
  "epoch": 0.0,
97
+ "learning_rate": 0.00018958333333333332,
98
+ "loss": 1.6504,
99
  "step": 15
100
  },
101
  {
102
  "epoch": 0.0,
103
+ "learning_rate": 0.0001875,
104
+ "loss": 1.6062,
105
  "step": 16
106
  },
107
  {
108
  "epoch": 0.0,
109
+ "learning_rate": 0.00018541666666666668,
110
+ "loss": 1.7712,
111
  "step": 17
112
  },
113
  {
114
  "epoch": 0.0,
115
+ "learning_rate": 0.00018333333333333334,
116
+ "loss": 1.7817,
117
  "step": 18
118
  },
119
  {
120
  "epoch": 0.0,
121
+ "learning_rate": 0.00018125000000000001,
122
+ "loss": 1.7626,
123
  "step": 19
124
  },
125
  {
126
  "epoch": 0.0,
127
+ "learning_rate": 0.0001791666666666667,
128
+ "loss": 1.5501,
129
  "step": 20
130
  },
131
  {
132
  "epoch": 0.0,
133
+ "learning_rate": 0.00017708333333333335,
134
+ "loss": 1.8781,
135
  "step": 21
136
  },
137
  {
138
  "epoch": 0.0,
139
+ "learning_rate": 0.000175,
140
+ "loss": 1.5841,
141
  "step": 22
142
  },
143
  {
144
  "epoch": 0.0,
145
+ "learning_rate": 0.00017291666666666668,
146
+ "loss": 1.3914,
147
  "step": 23
148
  },
149
  {
150
  "epoch": 0.0,
151
+ "learning_rate": 0.00017083333333333333,
152
+ "loss": 1.3798,
153
  "step": 24
154
  },
155
  {
156
  "epoch": 0.0,
157
+ "learning_rate": 0.00016875,
158
+ "loss": 1.599,
159
  "step": 25
160
  },
161
  {
162
  "epoch": 0.0,
163
+ "learning_rate": 0.0001666666666666667,
164
+ "loss": 1.5245,
165
  "step": 26
166
  },
167
  {
168
  "epoch": 0.0,
169
+ "learning_rate": 0.00016458333333333334,
170
+ "loss": 1.719,
171
  "step": 27
172
  },
173
  {
174
  "epoch": 0.0,
175
+ "learning_rate": 0.00016250000000000002,
176
+ "loss": 1.8812,
177
  "step": 28
178
  },
179
  {
180
  "epoch": 0.0,
181
+ "learning_rate": 0.00016041666666666667,
182
+ "loss": 1.4837,
183
  "step": 29
184
  },
185
  {
186
  "epoch": 0.0,
187
+ "learning_rate": 0.00015833333333333332,
188
+ "loss": 1.6244,
189
  "step": 30
190
  },
191
  {
192
  "epoch": 0.0,
193
+ "learning_rate": 0.00015625,
194
+ "loss": 1.7895,
195
  "step": 31
196
  },
197
  {
198
  "epoch": 0.0,
199
+ "learning_rate": 0.00015416666666666668,
200
+ "loss": 1.4903,
201
  "step": 32
202
  },
203
  {
204
  "epoch": 0.0,
205
+ "learning_rate": 0.00015208333333333333,
206
+ "loss": 1.7662,
207
  "step": 33
208
  },
209
  {
210
  "epoch": 0.0,
211
+ "learning_rate": 0.00015000000000000001,
212
+ "loss": 1.668,
213
  "step": 34
214
  },
215
  {
216
  "epoch": 0.0,
217
+ "learning_rate": 0.0001479166666666667,
218
+ "loss": 1.4777,
219
  "step": 35
220
  },
221
  {
222
  "epoch": 0.0,
223
+ "learning_rate": 0.00014583333333333335,
224
+ "loss": 1.6102,
225
  "step": 36
226
  },
227
  {
228
  "epoch": 0.0,
229
+ "learning_rate": 0.00014375,
230
+ "loss": 1.5228,
231
  "step": 37
232
  },
233
  {
234
  "epoch": 0.0,
235
+ "learning_rate": 0.00014166666666666668,
236
+ "loss": 1.5251,
237
  "step": 38
238
  },
239
  {
240
  "epoch": 0.0,
241
+ "learning_rate": 0.00013958333333333333,
242
+ "loss": 1.4569,
243
  "step": 39
244
  },
245
  {
246
  "epoch": 0.0,
247
+ "learning_rate": 0.0001375,
248
+ "loss": 1.5586,
249
  "step": 40
250
  },
251
  {
252
  "epoch": 0.0,
253
+ "learning_rate": 0.0001354166666666667,
254
+ "loss": 1.6027,
255
  "step": 41
256
  },
257
  {
258
  "epoch": 0.0,
259
+ "learning_rate": 0.00013333333333333334,
260
+ "loss": 1.5661,
261
  "step": 42
262
  },
263
  {
264
  "epoch": 0.0,
265
+ "learning_rate": 0.00013125000000000002,
266
+ "loss": 1.4163,
267
  "step": 43
268
  },
269
  {
270
  "epoch": 0.0,
271
+ "learning_rate": 0.00012916666666666667,
272
+ "loss": 1.2751,
273
  "step": 44
274
  },
275
  {
276
  "epoch": 0.0,
277
+ "learning_rate": 0.00012708333333333332,
278
+ "loss": 1.7024,
279
  "step": 45
280
  },
281
  {
282
  "epoch": 0.0,
283
+ "learning_rate": 0.000125,
284
+ "loss": 1.4381,
285
  "step": 46
286
  },
287
  {
288
  "epoch": 0.0,
289
+ "learning_rate": 0.00012291666666666668,
290
+ "loss": 1.4661,
291
  "step": 47
292
  },
293
  {
294
  "epoch": 0.0,
295
+ "learning_rate": 0.00012083333333333333,
296
+ "loss": 1.6117,
297
  "step": 48
298
  },
299
  {
300
  "epoch": 0.0,
301
+ "learning_rate": 0.00011875,
302
+ "loss": 1.4635,
303
  "step": 49
304
  },
305
  {
306
  "epoch": 0.0,
307
+ "learning_rate": 0.00011666666666666668,
308
+ "loss": 1.3727,
309
  "step": 50
310
  }
311
  ],
312
  "logging_steps": 1,
313
+ "max_steps": 101,
314
  "num_train_epochs": 1,
315
  "save_steps": 50,
316
  "total_flos": 3065802843488256.0,
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3b4ec3761ef4757a95a527eb558426cc30befdddfb2e65ccd9b15755214e1aa
3
  size 6328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7cbf95afd03db05e4a6c60c9e638cc6a5d0f42779b04f46a69a319a2385ecef
3
  size 6328