emilykang committed on
Commit
c355768
1 Parent(s): a63a3be

Training in progress, epoch 1

Browse files
adapter_config.json CHANGED
@@ -20,10 +20,13 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "k_proj",
24
- "q_proj",
25
  "o_proj",
26
- "v_proj"
 
 
 
 
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "o_proj",
24
+ "gate_proj",
25
+ "v_proj",
26
+ "down_proj",
27
+ "up_proj",
28
+ "q_proj",
29
+ "k_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02d98634348ac35ca4646d3d30be57be7c7ef7800350abc4bc36484d49c91354
3
- size 7391832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b4e76984a3e1a558b51b12e9f33b601eed99f29ddeb03c9f0861577550188a9
3
+ size 39256704
trainer_state.json ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 410,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.24390243902439024,
13
+ "grad_norm": 7.625,
14
+ "learning_rate": 0.00019970658011837404,
15
+ "loss": 2.0086,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.4878048780487805,
20
+ "grad_norm": 0.59033203125,
21
+ "learning_rate": 0.00019882804237803488,
22
+ "loss": 1.1443,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.7317073170731707,
27
+ "grad_norm": 0.736328125,
28
+ "learning_rate": 0.00019736954238777792,
29
+ "loss": 0.985,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.975609756097561,
34
+ "grad_norm": 0.50048828125,
35
+ "learning_rate": 0.00019533963920549306,
36
+ "loss": 0.918,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 1.2195121951219512,
41
+ "grad_norm": 0.36083984375,
42
+ "learning_rate": 0.0001927502451102095,
43
+ "loss": 0.7771,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 1.4634146341463414,
48
+ "grad_norm": 0.376953125,
49
+ "learning_rate": 0.00018961655569610557,
50
+ "loss": 0.8079,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 1.7073170731707317,
55
+ "grad_norm": 0.314208984375,
56
+ "learning_rate": 0.00018595696069872013,
57
+ "loss": 0.7491,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 1.951219512195122,
62
+ "grad_norm": 0.3203125,
63
+ "learning_rate": 0.00018179293607667178,
64
+ "loss": 0.7349,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 2.1951219512195124,
69
+ "grad_norm": 0.331787109375,
70
+ "learning_rate": 0.0001771489179821943,
71
+ "loss": 0.7468,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 2.4390243902439024,
76
+ "grad_norm": 0.374755859375,
77
+ "learning_rate": 0.0001720521593600787,
78
+ "loss": 0.7582,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 2.682926829268293,
83
+ "grad_norm": 0.3662109375,
84
+ "learning_rate": 0.00016653257001655652,
85
+ "loss": 0.715,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 2.926829268292683,
90
+ "grad_norm": 0.413818359375,
91
+ "learning_rate": 0.0001606225410966638,
92
+ "loss": 0.7404,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 3.1707317073170733,
97
+ "grad_norm": 0.349609375,
98
+ "learning_rate": 0.00015435675500012212,
99
+ "loss": 0.6844,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 3.4146341463414633,
104
+ "grad_norm": 0.48095703125,
105
+ "learning_rate": 0.0001477719818512263,
106
+ "loss": 0.7568,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 3.658536585365854,
111
+ "grad_norm": 0.369384765625,
112
+ "learning_rate": 0.00014090686371713402,
113
+ "loss": 0.6647,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 3.902439024390244,
118
+ "grad_norm": 0.422607421875,
119
+ "learning_rate": 0.00013380168784085027,
120
+ "loss": 0.698,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 4.146341463414634,
125
+ "grad_norm": 0.38427734375,
126
+ "learning_rate": 0.0001264981502196662,
127
+ "loss": 0.6861,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 4.390243902439025,
132
+ "grad_norm": 0.38525390625,
133
+ "learning_rate": 0.00011903911091646684,
134
+ "loss": 0.6855,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 4.634146341463414,
139
+ "grad_norm": 0.384033203125,
140
+ "learning_rate": 0.00011146834253984006,
141
+ "loss": 0.6797,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 4.878048780487805,
146
+ "grad_norm": 0.365966796875,
147
+ "learning_rate": 0.00010383027336900355,
148
+ "loss": 0.6865,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 5.121951219512195,
153
+ "grad_norm": 0.39794921875,
154
+ "learning_rate": 9.616972663099647e-05,
155
+ "loss": 0.6859,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 5.365853658536586,
160
+ "grad_norm": 0.3837890625,
161
+ "learning_rate": 8.853165746015997e-05,
162
+ "loss": 0.6636,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 5.609756097560975,
167
+ "grad_norm": 0.445068359375,
168
+ "learning_rate": 8.096088908353315e-05,
169
+ "loss": 0.6883,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 5.853658536585366,
174
+ "grad_norm": 0.4130859375,
175
+ "learning_rate": 7.350184978033386e-05,
176
+ "loss": 0.6432,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 6.097560975609756,
181
+ "grad_norm": 0.380859375,
182
+ "learning_rate": 6.619831215914974e-05,
183
+ "loss": 0.6597,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 6.341463414634147,
188
+ "grad_norm": 0.407958984375,
189
+ "learning_rate": 5.909313628286601e-05,
190
+ "loss": 0.6574,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 6.585365853658536,
195
+ "grad_norm": 0.3974609375,
196
+ "learning_rate": 5.222801814877369e-05,
197
+ "loss": 0.6499,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 6.829268292682927,
202
+ "grad_norm": 0.40625,
203
+ "learning_rate": 4.56432449998779e-05,
204
+ "loss": 0.6436,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 7.073170731707317,
209
+ "grad_norm": 0.3994140625,
210
+ "learning_rate": 3.937745890333623e-05,
211
+ "loss": 0.6644,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 7.317073170731708,
216
+ "grad_norm": 0.40673828125,
217
+ "learning_rate": 3.346742998344348e-05,
218
+ "loss": 0.656,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 7.560975609756097,
223
+ "grad_norm": 0.408447265625,
224
+ "learning_rate": 2.794784063992131e-05,
225
+ "loss": 0.6244,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 7.804878048780488,
230
+ "grad_norm": 0.41015625,
231
+ "learning_rate": 2.2851082017805703e-05,
232
+ "loss": 0.6594,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 8.048780487804878,
237
+ "grad_norm": 0.437744140625,
238
+ "learning_rate": 1.8207063923328237e-05,
239
+ "loss": 0.6423,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 8.292682926829269,
244
+ "grad_norm": 0.43212890625,
245
+ "learning_rate": 1.4043039301279903e-05,
246
+ "loss": 0.635,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 8.536585365853659,
251
+ "grad_norm": 0.4013671875,
252
+ "learning_rate": 1.0383444303894452e-05,
253
+ "loss": 0.6408,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 8.78048780487805,
258
+ "grad_norm": 0.40869140625,
259
+ "learning_rate": 7.249754889790539e-06,
260
+ "loss": 0.6438,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 9.024390243902438,
265
+ "grad_norm": 0.3857421875,
266
+ "learning_rate": 4.660360794506946e-06,
267
+ "loss": 0.6282,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 9.268292682926829,
272
+ "grad_norm": 0.400634765625,
273
+ "learning_rate": 2.6304576122221035e-06,
274
+ "loss": 0.6359,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 9.512195121951219,
279
+ "grad_norm": 0.3935546875,
280
+ "learning_rate": 1.1719576219651585e-06,
281
+ "loss": 0.6467,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 9.75609756097561,
286
+ "grad_norm": 0.4140625,
287
+ "learning_rate": 2.934198816259559e-07,
288
+ "loss": 0.6479,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 10.0,
293
+ "grad_norm": 0.400390625,
294
+ "learning_rate": 0.0,
295
+ "loss": 0.6203,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 10.0,
300
+ "step": 410,
301
+ "total_flos": 2.000692923334656e+16,
302
+ "train_loss": 0.7381390141277778,
303
+ "train_runtime": 366.1191,
304
+ "train_samples_per_second": 4.479,
305
+ "train_steps_per_second": 1.12
306
+ }
307
+ ],
308
+ "logging_steps": 10,
309
+ "max_steps": 410,
310
+ "num_input_tokens_seen": 0,
311
+ "num_train_epochs": 10,
312
+ "save_steps": 500,
313
+ "total_flos": 2.000692923334656e+16,
314
+ "train_batch_size": 1,
315
+ "trial_name": null,
316
+ "trial_params": null
317
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1c53e2098ab48f443df2689d4e6dbdb3d25526eeb63e57661f0a0ff2d5df4b1
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4efee21f229df2d1d33be74e3b8c2ff4958f5d456ec8d17ee1d29d524be5c3ee
3
  size 5048