bchirag commited on
Commit
8ea6672
·
verified ·
1 Parent(s): 662abc3

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71c3eec7b0756605f909fb56d8bd47c764d63bb71cd3ceae33f09b51c736f629
3
  size 73911504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:728e3a0b8303f7228f274b98fc09c983203b21f9b6a3e49444a6c43e91b8a786
3
  size 73911504
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75c43c4b01e0c50ea6d225e75578bf1d4867c539c22e2cff4276f33ca52bbf2
3
+ size 75477253
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45f8c553e33f895d33936e35068ae80d26b53e8e171309ff9d8457a626c93ec1
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e6217e4a343d366cb4a03823bfc52360a0a0768addc7ab903a9007c55201773
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.3438641736514077,
6
+ "eval_steps": 500,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.967700420320034,
14
+ "epoch": 0.008596604341285192,
15
+ "grad_norm": 1.28125,
16
+ "learning_rate": 7.500000000000001e-05,
17
+ "loss": 1.1922,
18
+ "mean_token_accuracy": 0.7462927460670471,
19
+ "num_tokens": 24955.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.8809127531945705,
24
+ "epoch": 0.017193208682570384,
25
+ "grad_norm": 0.640625,
26
+ "learning_rate": 0.00015833333333333332,
27
+ "loss": 0.9056,
28
+ "mean_token_accuracy": 0.7825774624943733,
29
+ "num_tokens": 50126.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 0.7974838063120842,
34
+ "epoch": 0.025789813023855575,
35
+ "grad_norm": 0.466796875,
36
+ "learning_rate": 0.0001999795133245889,
37
+ "loss": 0.8262,
38
+ "mean_token_accuracy": 0.7897384226322174,
39
+ "num_tokens": 76211.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 0.6672817915678024,
44
+ "epoch": 0.03438641736514077,
45
+ "grad_norm": 0.53125,
46
+ "learning_rate": 0.00019981567028232514,
47
+ "loss": 0.6561,
48
+ "mean_token_accuracy": 0.8081615135073662,
49
+ "num_tokens": 99676.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 0.7113775327801705,
54
+ "epoch": 0.04298302170642596,
55
+ "grad_norm": 0.41015625,
56
+ "learning_rate": 0.00019948825269822934,
57
+ "loss": 0.6759,
58
+ "mean_token_accuracy": 0.8067922666668892,
59
+ "num_tokens": 122341.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 0.6950222067534924,
64
+ "epoch": 0.05157962604771115,
65
+ "grad_norm": 0.470703125,
66
+ "learning_rate": 0.00019899779713315575,
67
+ "loss": 0.7108,
68
+ "mean_token_accuracy": 0.8030081078410148,
69
+ "num_tokens": 143840.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.7753045067191124,
74
+ "epoch": 0.060176230388996346,
75
+ "grad_norm": 0.388671875,
76
+ "learning_rate": 0.00019834510732908315,
77
+ "loss": 0.7512,
78
+ "mean_token_accuracy": 0.7937675878405571,
79
+ "num_tokens": 169630.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.7161391645669937,
84
+ "epoch": 0.06877283473028153,
85
+ "grad_norm": 0.384765625,
86
+ "learning_rate": 0.0001975312528919697,
87
+ "loss": 0.7586,
88
+ "mean_token_accuracy": 0.793881094455719,
89
+ "num_tokens": 192514.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.7045083746314049,
94
+ "epoch": 0.07736943907156674,
95
+ "grad_norm": 0.353515625,
96
+ "learning_rate": 0.00019655756753891916,
97
+ "loss": 0.6853,
98
+ "mean_token_accuracy": 0.8054596990346908,
99
+ "num_tokens": 212398.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.7032747760415077,
104
+ "epoch": 0.08596604341285193,
105
+ "grad_norm": 0.396484375,
106
+ "learning_rate": 0.0001954256469125301,
107
+ "loss": 0.7434,
108
+ "mean_token_accuracy": 0.797465617954731,
109
+ "num_tokens": 236559.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.6814083471894264,
114
+ "epoch": 0.09456264775413711,
115
+ "grad_norm": 0.328125,
116
+ "learning_rate": 0.00019413734596601104,
117
+ "loss": 0.653,
118
+ "mean_token_accuracy": 0.809566143155098,
119
+ "num_tokens": 259990.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 0.6919607870280743,
124
+ "epoch": 0.1031592520954223,
125
+ "grad_norm": 0.388671875,
126
+ "learning_rate": 0.0001926947759233459,
127
+ "loss": 0.7015,
128
+ "mean_token_accuracy": 0.8029481008648872,
129
+ "num_tokens": 281771.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 0.7389424860477447,
134
+ "epoch": 0.1117558564367075,
135
+ "grad_norm": 0.353515625,
136
+ "learning_rate": 0.00019110030081949156,
137
+ "loss": 0.7648,
138
+ "mean_token_accuracy": 0.7935838386416435,
139
+ "num_tokens": 305329.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 0.7182212144136428,
144
+ "epoch": 0.12035246077799269,
145
+ "grad_norm": 0.37109375,
146
+ "learning_rate": 0.0001893565336262773,
147
+ "loss": 0.7068,
148
+ "mean_token_accuracy": 0.8010134562849999,
149
+ "num_tokens": 329386.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 0.6653173223137856,
154
+ "epoch": 0.1289490651192779,
155
+ "grad_norm": 0.34375,
156
+ "learning_rate": 0.00018746633197035527,
157
+ "loss": 0.658,
158
+ "mean_token_accuracy": 0.8119655668735504,
159
+ "num_tokens": 349961.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 0.7046581447124481,
164
+ "epoch": 0.13754566946056307,
165
+ "grad_norm": 0.3125,
166
+ "learning_rate": 0.00018543279345021834,
167
+ "loss": 0.7212,
168
+ "mean_token_accuracy": 0.7986769005656242,
169
+ "num_tokens": 373425.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 0.7094632089138031,
174
+ "epoch": 0.14614227380184827,
175
+ "grad_norm": 0.333984375,
176
+ "learning_rate": 0.00018325925055996076,
177
+ "loss": 0.6972,
178
+ "mean_token_accuracy": 0.8056794568896294,
179
+ "num_tokens": 399788.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 0.6955713860690593,
184
+ "epoch": 0.15473887814313347,
185
+ "grad_norm": 0.48046875,
186
+ "learning_rate": 0.0001809492652280996,
187
+ "loss": 0.6953,
188
+ "mean_token_accuracy": 0.8080685377120972,
189
+ "num_tokens": 421256.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 0.672984279692173,
194
+ "epoch": 0.16333548248441865,
195
+ "grad_norm": 0.421875,
196
+ "learning_rate": 0.00017850662298040678,
197
+ "loss": 0.6758,
198
+ "mean_token_accuracy": 0.8065061077475548,
199
+ "num_tokens": 444196.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 0.7029214389622211,
204
+ "epoch": 0.17193208682570385,
205
+ "grad_norm": 0.33984375,
206
+ "learning_rate": 0.00017593532673631766,
207
+ "loss": 0.7028,
208
+ "mean_token_accuracy": 0.8078923970460892,
209
+ "num_tokens": 468312.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 0.6828642345964908,
214
+ "epoch": 0.18052869116698905,
215
+ "grad_norm": 0.431640625,
216
+ "learning_rate": 0.00017323959024908209,
217
+ "loss": 0.6942,
218
+ "mean_token_accuracy": 0.8095744833350181,
219
+ "num_tokens": 490571.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.731064935028553,
224
+ "epoch": 0.18912529550827423,
225
+ "grad_norm": 0.37109375,
226
+ "learning_rate": 0.00017042383120040834,
227
+ "loss": 0.7514,
228
+ "mean_token_accuracy": 0.8010679826140403,
229
+ "num_tokens": 514374.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.6761481508612632,
234
+ "epoch": 0.19772189984955943,
235
+ "grad_norm": 0.40234375,
236
+ "learning_rate": 0.0001674926639609157,
237
+ "loss": 0.6708,
238
+ "mean_token_accuracy": 0.8103428333997726,
239
+ "num_tokens": 536838.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.7191244259476661,
244
+ "epoch": 0.2063185041908446,
245
+ "grad_norm": 0.3828125,
246
+ "learning_rate": 0.0001644508920282601,
247
+ "loss": 0.7343,
248
+ "mean_token_accuracy": 0.7990544006228447,
249
+ "num_tokens": 560798.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.7184930846095086,
254
+ "epoch": 0.2149151085321298,
255
+ "grad_norm": 0.287109375,
256
+ "learning_rate": 0.00016130350015532496,
257
+ "loss": 0.7516,
258
+ "mean_token_accuracy": 0.7994581028819084,
259
+ "num_tokens": 586321.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 0.7019176624715329,
264
+ "epoch": 0.223511712873415,
265
+ "grad_norm": 0.29296875,
266
+ "learning_rate": 0.0001580556461813766,
267
+ "loss": 0.7465,
268
+ "mean_token_accuracy": 0.8056053712964057,
269
+ "num_tokens": 612379.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 0.7077236130833626,
274
+ "epoch": 0.23210831721470018,
275
+ "grad_norm": 0.408203125,
276
+ "learning_rate": 0.00015471265257957202,
277
+ "loss": 0.734,
278
+ "mean_token_accuracy": 0.8064405977725982,
279
+ "num_tokens": 637314.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 0.7111990034580231,
284
+ "epoch": 0.24070492155598538,
285
+ "grad_norm": 0.318359375,
286
+ "learning_rate": 0.00015127999773467002,
287
+ "loss": 0.7056,
288
+ "mean_token_accuracy": 0.806194306910038,
289
+ "num_tokens": 660474.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 0.7012526571750641,
294
+ "epoch": 0.2493015258972706,
295
+ "grad_norm": 0.3125,
296
+ "learning_rate": 0.00014776330696523963,
297
+ "loss": 0.7173,
298
+ "mean_token_accuracy": 0.8059186190366745,
299
+ "num_tokens": 684161.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 0.7212809801101685,
304
+ "epoch": 0.2578981302385558,
305
+ "grad_norm": 0.337890625,
306
+ "learning_rate": 0.00014416834330507856,
307
+ "loss": 0.7445,
308
+ "mean_token_accuracy": 0.7970411285758019,
309
+ "num_tokens": 705287.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 0.6817912630736828,
314
+ "epoch": 0.26649473457984096,
315
+ "grad_norm": 0.30078125,
316
+ "learning_rate": 0.00014050099805894837,
317
+ "loss": 0.6738,
318
+ "mean_token_accuracy": 0.8141646087169647,
319
+ "num_tokens": 728784.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 0.7327330216765404,
324
+ "epoch": 0.27509133892112614,
325
+ "grad_norm": 0.400390625,
326
+ "learning_rate": 0.00013676728114810367,
327
+ "loss": 0.7564,
328
+ "mean_token_accuracy": 0.7981535136699677,
329
+ "num_tokens": 752938.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 0.6538611635565758,
334
+ "epoch": 0.28368794326241137,
335
+ "grad_norm": 0.34765625,
336
+ "learning_rate": 0.00013297331126143667,
337
+ "loss": 0.6562,
338
+ "mean_token_accuracy": 0.8156133487820625,
339
+ "num_tokens": 776014.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 0.6695164687931537,
344
+ "epoch": 0.29228454760369654,
345
+ "grad_norm": 0.400390625,
346
+ "learning_rate": 0.00012912530582837682,
347
+ "loss": 0.6904,
348
+ "mean_token_accuracy": 0.8143788442015648,
349
+ "num_tokens": 799890.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 0.7247762531042099,
354
+ "epoch": 0.3008811519449817,
355
+ "grad_norm": 0.4296875,
356
+ "learning_rate": 0.000125229570829978,
357
+ "loss": 0.7753,
358
+ "mean_token_accuracy": 0.8005941316485405,
359
+ "num_tokens": 825088.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 0.6589260399341583,
364
+ "epoch": 0.30947775628626695,
365
+ "grad_norm": 0.302734375,
366
+ "learning_rate": 0.0001212924904648902,
367
+ "loss": 0.6323,
368
+ "mean_token_accuracy": 0.8167923167347908,
369
+ "num_tokens": 849091.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 0.6572450198233127,
374
+ "epoch": 0.3180743606275521,
375
+ "grad_norm": 0.427734375,
376
+ "learning_rate": 0.00011732051668715081,
377
+ "loss": 0.6631,
378
+ "mean_token_accuracy": 0.8133938252925873,
379
+ "num_tokens": 871233.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 0.6870896026492119,
384
+ "epoch": 0.3266709649688373,
385
+ "grad_norm": 0.375,
386
+ "learning_rate": 0.00011332015863294076,
387
+ "loss": 0.7134,
388
+ "mean_token_accuracy": 0.807905575633049,
389
+ "num_tokens": 895100.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 0.7120285458862782,
394
+ "epoch": 0.3352675693101225,
395
+ "grad_norm": 0.3203125,
396
+ "learning_rate": 0.00010929797195363259,
397
+ "loss": 0.7283,
398
+ "mean_token_accuracy": 0.8010812431573868,
399
+ "num_tokens": 917556.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 0.6840016417205333,
404
+ "epoch": 0.3438641736514077,
405
+ "grad_norm": 0.3671875,
406
+ "learning_rate": 0.00010526054807261067,
407
+ "loss": 0.688,
408
+ "mean_token_accuracy": 0.8081021830439568,
409
+ "num_tokens": 939116.0,
410
+ "step": 400
411
+ }
412
+ ],
413
+ "logging_steps": 10,
414
+ "max_steps": 800,
415
+ "num_input_tokens_seen": 0,
416
+ "num_train_epochs": 1,
417
+ "save_steps": 200,
418
+ "stateful_callbacks": {
419
+ "TrainerControl": {
420
+ "args": {
421
+ "should_epoch_stop": false,
422
+ "should_evaluate": false,
423
+ "should_log": false,
424
+ "should_save": true,
425
+ "should_training_stop": false
426
+ },
427
+ "attributes": {}
428
+ }
429
+ },
430
+ "total_flos": 1.4054474244501504e+16,
431
+ "train_batch_size": 4,
432
+ "trial_name": null,
433
+ "trial_params": null
434
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b846167b1eefc9ae9402f990ca6668ded5cc7f03818fc6f1f489a92d9baf507b
3
+ size 6225