|
{ |
|
"best_metric": 2.3844265937805176, |
|
"best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/strategyqa_no_sys/checkpoint-500", |
|
"epoch": 10.0, |
|
"eval_steps": 50, |
|
"global_step": 650, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.40633881092071533, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8151, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5893375873565674, |
|
"learning_rate": 0.0001, |
|
"loss": 1.5545, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.4787861406803131, |
|
"learning_rate": 9.993784606094612e-05, |
|
"loss": 1.2226, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5218197703361511, |
|
"learning_rate": 9.975153876827008e-05, |
|
"loss": 1.1648, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.4085441827774048, |
|
"learning_rate": 9.944154131125642e-05, |
|
"loss": 1.1108, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 1.1378295421600342, |
|
"eval_runtime": 1.8486, |
|
"eval_samples_per_second": 148.764, |
|
"eval_steps_per_second": 6.492, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.38595476746559143, |
|
"learning_rate": 9.900862439242719e-05, |
|
"loss": 1.1433, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.38242650032043457, |
|
"learning_rate": 9.84538643114539e-05, |
|
"loss": 1.1424, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.36820024251937866, |
|
"learning_rate": 9.777864028930705e-05, |
|
"loss": 1.092, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.40036314725875854, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 1.0537, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.45688629150390625, |
|
"learning_rate": 9.607381059352038e-05, |
|
"loss": 1.0652, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_loss": 1.1235716342926025, |
|
"eval_runtime": 1.8515, |
|
"eval_samples_per_second": 148.526, |
|
"eval_steps_per_second": 6.481, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.4719078540802002, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 1.0962, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.5598475933074951, |
|
"learning_rate": 9.391107866851143e-05, |
|
"loss": 1.067, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.5334481000900269, |
|
"learning_rate": 9.266454408160779e-05, |
|
"loss": 1.0723, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.5928361415863037, |
|
"learning_rate": 9.131193871579975e-05, |
|
"loss": 0.9459, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.8523222804069519, |
|
"learning_rate": 8.985662536114613e-05, |
|
"loss": 0.9679, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"eval_loss": 1.1641343832015991, |
|
"eval_runtime": 1.8561, |
|
"eval_samples_per_second": 148.157, |
|
"eval_steps_per_second": 6.465, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.9512267112731934, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.8452, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.8635448217391968, |
|
"learning_rate": 8.665259359149132e-05, |
|
"loss": 0.8922, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 1.3180458545684814, |
|
"learning_rate": 8.491184090430364e-05, |
|
"loss": 0.8441, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.2508196830749512, |
|
"learning_rate": 8.308429187984297e-05, |
|
"loss": 0.9151, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.8130565881729126, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 0.7321, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_loss": 1.3882437944412231, |
|
"eval_runtime": 1.8575, |
|
"eval_samples_per_second": 148.047, |
|
"eval_steps_per_second": 6.46, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 1.3749483823776245, |
|
"learning_rate": 7.91871836117395e-05, |
|
"loss": 0.5473, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.105992078781128, |
|
"learning_rate": 7.712731319328798e-05, |
|
"loss": 0.549, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 1.8341376781463623, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.5206, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.7375411987304688, |
|
"learning_rate": 7.281053286765815e-05, |
|
"loss": 0.5435, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.8721497058868408, |
|
"learning_rate": 7.056435515653059e-05, |
|
"loss": 0.5562, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"eval_loss": 1.3442635536193848, |
|
"eval_runtime": 1.8579, |
|
"eval_samples_per_second": 148.019, |
|
"eval_steps_per_second": 6.459, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.7349268198013306, |
|
"learning_rate": 6.826705121831976e-05, |
|
"loss": 0.5269, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 2.820291757583618, |
|
"learning_rate": 6.592433251258423e-05, |
|
"loss": 0.3063, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 1.9900763034820557, |
|
"learning_rate": 6.354202340715026e-05, |
|
"loss": 0.2743, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 2.0194549560546875, |
|
"learning_rate": 6.112604669781572e-05, |
|
"loss": 0.2718, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 2.1579859256744385, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.2699, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"eval_loss": 1.6108604669570923, |
|
"eval_runtime": 1.858, |
|
"eval_samples_per_second": 148.007, |
|
"eval_steps_per_second": 6.458, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 1.8816920518875122, |
|
"learning_rate": 5.621718523237427e-05, |
|
"loss": 0.2749, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.9615857601165771, |
|
"learning_rate": 5.373650467932122e-05, |
|
"loss": 0.2582, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 1.4868065118789673, |
|
"learning_rate": 5.124653458690365e-05, |
|
"loss": 0.2131, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"grad_norm": 2.2800121307373047, |
|
"learning_rate": 4.875346541309637e-05, |
|
"loss": 0.1242, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 1.9804664850234985, |
|
"learning_rate": 4.626349532067879e-05, |
|
"loss": 0.1135, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"eval_loss": 1.902097463607788, |
|
"eval_runtime": 1.8585, |
|
"eval_samples_per_second": 147.969, |
|
"eval_steps_per_second": 6.457, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 2.2013049125671387, |
|
"learning_rate": 4.378281476762576e-05, |
|
"loss": 0.1092, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"grad_norm": 2.5137171745300293, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.1171, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.85, |
|
"grad_norm": 1.885507583618164, |
|
"learning_rate": 3.887395330218429e-05, |
|
"loss": 0.1155, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.9361807107925415, |
|
"learning_rate": 3.6457976592849754e-05, |
|
"loss": 0.1274, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 1.8210408687591553, |
|
"learning_rate": 3.4075667487415785e-05, |
|
"loss": 0.0511, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"eval_loss": 2.216161012649536, |
|
"eval_runtime": 1.8599, |
|
"eval_samples_per_second": 147.855, |
|
"eval_steps_per_second": 6.452, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 1.851035475730896, |
|
"learning_rate": 3.173294878168025e-05, |
|
"loss": 0.0489, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 1.3450051546096802, |
|
"learning_rate": 2.9435644843469436e-05, |
|
"loss": 0.0419, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 1.9799576997756958, |
|
"learning_rate": 2.718946713234185e-05, |
|
"loss": 0.053, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 1.1950169801712036, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0422, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 1.1742631196975708, |
|
"learning_rate": 2.2872686806712035e-05, |
|
"loss": 0.049, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"eval_loss": 2.0983123779296875, |
|
"eval_runtime": 1.8568, |
|
"eval_samples_per_second": 148.106, |
|
"eval_steps_per_second": 6.463, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 0.7656117081642151, |
|
"learning_rate": 2.0812816388260518e-05, |
|
"loss": 0.0362, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"grad_norm": 1.1246815919876099, |
|
"learning_rate": 1.8825509907063327e-05, |
|
"loss": 0.0207, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"grad_norm": 0.8675357699394226, |
|
"learning_rate": 1.691570812015704e-05, |
|
"loss": 0.0266, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"grad_norm": 0.9884489178657532, |
|
"learning_rate": 1.5088159095696363e-05, |
|
"loss": 0.0167, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.9568442106246948, |
|
"learning_rate": 1.3347406408508695e-05, |
|
"loss": 0.0212, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"eval_loss": 2.3844265937805176, |
|
"eval_runtime": 1.8585, |
|
"eval_samples_per_second": 147.972, |
|
"eval_steps_per_second": 6.457, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.8196859955787659, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.0186, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.6307012438774109, |
|
"learning_rate": 1.0143374638853891e-05, |
|
"loss": 0.0195, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 0.2967863976955414, |
|
"learning_rate": 8.688061284200266e-06, |
|
"loss": 0.0087, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 0.3737642765045166, |
|
"learning_rate": 7.33545591839222e-06, |
|
"loss": 0.0111, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.8899824619293213, |
|
"learning_rate": 6.088921331488568e-06, |
|
"loss": 0.0107, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"eval_loss": 2.5323970317840576, |
|
"eval_runtime": 1.8601, |
|
"eval_samples_per_second": 147.844, |
|
"eval_steps_per_second": 6.451, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 0.698161244392395, |
|
"learning_rate": 4.951556604879048e-06, |
|
"loss": 0.0102, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.7657814025878906, |
|
"learning_rate": 3.9261894064796135e-06, |
|
"loss": 0.0108, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.3906441926956177, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.0105, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 0.23798762261867523, |
|
"learning_rate": 2.221359710692961e-06, |
|
"loss": 0.0104, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"grad_norm": 0.18085721135139465, |
|
"learning_rate": 1.5461356885461075e-06, |
|
"loss": 0.006, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.23, |
|
"eval_loss": 2.5656862258911133, |
|
"eval_runtime": 1.8572, |
|
"eval_samples_per_second": 148.074, |
|
"eval_steps_per_second": 6.461, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"grad_norm": 0.243088960647583, |
|
"learning_rate": 9.913756075728087e-07, |
|
"loss": 0.009, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 9.54, |
|
"grad_norm": 0.16859322786331177, |
|
"learning_rate": 5.584586887435739e-07, |
|
"loss": 0.0077, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 9.69, |
|
"grad_norm": 0.35239869356155396, |
|
"learning_rate": 2.4846123172992954e-07, |
|
"loss": 0.0085, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"grad_norm": 0.28910598158836365, |
|
"learning_rate": 6.215393905388278e-08, |
|
"loss": 0.0082, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.20681537687778473, |
|
"learning_rate": 0.0, |
|
"loss": 0.0131, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.5764715671539307, |
|
"eval_runtime": 1.8597, |
|
"eval_samples_per_second": 147.872, |
|
"eval_steps_per_second": 6.453, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 650, |
|
"total_flos": 1.3867479608590336e+17, |
|
"train_loss": 0.4327951647226627, |
|
"train_runtime": 356.3752, |
|
"train_samples_per_second": 43.69, |
|
"train_steps_per_second": 1.824 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 650, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.3867479608590336e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|