|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.961832061068702, |
|
"eval_steps": 500, |
|
"global_step": 325, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015267175572519083, |
|
"grad_norm": 183.11753845214844, |
|
"learning_rate": 6.060606060606061e-06, |
|
"loss": 46.1063, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.07633587786259542, |
|
"grad_norm": 136.03738403320312, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 44.0302, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.15267175572519084, |
|
"grad_norm": 69.2432632446289, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 38.4659, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.22900763358778625, |
|
"grad_norm": 17.486797332763672, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 30.3029, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.3053435114503817, |
|
"grad_norm": 13.530756950378418, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 26.6709, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3816793893129771, |
|
"grad_norm": 7.521498680114746, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 24.4319, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4580152671755725, |
|
"grad_norm": 5.912084102630615, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 22.862, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5343511450381679, |
|
"grad_norm": 10.610209465026855, |
|
"learning_rate": 0.00019997685019798912, |
|
"loss": 21.5999, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.6106870229007634, |
|
"grad_norm": 20.944725036621094, |
|
"learning_rate": 0.0001997165380022878, |
|
"loss": 19.4719, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6870229007633588, |
|
"grad_norm": 34.12383270263672, |
|
"learning_rate": 0.000199167731989929, |
|
"loss": 14.6832, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7633587786259542, |
|
"grad_norm": 42.86738204956055, |
|
"learning_rate": 0.0001983320199330545, |
|
"loss": 8.7569, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8396946564885496, |
|
"grad_norm": 12.474686622619629, |
|
"learning_rate": 0.00019721181966290613, |
|
"loss": 4.3457, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.916030534351145, |
|
"grad_norm": 9.623456954956055, |
|
"learning_rate": 0.00019581037207470382, |
|
"loss": 3.4309, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9923664122137404, |
|
"grad_norm": 3.5216312408447266, |
|
"learning_rate": 0.00019413173175128473, |
|
"loss": 2.9056, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.9923664122137404, |
|
"eval_loss": 2.611328125, |
|
"eval_runtime": 19.2134, |
|
"eval_samples_per_second": 47.935, |
|
"eval_steps_per_second": 0.781, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.0687022900763359, |
|
"grad_norm": 2.9582359790802, |
|
"learning_rate": 0.00019218075523263104, |
|
"loss": 2.7809, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.1450381679389312, |
|
"grad_norm": 2.319239616394043, |
|
"learning_rate": 0.00018996308696522433, |
|
"loss": 2.3224, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.2213740458015268, |
|
"grad_norm": 1.3839267492294312, |
|
"learning_rate": 0.00018748514297187648, |
|
"loss": 2.2039, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.297709923664122, |
|
"grad_norm": 0.5840837955474854, |
|
"learning_rate": 0.00018475409228928312, |
|
"loss": 2.1174, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.3740458015267176, |
|
"grad_norm": 1.5493711233139038, |
|
"learning_rate": 0.00018177783622700327, |
|
"loss": 2.0565, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.450381679389313, |
|
"grad_norm": 0.7415986657142639, |
|
"learning_rate": 0.00017856498550787144, |
|
"loss": 2.003, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.5267175572519083, |
|
"grad_norm": 0.6342356204986572, |
|
"learning_rate": 0.00017512483535597867, |
|
"loss": 1.9686, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6030534351145038, |
|
"grad_norm": 1.0893248319625854, |
|
"learning_rate": 0.00017146733860429612, |
|
"loss": 1.9499, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.6793893129770994, |
|
"grad_norm": 1.233128547668457, |
|
"learning_rate": 0.0001676030768997445, |
|
"loss": 1.9192, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.7557251908396947, |
|
"grad_norm": 0.7829602360725403, |
|
"learning_rate": 0.00016354323008901776, |
|
"loss": 1.8934, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.83206106870229, |
|
"grad_norm": 1.0393383502960205, |
|
"learning_rate": 0.00015929954387373103, |
|
"loss": 1.8579, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.9083969465648853, |
|
"grad_norm": 2.433302879333496, |
|
"learning_rate": 0.00015488429582847192, |
|
"loss": 1.8576, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.984732824427481, |
|
"grad_norm": 1.2537367343902588, |
|
"learning_rate": 0.00015031025988006936, |
|
"loss": 1.8271, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.8229883909225464, |
|
"eval_runtime": 19.0953, |
|
"eval_samples_per_second": 48.232, |
|
"eval_steps_per_second": 0.786, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.0610687022900764, |
|
"grad_norm": 1.04417085647583, |
|
"learning_rate": 0.00014559066935084588, |
|
"loss": 1.975, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.1374045801526718, |
|
"grad_norm": 0.9754623174667358, |
|
"learning_rate": 0.00014073917867277557, |
|
"loss": 1.7901, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.213740458015267, |
|
"grad_norm": 0.6031882762908936, |
|
"learning_rate": 0.0001357698238833126, |
|
"loss": 1.7584, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.2900763358778624, |
|
"grad_norm": 1.7654844522476196, |
|
"learning_rate": 0.000130696982017182, |
|
"loss": 1.7665, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.366412213740458, |
|
"grad_norm": 1.8184305429458618, |
|
"learning_rate": 0.0001255353295116187, |
|
"loss": 1.7496, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.4427480916030535, |
|
"grad_norm": 2.4291305541992188, |
|
"learning_rate": 0.00012029979974539234, |
|
"loss": 1.7389, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.519083969465649, |
|
"grad_norm": 0.7844381928443909, |
|
"learning_rate": 0.00011500553983446527, |
|
"loss": 1.7327, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 2.595419847328244, |
|
"grad_norm": 1.0221455097198486, |
|
"learning_rate": 0.00010966786680927874, |
|
"loss": 1.7365, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.67175572519084, |
|
"grad_norm": 1.1956524848937988, |
|
"learning_rate": 0.00010430222330045304, |
|
"loss": 1.7204, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 2.7480916030534353, |
|
"grad_norm": 0.7325518131256104, |
|
"learning_rate": 9.892413286110886e-05, |
|
"loss": 1.7177, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.8244274809160306, |
|
"grad_norm": 0.8538561463356018, |
|
"learning_rate": 9.354915505506839e-05, |
|
"loss": 1.7193, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 2.900763358778626, |
|
"grad_norm": 1.252325415611267, |
|
"learning_rate": 8.81928404408726e-05, |
|
"loss": 1.7058, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.9770992366412212, |
|
"grad_norm": 0.7734937071800232, |
|
"learning_rate": 8.287068558185225e-05, |
|
"loss": 1.7019, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.9923664122137406, |
|
"eval_loss": 1.7041354179382324, |
|
"eval_runtime": 19.3108, |
|
"eval_samples_per_second": 47.694, |
|
"eval_steps_per_second": 0.777, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.053435114503817, |
|
"grad_norm": 0.6631619334220886, |
|
"learning_rate": 7.759808821241406e-05, |
|
"loss": 1.8697, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.1297709923664123, |
|
"grad_norm": 0.7187236547470093, |
|
"learning_rate": 7.239030269025311e-05, |
|
"loss": 1.7181, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.2061068702290076, |
|
"grad_norm": 0.5320985913276672, |
|
"learning_rate": 6.726239586337408e-05, |
|
"loss": 1.7351, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.282442748091603, |
|
"grad_norm": 0.43638336658477783, |
|
"learning_rate": 6.22292034796035e-05, |
|
"loss": 1.7156, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 3.3587786259541983, |
|
"grad_norm": 0.3966742753982544, |
|
"learning_rate": 5.730528726470792e-05, |
|
"loss": 1.7158, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.435114503816794, |
|
"grad_norm": 0.326159805059433, |
|
"learning_rate": 5.2504892793295e-05, |
|
"loss": 1.7055, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 3.5114503816793894, |
|
"grad_norm": 0.4766685664653778, |
|
"learning_rate": 4.7841908274384616e-05, |
|
"loss": 1.7006, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.5877862595419847, |
|
"grad_norm": 0.41363418102264404, |
|
"learning_rate": 4.332982437088825e-05, |
|
"loss": 1.7106, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 3.66412213740458, |
|
"grad_norm": 0.5006980299949646, |
|
"learning_rate": 3.898169516924398e-05, |
|
"loss": 1.6938, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.7404580152671754, |
|
"grad_norm": 0.4720315933227539, |
|
"learning_rate": 3.4810100412128747e-05, |
|
"loss": 1.6886, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 3.816793893129771, |
|
"grad_norm": 0.5057269334793091, |
|
"learning_rate": 3.0827109103512643e-05, |
|
"loss": 1.6912, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.8931297709923665, |
|
"grad_norm": 0.38378995656967163, |
|
"learning_rate": 2.7044244591351232e-05, |
|
"loss": 1.7001, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 3.969465648854962, |
|
"grad_norm": 0.3008043169975281, |
|
"learning_rate": 2.3472451228937253e-05, |
|
"loss": 1.7024, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.6962379217147827, |
|
"eval_runtime": 18.9852, |
|
"eval_samples_per_second": 48.512, |
|
"eval_steps_per_second": 0.79, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 4.0458015267175576, |
|
"grad_norm": 0.9348434805870056, |
|
"learning_rate": 2.0122062711363532e-05, |
|
"loss": 1.8574, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 4.122137404580153, |
|
"grad_norm": 0.7455368638038635, |
|
"learning_rate": 1.7002772178705716e-05, |
|
"loss": 1.6594, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.198473282442748, |
|
"grad_norm": 0.5774383544921875, |
|
"learning_rate": 1.4123604172419713e-05, |
|
"loss": 1.6527, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 4.2748091603053435, |
|
"grad_norm": 0.5370898842811584, |
|
"learning_rate": 1.149288852608743e-05, |
|
"loss": 1.6587, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.351145038167939, |
|
"grad_norm": 0.7321135997772217, |
|
"learning_rate": 9.118236266049707e-06, |
|
"loss": 1.6676, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 4.427480916030534, |
|
"grad_norm": 0.5155964493751526, |
|
"learning_rate": 7.0065175916482095e-06, |
|
"loss": 1.6579, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.5038167938931295, |
|
"grad_norm": 0.6737932562828064, |
|
"learning_rate": 5.163841998782837e-06, |
|
"loss": 1.6508, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 4.580152671755725, |
|
"grad_norm": 0.9017395377159119, |
|
"learning_rate": 3.595540604290437e-06, |
|
"loss": 1.6375, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.65648854961832, |
|
"grad_norm": 0.5460083484649658, |
|
"learning_rate": 2.30615072228183e-06, |
|
"loss": 1.6522, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 4.732824427480916, |
|
"grad_norm": 0.5443113446235657, |
|
"learning_rate": 1.2994027370611173e-06, |
|
"loss": 1.648, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.809160305343512, |
|
"grad_norm": 0.6177972555160522, |
|
"learning_rate": 5.782093106048159e-07, |
|
"loss": 1.6559, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 4.885496183206107, |
|
"grad_norm": 0.4734289050102234, |
|
"learning_rate": 1.446569558255395e-07, |
|
"loss": 1.6443, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.961832061068702, |
|
"grad_norm": 0.6619871854782104, |
|
"learning_rate": 0.0, |
|
"loss": 1.6463, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 4.961832061068702, |
|
"eval_loss": 1.664337158203125, |
|
"eval_runtime": 18.9808, |
|
"eval_samples_per_second": 48.523, |
|
"eval_steps_per_second": 0.79, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 4.961832061068702, |
|
"step": 325, |
|
"total_flos": 9.909828121379471e+17, |
|
"train_loss": 5.476599056537335, |
|
"train_runtime": 4095.1846, |
|
"train_samples_per_second": 10.222, |
|
"train_steps_per_second": 0.079 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 325, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.909828121379471e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|