{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9987628865979383,
  "eval_steps": 500,
  "global_step": 909,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032989690721649485,
      "grad_norm": 3.190283465974621,
      "learning_rate": 5e-06,
      "loss": 1.0361,
      "step": 10
    },
    {
      "epoch": 0.06597938144329897,
      "grad_norm": 1.2412136578455273,
      "learning_rate": 5e-06,
      "loss": 0.9152,
      "step": 20
    },
    {
      "epoch": 0.09896907216494845,
      "grad_norm": 0.9937983329085553,
      "learning_rate": 5e-06,
      "loss": 0.8771,
      "step": 30
    },
    {
      "epoch": 0.13195876288659794,
      "grad_norm": 0.9565722646796444,
      "learning_rate": 5e-06,
      "loss": 0.855,
      "step": 40
    },
    {
      "epoch": 0.16494845360824742,
      "grad_norm": 0.8422015410274669,
      "learning_rate": 5e-06,
      "loss": 0.834,
      "step": 50
    },
    {
      "epoch": 0.1979381443298969,
      "grad_norm": 0.9927761059685116,
      "learning_rate": 5e-06,
      "loss": 0.8249,
      "step": 60
    },
    {
      "epoch": 0.2309278350515464,
      "grad_norm": 1.0124625468461579,
      "learning_rate": 5e-06,
      "loss": 0.8092,
      "step": 70
    },
    {
      "epoch": 0.2639175257731959,
      "grad_norm": 0.9162707322543721,
      "learning_rate": 5e-06,
      "loss": 0.8002,
      "step": 80
    },
    {
      "epoch": 0.29690721649484536,
      "grad_norm": 0.7438936667406614,
      "learning_rate": 5e-06,
      "loss": 0.7953,
      "step": 90
    },
    {
      "epoch": 0.32989690721649484,
      "grad_norm": 0.74943987165883,
      "learning_rate": 5e-06,
      "loss": 0.7912,
      "step": 100
    },
    {
      "epoch": 0.3628865979381443,
      "grad_norm": 0.6393536784375358,
      "learning_rate": 5e-06,
      "loss": 0.7835,
      "step": 110
    },
    {
      "epoch": 0.3958762886597938,
      "grad_norm": 0.6732184370236527,
      "learning_rate": 5e-06,
      "loss": 0.7846,
      "step": 120
    },
    {
      "epoch": 0.4288659793814433,
      "grad_norm": 0.675715599863121,
      "learning_rate": 5e-06,
      "loss": 0.7835,
      "step": 130
    },
    {
      "epoch": 0.4618556701030928,
      "grad_norm": 0.638472159519646,
      "learning_rate": 5e-06,
      "loss": 0.7797,
      "step": 140
    },
    {
      "epoch": 0.4948453608247423,
      "grad_norm": 1.1212007213917934,
      "learning_rate": 5e-06,
      "loss": 0.7759,
      "step": 150
    },
    {
      "epoch": 0.5278350515463918,
      "grad_norm": 0.5929058650136099,
      "learning_rate": 5e-06,
      "loss": 0.7758,
      "step": 160
    },
    {
      "epoch": 0.5608247422680412,
      "grad_norm": 0.7325664578284696,
      "learning_rate": 5e-06,
      "loss": 0.7746,
      "step": 170
    },
    {
      "epoch": 0.5938144329896907,
      "grad_norm": 0.7146407893100764,
      "learning_rate": 5e-06,
      "loss": 0.772,
      "step": 180
    },
    {
      "epoch": 0.6268041237113402,
      "grad_norm": 0.6297828498939105,
      "learning_rate": 5e-06,
      "loss": 0.7686,
      "step": 190
    },
    {
      "epoch": 0.6597938144329897,
      "grad_norm": 0.650337063259678,
      "learning_rate": 5e-06,
      "loss": 0.7668,
      "step": 200
    },
    {
      "epoch": 0.6927835051546392,
      "grad_norm": 0.577352278155154,
      "learning_rate": 5e-06,
      "loss": 0.7633,
      "step": 210
    },
    {
      "epoch": 0.7257731958762886,
      "grad_norm": 0.6351053699389445,
      "learning_rate": 5e-06,
      "loss": 0.7606,
      "step": 220
    },
    {
      "epoch": 0.7587628865979381,
      "grad_norm": 0.6179445706530043,
      "learning_rate": 5e-06,
      "loss": 0.7644,
      "step": 230
    },
    {
      "epoch": 0.7917525773195876,
      "grad_norm": 0.7772047208925177,
      "learning_rate": 5e-06,
      "loss": 0.7585,
      "step": 240
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.6393400921262609,
      "learning_rate": 5e-06,
      "loss": 0.7597,
      "step": 250
    },
    {
      "epoch": 0.8577319587628865,
      "grad_norm": 0.5366628092052804,
      "learning_rate": 5e-06,
      "loss": 0.7559,
      "step": 260
    },
    {
      "epoch": 0.8907216494845361,
      "grad_norm": 0.6897451596502111,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 270
    },
    {
      "epoch": 0.9237113402061856,
      "grad_norm": 0.683076031456689,
      "learning_rate": 5e-06,
      "loss": 0.7595,
      "step": 280
    },
    {
      "epoch": 0.9567010309278351,
      "grad_norm": 0.6342586759859082,
      "learning_rate": 5e-06,
      "loss": 0.7576,
      "step": 290
    },
    {
      "epoch": 0.9896907216494846,
      "grad_norm": 0.6339977397184522,
      "learning_rate": 5e-06,
      "loss": 0.7548,
      "step": 300
    },
    {
      "epoch": 0.9995876288659794,
      "eval_loss": 0.7519278526306152,
      "eval_runtime": 322.6674,
      "eval_samples_per_second": 25.314,
      "eval_steps_per_second": 0.397,
      "step": 303
    },
    {
      "epoch": 1.022680412371134,
      "grad_norm": 0.810990287818241,
      "learning_rate": 5e-06,
      "loss": 0.7934,
      "step": 310
    },
    {
      "epoch": 1.0556701030927835,
      "grad_norm": 0.5989713675220099,
      "learning_rate": 5e-06,
      "loss": 0.7044,
      "step": 320
    },
    {
      "epoch": 1.088659793814433,
      "grad_norm": 0.584782182855064,
      "learning_rate": 5e-06,
      "loss": 0.7115,
      "step": 330
    },
    {
      "epoch": 1.1216494845360825,
      "grad_norm": 0.7858920415247334,
      "learning_rate": 5e-06,
      "loss": 0.7094,
      "step": 340
    },
    {
      "epoch": 1.1546391752577319,
      "grad_norm": 0.9401995798606461,
      "learning_rate": 5e-06,
      "loss": 0.7079,
      "step": 350
    },
    {
      "epoch": 1.1876288659793814,
      "grad_norm": 0.6150009311102699,
      "learning_rate": 5e-06,
      "loss": 0.7073,
      "step": 360
    },
    {
      "epoch": 1.220618556701031,
      "grad_norm": 0.6009149100944755,
      "learning_rate": 5e-06,
      "loss": 0.7096,
      "step": 370
    },
    {
      "epoch": 1.2536082474226804,
      "grad_norm": 0.6115518108906659,
      "learning_rate": 5e-06,
      "loss": 0.7066,
      "step": 380
    },
    {
      "epoch": 1.2865979381443298,
      "grad_norm": 0.7496882281145417,
      "learning_rate": 5e-06,
      "loss": 0.7076,
      "step": 390
    },
    {
      "epoch": 1.3195876288659794,
      "grad_norm": 0.6685224897984725,
      "learning_rate": 5e-06,
      "loss": 0.7062,
      "step": 400
    },
    {
      "epoch": 1.352577319587629,
      "grad_norm": 0.641185927057492,
      "learning_rate": 5e-06,
      "loss": 0.7117,
      "step": 410
    },
    {
      "epoch": 1.3855670103092783,
      "grad_norm": 0.5361388827305237,
      "learning_rate": 5e-06,
      "loss": 0.7094,
      "step": 420
    },
    {
      "epoch": 1.418556701030928,
      "grad_norm": 1.002359631516242,
      "learning_rate": 5e-06,
      "loss": 0.7054,
      "step": 430
    },
    {
      "epoch": 1.4515463917525773,
      "grad_norm": 0.8431450479727091,
      "learning_rate": 5e-06,
      "loss": 0.7075,
      "step": 440
    },
    {
      "epoch": 1.4845360824742269,
      "grad_norm": 0.6447323729739957,
      "learning_rate": 5e-06,
      "loss": 0.7099,
      "step": 450
    },
    {
      "epoch": 1.5175257731958762,
      "grad_norm": 0.8431314429320579,
      "learning_rate": 5e-06,
      "loss": 0.7018,
      "step": 460
    },
    {
      "epoch": 1.5505154639175258,
      "grad_norm": 0.6273662519128372,
      "learning_rate": 5e-06,
      "loss": 0.7051,
      "step": 470
    },
    {
      "epoch": 1.5835051546391754,
      "grad_norm": 0.8396735090007554,
      "learning_rate": 5e-06,
      "loss": 0.7106,
      "step": 480
    },
    {
      "epoch": 1.6164948453608248,
      "grad_norm": 0.5802654475284174,
      "learning_rate": 5e-06,
      "loss": 0.7043,
      "step": 490
    },
    {
      "epoch": 1.6494845360824741,
      "grad_norm": 0.6224806599884348,
      "learning_rate": 5e-06,
      "loss": 0.7086,
      "step": 500
    },
    {
      "epoch": 1.6824742268041237,
      "grad_norm": 0.6154446076130442,
      "learning_rate": 5e-06,
      "loss": 0.7026,
      "step": 510
    },
    {
      "epoch": 1.7154639175257733,
      "grad_norm": 0.5857753449684375,
      "learning_rate": 5e-06,
      "loss": 0.7037,
      "step": 520
    },
    {
      "epoch": 1.7484536082474227,
      "grad_norm": 0.5716099691987403,
      "learning_rate": 5e-06,
      "loss": 0.7066,
      "step": 530
    },
    {
      "epoch": 1.781443298969072,
      "grad_norm": 0.6774790897099987,
      "learning_rate": 5e-06,
      "loss": 0.707,
      "step": 540
    },
    {
      "epoch": 1.8144329896907216,
      "grad_norm": 0.6117062221128381,
      "learning_rate": 5e-06,
      "loss": 0.6996,
      "step": 550
    },
    {
      "epoch": 1.8474226804123712,
      "grad_norm": 0.5362825787566358,
      "learning_rate": 5e-06,
      "loss": 0.7036,
      "step": 560
    },
    {
      "epoch": 1.8804123711340206,
      "grad_norm": 0.7851595485494056,
      "learning_rate": 5e-06,
      "loss": 0.7066,
      "step": 570
    },
    {
      "epoch": 1.91340206185567,
      "grad_norm": 0.642752104749906,
      "learning_rate": 5e-06,
      "loss": 0.705,
      "step": 580
    },
    {
      "epoch": 1.9463917525773196,
      "grad_norm": 0.6663997639727156,
      "learning_rate": 5e-06,
      "loss": 0.7051,
      "step": 590
    },
    {
      "epoch": 1.9793814432989691,
      "grad_norm": 0.6435653630361237,
      "learning_rate": 5e-06,
      "loss": 0.7026,
      "step": 600
    },
    {
      "epoch": 1.9991752577319588,
      "eval_loss": 0.7376570701599121,
      "eval_runtime": 321.676,
      "eval_samples_per_second": 25.392,
      "eval_steps_per_second": 0.398,
      "step": 606
    },
    {
      "epoch": 2.0123711340206185,
      "grad_norm": 0.7966992322635892,
      "learning_rate": 5e-06,
      "loss": 0.7418,
      "step": 610
    },
    {
      "epoch": 2.045360824742268,
      "grad_norm": 0.5919842716689093,
      "learning_rate": 5e-06,
      "loss": 0.6581,
      "step": 620
    },
    {
      "epoch": 2.0783505154639177,
      "grad_norm": 0.6225831303900108,
      "learning_rate": 5e-06,
      "loss": 0.6494,
      "step": 630
    },
    {
      "epoch": 2.111340206185567,
      "grad_norm": 0.5720666970317613,
      "learning_rate": 5e-06,
      "loss": 0.6557,
      "step": 640
    },
    {
      "epoch": 2.1443298969072164,
      "grad_norm": 0.625621284764116,
      "learning_rate": 5e-06,
      "loss": 0.6562,
      "step": 650
    },
    {
      "epoch": 2.177319587628866,
      "grad_norm": 0.722621026378947,
      "learning_rate": 5e-06,
      "loss": 0.6592,
      "step": 660
    },
    {
      "epoch": 2.2103092783505156,
      "grad_norm": 0.6611874958125228,
      "learning_rate": 5e-06,
      "loss": 0.6576,
      "step": 670
    },
    {
      "epoch": 2.243298969072165,
      "grad_norm": 0.624720046082098,
      "learning_rate": 5e-06,
      "loss": 0.6534,
      "step": 680
    },
    {
      "epoch": 2.2762886597938143,
      "grad_norm": 0.6227890769590231,
      "learning_rate": 5e-06,
      "loss": 0.6561,
      "step": 690
    },
    {
      "epoch": 2.3092783505154637,
      "grad_norm": 0.6353543358518403,
      "learning_rate": 5e-06,
      "loss": 0.6564,
      "step": 700
    },
    {
      "epoch": 2.3422680412371135,
      "grad_norm": 0.616682251013517,
      "learning_rate": 5e-06,
      "loss": 0.6558,
      "step": 710
    },
    {
      "epoch": 2.375257731958763,
      "grad_norm": 0.5785627398529801,
      "learning_rate": 5e-06,
      "loss": 0.6579,
      "step": 720
    },
    {
      "epoch": 2.4082474226804123,
      "grad_norm": 0.7087632640527876,
      "learning_rate": 5e-06,
      "loss": 0.6578,
      "step": 730
    },
    {
      "epoch": 2.441237113402062,
      "grad_norm": 0.7221097669514308,
      "learning_rate": 5e-06,
      "loss": 0.6555,
      "step": 740
    },
    {
      "epoch": 2.4742268041237114,
      "grad_norm": 0.6845092133296887,
      "learning_rate": 5e-06,
      "loss": 0.6589,
      "step": 750
    },
    {
      "epoch": 2.507216494845361,
      "grad_norm": 0.6131735355128494,
      "learning_rate": 5e-06,
      "loss": 0.6597,
      "step": 760
    },
    {
      "epoch": 2.54020618556701,
      "grad_norm": 0.74499117668607,
      "learning_rate": 5e-06,
      "loss": 0.6604,
      "step": 770
    },
    {
      "epoch": 2.5731958762886595,
      "grad_norm": 0.6953072761863929,
      "learning_rate": 5e-06,
      "loss": 0.6599,
      "step": 780
    },
    {
      "epoch": 2.6061855670103093,
      "grad_norm": 0.7683634702318719,
      "learning_rate": 5e-06,
      "loss": 0.6584,
      "step": 790
    },
    {
      "epoch": 2.6391752577319587,
      "grad_norm": 0.9202931242949187,
      "learning_rate": 5e-06,
      "loss": 0.6599,
      "step": 800
    },
    {
      "epoch": 2.6721649484536085,
      "grad_norm": 0.6785534766587453,
      "learning_rate": 5e-06,
      "loss": 0.6605,
      "step": 810
    },
    {
      "epoch": 2.705154639175258,
      "grad_norm": 0.9373759072613878,
      "learning_rate": 5e-06,
      "loss": 0.6633,
      "step": 820
    },
    {
      "epoch": 2.7381443298969073,
      "grad_norm": 0.5316447851690145,
      "learning_rate": 5e-06,
      "loss": 0.6582,
      "step": 830
    },
    {
      "epoch": 2.7711340206185566,
      "grad_norm": 0.7810499110998566,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 840
    },
    {
      "epoch": 2.804123711340206,
      "grad_norm": 0.5581690358208933,
      "learning_rate": 5e-06,
      "loss": 0.6572,
      "step": 850
    },
    {
      "epoch": 2.837113402061856,
      "grad_norm": 0.5757480690524878,
      "learning_rate": 5e-06,
      "loss": 0.6629,
      "step": 860
    },
    {
      "epoch": 2.870103092783505,
      "grad_norm": 0.6570394054126519,
      "learning_rate": 5e-06,
      "loss": 0.6603,
      "step": 870
    },
    {
      "epoch": 2.9030927835051545,
      "grad_norm": 0.5532161107989387,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 880
    },
    {
      "epoch": 2.9360824742268044,
      "grad_norm": 0.6779485831959426,
      "learning_rate": 5e-06,
      "loss": 0.6615,
      "step": 890
    },
    {
      "epoch": 2.9690721649484537,
      "grad_norm": 0.6105580266011457,
      "learning_rate": 5e-06,
      "loss": 0.6573,
      "step": 900
    },
    {
      "epoch": 2.9987628865979383,
      "eval_loss": 0.737443208694458,
      "eval_runtime": 321.3627,
      "eval_samples_per_second": 25.417,
      "eval_steps_per_second": 0.398,
      "step": 909
    },
    {
      "epoch": 2.9987628865979383,
      "step": 909,
      "total_flos": 1522399476449280.0,
      "train_loss": 0.7216839113644641,
      "train_runtime": 53747.0627,
      "train_samples_per_second": 8.662,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 909,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1522399476449280.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}