{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9987628865979383, "eval_steps": 500, "global_step": 909, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032989690721649485, "grad_norm": 3.190283465974621, "learning_rate": 5e-06, "loss": 1.0361, "step": 10 }, { "epoch": 0.06597938144329897, "grad_norm": 1.2412136578455273, "learning_rate": 5e-06, "loss": 0.9152, "step": 20 }, { "epoch": 0.09896907216494845, "grad_norm": 0.9937983329085553, "learning_rate": 5e-06, "loss": 0.8771, "step": 30 }, { "epoch": 0.13195876288659794, "grad_norm": 0.9565722646796444, "learning_rate": 5e-06, "loss": 0.855, "step": 40 }, { "epoch": 0.16494845360824742, "grad_norm": 0.8422015410274669, "learning_rate": 5e-06, "loss": 0.834, "step": 50 }, { "epoch": 0.1979381443298969, "grad_norm": 0.9927761059685116, "learning_rate": 5e-06, "loss": 0.8249, "step": 60 }, { "epoch": 0.2309278350515464, "grad_norm": 1.0124625468461579, "learning_rate": 5e-06, "loss": 0.8092, "step": 70 }, { "epoch": 0.2639175257731959, "grad_norm": 0.9162707322543721, "learning_rate": 5e-06, "loss": 0.8002, "step": 80 }, { "epoch": 0.29690721649484536, "grad_norm": 0.7438936667406614, "learning_rate": 5e-06, "loss": 0.7953, "step": 90 }, { "epoch": 0.32989690721649484, "grad_norm": 0.74943987165883, "learning_rate": 5e-06, "loss": 0.7912, "step": 100 }, { "epoch": 0.3628865979381443, "grad_norm": 0.6393536784375358, "learning_rate": 5e-06, "loss": 0.7835, "step": 110 }, { "epoch": 0.3958762886597938, "grad_norm": 0.6732184370236527, "learning_rate": 5e-06, "loss": 0.7846, "step": 120 }, { "epoch": 0.4288659793814433, "grad_norm": 0.675715599863121, "learning_rate": 5e-06, "loss": 0.7835, "step": 130 }, { "epoch": 0.4618556701030928, "grad_norm": 0.638472159519646, "learning_rate": 5e-06, "loss": 0.7797, "step": 140 }, { "epoch": 0.4948453608247423, "grad_norm": 1.1212007213917934, "learning_rate": 5e-06, "loss": 0.7759, "step": 150 }, { "epoch": 0.5278350515463918, "grad_norm": 0.5929058650136099, "learning_rate": 5e-06, "loss": 0.7758, "step": 160 }, { "epoch": 0.5608247422680412, "grad_norm": 0.7325664578284696, "learning_rate": 5e-06, "loss": 0.7746, "step": 170 }, { "epoch": 0.5938144329896907, "grad_norm": 0.7146407893100764, "learning_rate": 5e-06, "loss": 0.772, "step": 180 }, { "epoch": 0.6268041237113402, "grad_norm": 0.6297828498939105, "learning_rate": 5e-06, "loss": 0.7686, "step": 190 }, { "epoch": 0.6597938144329897, "grad_norm": 0.650337063259678, "learning_rate": 5e-06, "loss": 0.7668, "step": 200 }, { "epoch": 0.6927835051546392, "grad_norm": 0.577352278155154, "learning_rate": 5e-06, "loss": 0.7633, "step": 210 }, { "epoch": 0.7257731958762886, "grad_norm": 0.6351053699389445, "learning_rate": 5e-06, "loss": 0.7606, "step": 220 }, { "epoch": 0.7587628865979381, "grad_norm": 0.6179445706530043, "learning_rate": 5e-06, "loss": 0.7644, "step": 230 }, { "epoch": 0.7917525773195876, "grad_norm": 0.7772047208925177, "learning_rate": 5e-06, "loss": 0.7585, "step": 240 }, { "epoch": 0.8247422680412371, "grad_norm": 0.6393400921262609, "learning_rate": 5e-06, "loss": 0.7597, "step": 250 }, { "epoch": 0.8577319587628865, "grad_norm": 0.5366628092052804, "learning_rate": 5e-06, "loss": 0.7559, "step": 260 }, { "epoch": 0.8907216494845361, "grad_norm": 0.6897451596502111, "learning_rate": 5e-06, "loss": 0.757, "step": 270 }, { "epoch": 0.9237113402061856, "grad_norm": 0.683076031456689, "learning_rate": 5e-06, "loss": 0.7595, "step": 280 }, { "epoch": 0.9567010309278351, "grad_norm": 0.6342586759859082, "learning_rate": 5e-06, "loss": 0.7576, "step": 290 }, { "epoch": 0.9896907216494846, "grad_norm": 0.6339977397184522, "learning_rate": 5e-06, "loss": 0.7548, "step": 300 }, { "epoch": 0.9995876288659794, "eval_loss": 0.7519278526306152, "eval_runtime": 322.6674, "eval_samples_per_second": 25.314, "eval_steps_per_second": 0.397, "step": 303 }, { "epoch": 1.022680412371134, "grad_norm": 0.810990287818241, "learning_rate": 5e-06, "loss": 0.7934, "step": 310 }, { "epoch": 1.0556701030927835, "grad_norm": 0.5989713675220099, "learning_rate": 5e-06, "loss": 0.7044, "step": 320 }, { "epoch": 1.088659793814433, "grad_norm": 0.584782182855064, "learning_rate": 5e-06, "loss": 0.7115, "step": 330 }, { "epoch": 1.1216494845360825, "grad_norm": 0.7858920415247334, "learning_rate": 5e-06, "loss": 0.7094, "step": 340 }, { "epoch": 1.1546391752577319, "grad_norm": 0.9401995798606461, "learning_rate": 5e-06, "loss": 0.7079, "step": 350 }, { "epoch": 1.1876288659793814, "grad_norm": 0.6150009311102699, "learning_rate": 5e-06, "loss": 0.7073, "step": 360 }, { "epoch": 1.220618556701031, "grad_norm": 0.6009149100944755, "learning_rate": 5e-06, "loss": 0.7096, "step": 370 }, { "epoch": 1.2536082474226804, "grad_norm": 0.6115518108906659, "learning_rate": 5e-06, "loss": 0.7066, "step": 380 }, { "epoch": 1.2865979381443298, "grad_norm": 0.7496882281145417, "learning_rate": 5e-06, "loss": 0.7076, "step": 390 }, { "epoch": 1.3195876288659794, "grad_norm": 0.6685224897984725, "learning_rate": 5e-06, "loss": 0.7062, "step": 400 }, { "epoch": 1.352577319587629, "grad_norm": 0.641185927057492, "learning_rate": 5e-06, "loss": 0.7117, "step": 410 }, { "epoch": 1.3855670103092783, "grad_norm": 0.5361388827305237, "learning_rate": 5e-06, "loss": 0.7094, "step": 420 }, { "epoch": 1.418556701030928, "grad_norm": 1.002359631516242, "learning_rate": 5e-06, "loss": 0.7054, "step": 430 }, { "epoch": 1.4515463917525773, "grad_norm": 0.8431450479727091, "learning_rate": 5e-06, "loss": 0.7075, "step": 440 }, { "epoch": 1.4845360824742269, "grad_norm": 0.6447323729739957, "learning_rate": 5e-06, "loss": 0.7099, "step": 450 }, { "epoch": 1.5175257731958762, "grad_norm": 0.8431314429320579, "learning_rate": 5e-06, "loss": 0.7018, "step": 460 }, { "epoch": 1.5505154639175258, "grad_norm": 0.6273662519128372, "learning_rate": 5e-06, "loss": 0.7051, "step": 470 }, { "epoch": 1.5835051546391754, "grad_norm": 0.8396735090007554, "learning_rate": 5e-06, "loss": 0.7106, "step": 480 }, { "epoch": 1.6164948453608248, "grad_norm": 0.5802654475284174, "learning_rate": 5e-06, "loss": 0.7043, "step": 490 }, { "epoch": 1.6494845360824741, "grad_norm": 0.6224806599884348, "learning_rate": 5e-06, "loss": 0.7086, "step": 500 }, { "epoch": 1.6824742268041237, "grad_norm": 0.6154446076130442, "learning_rate": 5e-06, "loss": 0.7026, "step": 510 }, { "epoch": 1.7154639175257733, "grad_norm": 0.5857753449684375, "learning_rate": 5e-06, "loss": 0.7037, "step": 520 }, { "epoch": 1.7484536082474227, "grad_norm": 0.5716099691987403, "learning_rate": 5e-06, "loss": 0.7066, "step": 530 }, { "epoch": 1.781443298969072, "grad_norm": 0.6774790897099987, "learning_rate": 5e-06, "loss": 0.707, "step": 540 }, { "epoch": 1.8144329896907216, "grad_norm": 0.6117062221128381, "learning_rate": 5e-06, "loss": 0.6996, "step": 550 }, { "epoch": 1.8474226804123712, "grad_norm": 0.5362825787566358, "learning_rate": 5e-06, "loss": 0.7036, "step": 560 }, { "epoch": 1.8804123711340206, "grad_norm": 0.7851595485494056, "learning_rate": 5e-06, "loss": 0.7066, "step": 570 }, { "epoch": 1.91340206185567, "grad_norm": 0.642752104749906, "learning_rate": 5e-06, "loss": 0.705, "step": 580 }, { "epoch": 1.9463917525773196, "grad_norm": 0.6663997639727156, "learning_rate": 5e-06, "loss": 0.7051, "step": 590 }, { "epoch": 1.9793814432989691, "grad_norm": 0.6435653630361237, "learning_rate": 5e-06, "loss": 0.7026, "step": 600 }, { "epoch": 1.9991752577319588, "eval_loss": 0.7376570701599121, "eval_runtime": 321.676, "eval_samples_per_second": 25.392, "eval_steps_per_second": 0.398, "step": 606 }, { "epoch": 2.0123711340206185, "grad_norm": 0.7966992322635892, "learning_rate": 5e-06, "loss": 0.7418, "step": 610 }, { "epoch": 2.045360824742268, "grad_norm": 0.5919842716689093, "learning_rate": 5e-06, "loss": 0.6581, "step": 620 }, { "epoch": 2.0783505154639177, "grad_norm": 0.6225831303900108, "learning_rate": 5e-06, "loss": 0.6494, "step": 630 }, { "epoch": 2.111340206185567, "grad_norm": 0.5720666970317613, "learning_rate": 5e-06, "loss": 0.6557, "step": 640 }, { "epoch": 2.1443298969072164, "grad_norm": 0.625621284764116, "learning_rate": 5e-06, "loss": 0.6562, "step": 650 }, { "epoch": 2.177319587628866, "grad_norm": 0.722621026378947, "learning_rate": 5e-06, "loss": 0.6592, "step": 660 }, { "epoch": 2.2103092783505156, "grad_norm": 0.6611874958125228, "learning_rate": 5e-06, "loss": 0.6576, "step": 670 }, { "epoch": 2.243298969072165, "grad_norm": 0.624720046082098, "learning_rate": 5e-06, "loss": 0.6534, "step": 680 }, { "epoch": 2.2762886597938143, "grad_norm": 0.6227890769590231, "learning_rate": 5e-06, "loss": 0.6561, "step": 690 }, { "epoch": 2.3092783505154637, "grad_norm": 0.6353543358518403, "learning_rate": 5e-06, "loss": 0.6564, "step": 700 }, { "epoch": 2.3422680412371135, "grad_norm": 0.616682251013517, "learning_rate": 5e-06, "loss": 0.6558, "step": 710 }, { "epoch": 2.375257731958763, "grad_norm": 0.5785627398529801, "learning_rate": 5e-06, "loss": 0.6579, "step": 720 }, { "epoch": 2.4082474226804123, "grad_norm": 0.7087632640527876, "learning_rate": 5e-06, "loss": 0.6578, "step": 730 }, { "epoch": 2.441237113402062, "grad_norm": 0.7221097669514308, "learning_rate": 5e-06, "loss": 0.6555, "step": 740 }, { "epoch": 2.4742268041237114, "grad_norm": 0.6845092133296887, "learning_rate": 5e-06, "loss": 0.6589, "step": 750 }, { "epoch": 2.507216494845361, "grad_norm": 0.6131735355128494, "learning_rate": 5e-06, "loss": 0.6597, "step": 760 }, { "epoch": 2.54020618556701, "grad_norm": 0.74499117668607, "learning_rate": 5e-06, "loss": 0.6604, "step": 770 }, { "epoch": 2.5731958762886595, "grad_norm": 0.6953072761863929, "learning_rate": 5e-06, "loss": 0.6599, "step": 780 }, { "epoch": 2.6061855670103093, "grad_norm": 0.7683634702318719, "learning_rate": 5e-06, "loss": 0.6584, "step": 790 }, { "epoch": 2.6391752577319587, "grad_norm": 0.9202931242949187, "learning_rate": 5e-06, "loss": 0.6599, "step": 800 }, { "epoch": 2.6721649484536085, "grad_norm": 0.6785534766587453, "learning_rate": 5e-06, "loss": 0.6605, "step": 810 }, { "epoch": 2.705154639175258, "grad_norm": 0.9373759072613878, "learning_rate": 5e-06, "loss": 0.6633, "step": 820 }, { "epoch": 2.7381443298969073, "grad_norm": 0.5316447851690145, "learning_rate": 5e-06, "loss": 0.6582, "step": 830 }, { "epoch": 2.7711340206185566, "grad_norm": 0.7810499110998566, "learning_rate": 5e-06, "loss": 0.6638, "step": 840 }, { "epoch": 2.804123711340206, "grad_norm": 0.5581690358208933, "learning_rate": 5e-06, "loss": 0.6572, "step": 850 }, { "epoch": 2.837113402061856, "grad_norm": 0.5757480690524878, "learning_rate": 5e-06, "loss": 0.6629, "step": 860 }, { "epoch": 2.870103092783505, "grad_norm": 0.6570394054126519, "learning_rate": 5e-06, "loss": 0.6603, "step": 870 }, { "epoch": 2.9030927835051545, "grad_norm": 0.5532161107989387, "learning_rate": 5e-06, "loss": 0.6596, "step": 880 }, { "epoch": 2.9360824742268044, "grad_norm": 0.6779485831959426, "learning_rate": 5e-06, "loss": 0.6615, "step": 890 }, { "epoch": 2.9690721649484537, "grad_norm": 0.6105580266011457, "learning_rate": 5e-06, "loss": 0.6573, "step": 900 }, { "epoch": 2.9987628865979383, "eval_loss": 0.737443208694458, "eval_runtime": 321.3627, "eval_samples_per_second": 25.417, "eval_steps_per_second": 0.398, "step": 909 }, { "epoch": 2.9987628865979383, "step": 909, "total_flos": 1522399476449280.0, "train_loss": 0.7216839113644641, "train_runtime": 53747.0627, "train_samples_per_second": 8.662, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 909, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1522399476449280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }