{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.953917050691244, "eval_steps": 500, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18433179723502305, "grad_norm": 15.2578125, "learning_rate": 0.00019983081582712685, "loss": 1.1335, "step": 10 }, { "epoch": 0.3686635944700461, "grad_norm": 15.1484375, "learning_rate": 0.00019932383577419432, "loss": 1.0389, "step": 20 }, { "epoch": 0.5529953917050692, "grad_norm": 16.21875, "learning_rate": 0.00019848077530122083, "loss": 1.0017, "step": 30 }, { "epoch": 0.7373271889400922, "grad_norm": 13.3359375, "learning_rate": 0.00019730448705798239, "loss": 0.9937, "step": 40 }, { "epoch": 0.9216589861751152, "grad_norm": 13.1171875, "learning_rate": 0.0001957989512315489, "loss": 0.9773, "step": 50 }, { "epoch": 1.1059907834101383, "grad_norm": 11.2578125, "learning_rate": 0.00019396926207859084, "loss": 0.9484, "step": 60 }, { "epoch": 1.2903225806451613, "grad_norm": 13.71875, "learning_rate": 0.00019182161068802741, "loss": 0.9577, "step": 70 }, { "epoch": 1.4746543778801844, "grad_norm": 10.65625, "learning_rate": 0.00018936326403234125, "loss": 0.9277, "step": 80 }, { "epoch": 1.6589861751152073, "grad_norm": 12.90625, "learning_rate": 0.00018660254037844388, "loss": 0.9301, "step": 90 }, { "epoch": 1.8433179723502304, "grad_norm": 13.7578125, "learning_rate": 0.00018354878114129367, "loss": 0.9212, "step": 100 }, { "epoch": 2.0276497695852536, "grad_norm": 10.140625, "learning_rate": 0.0001802123192755044, "loss": 0.8985, "step": 110 }, { "epoch": 2.2119815668202767, "grad_norm": 9.6015625, "learning_rate": 0.0001766044443118978, "loss": 0.8939, "step": 120 }, { "epoch": 2.3963133640552994, "grad_norm": 11.921875, "learning_rate": 0.00017273736415730488, "loss": 0.8935, "step": 130 }, { "epoch": 2.5806451612903225, "grad_norm": 10.8203125, "learning_rate": 0.0001686241637868734, "loss": 0.8973, "step": 140 }, { "epoch": 2.7649769585253456, "grad_norm": 11.7421875, "learning_rate": 0.00016427876096865394, "loss": 0.9522, "step": 150 }, { "epoch": 2.9493087557603688, "grad_norm": 11.5234375, "learning_rate": 0.00015971585917027862, "loss": 0.9124, "step": 160 }, { "epoch": 3.133640552995392, "grad_norm": 10.25, "learning_rate": 0.0001549508978070806, "loss": 0.899, "step": 170 }, { "epoch": 3.3179723502304146, "grad_norm": 10.4609375, "learning_rate": 0.00015000000000000001, "loss": 0.8893, "step": 180 }, { "epoch": 3.5023041474654377, "grad_norm": 9.0078125, "learning_rate": 0.00014487991802004623, "loss": 0.8878, "step": 190 }, { "epoch": 3.686635944700461, "grad_norm": 10.21875, "learning_rate": 0.0001396079766039157, "loss": 0.8656, "step": 200 }, { "epoch": 3.870967741935484, "grad_norm": 7.875, "learning_rate": 0.00013420201433256689, "loss": 0.8897, "step": 210 }, { "epoch": 4.055299539170507, "grad_norm": 9.4921875, "learning_rate": 0.00012868032327110904, "loss": 0.8549, "step": 220 }, { "epoch": 4.23963133640553, "grad_norm": 9.2421875, "learning_rate": 0.00012306158707424403, "loss": 0.8577, "step": 230 }, { "epoch": 4.423963133640553, "grad_norm": 7.875, "learning_rate": 0.00011736481776669306, "loss": 0.8579, "step": 240 }, { "epoch": 4.6082949308755765, "grad_norm": 8.1328125, "learning_rate": 0.00011160929141252303, "loss": 0.8278, "step": 250 }, { "epoch": 4.792626728110599, "grad_norm": 7.01171875, "learning_rate": 0.00010581448289104758, "loss": 0.8458, "step": 260 }, { "epoch": 4.976958525345622, "grad_norm": 6.92578125, "learning_rate": 0.0001, "loss": 0.874, "step": 270 }, { "epoch": 5.161290322580645, "grad_norm": 8.1796875, "learning_rate": 9.418551710895243e-05, "loss": 0.8455, "step": 280 }, { "epoch": 5.345622119815668, "grad_norm": 7.21484375, "learning_rate": 8.839070858747697e-05, "loss": 0.8316, "step": 290 }, { "epoch": 5.529953917050691, "grad_norm": 9.9375, "learning_rate": 8.263518223330697e-05, "loss": 0.8555, "step": 300 }, { "epoch": 5.714285714285714, "grad_norm": 7.5390625, "learning_rate": 7.693841292575598e-05, "loss": 0.8385, "step": 310 }, { "epoch": 5.8986175115207375, "grad_norm": 6.8515625, "learning_rate": 7.131967672889101e-05, "loss": 0.8339, "step": 320 }, { "epoch": 6.082949308755761, "grad_norm": 7.38671875, "learning_rate": 6.579798566743314e-05, "loss": 0.8497, "step": 330 }, { "epoch": 6.267281105990784, "grad_norm": 7.21875, "learning_rate": 6.039202339608432e-05, "loss": 0.836, "step": 340 }, { "epoch": 6.451612903225806, "grad_norm": 8.515625, "learning_rate": 5.5120081979953785e-05, "loss": 0.8411, "step": 350 }, { "epoch": 6.635944700460829, "grad_norm": 8.515625, "learning_rate": 5.000000000000002e-05, "loss": 0.822, "step": 360 }, { "epoch": 6.820276497695852, "grad_norm": 7.73828125, "learning_rate": 4.50491021929194e-05, "loss": 0.8298, "step": 370 }, { "epoch": 7.0046082949308754, "grad_norm": 7.15625, "learning_rate": 4.028414082972141e-05, "loss": 0.8366, "step": 380 }, { "epoch": 7.188940092165899, "grad_norm": 7.45703125, "learning_rate": 3.5721239031346066e-05, "loss": 0.8222, "step": 390 }, { "epoch": 7.373271889400922, "grad_norm": 7.94921875, "learning_rate": 3.137583621312665e-05, "loss": 0.8294, "step": 400 }, { "epoch": 7.557603686635945, "grad_norm": 7.80078125, "learning_rate": 2.7262635842695127e-05, "loss": 0.8189, "step": 410 }, { "epoch": 7.741935483870968, "grad_norm": 7.33984375, "learning_rate": 2.339555568810221e-05, "loss": 0.8279, "step": 420 }, { "epoch": 7.926267281105991, "grad_norm": 10.0390625, "learning_rate": 1.9787680724495617e-05, "loss": 0.8299, "step": 430 }, { "epoch": 8.110599078341014, "grad_norm": 8.34375, "learning_rate": 1.6451218858706374e-05, "loss": 0.8408, "step": 440 }, { "epoch": 8.294930875576037, "grad_norm": 7.22265625, "learning_rate": 1.339745962155613e-05, "loss": 0.819, "step": 450 }, { "epoch": 8.47926267281106, "grad_norm": 7.890625, "learning_rate": 1.0636735967658784e-05, "loss": 0.8087, "step": 460 }, { "epoch": 8.663594470046084, "grad_norm": 8.9296875, "learning_rate": 8.178389311972612e-06, "loss": 0.8218, "step": 470 }, { "epoch": 8.847926267281107, "grad_norm": 9.421875, "learning_rate": 6.030737921409169e-06, "loss": 0.8213, "step": 480 }, { "epoch": 9.03225806451613, "grad_norm": 7.0078125, "learning_rate": 4.20104876845111e-06, "loss": 0.8259, "step": 490 }, { "epoch": 9.216589861751151, "grad_norm": 7.5546875, "learning_rate": 2.6955129420176196e-06, "loss": 0.8463, "step": 500 }, { "epoch": 9.400921658986174, "grad_norm": 8.0546875, "learning_rate": 1.5192246987791981e-06, "loss": 0.7932, "step": 510 }, { "epoch": 9.585253456221198, "grad_norm": 8.109375, "learning_rate": 6.761642258056978e-07, "loss": 0.8172, "step": 520 }, { "epoch": 9.76958525345622, "grad_norm": 7.5859375, "learning_rate": 1.6918417287318245e-07, "loss": 0.8263, "step": 530 }, { "epoch": 9.953917050691244, "grad_norm": 8.59375, "learning_rate": 0.0, "loss": 0.8372, "step": 540 }, { "epoch": 9.953917050691244, "step": 540, "total_flos": 3.52542013784064e+16, "train_loss": 0.8746989762341535, "train_runtime": 468.0964, "train_samples_per_second": 4.636, "train_steps_per_second": 1.154 } ], "logging_steps": 10, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.52542013784064e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }