{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 461, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021691973969631237, "grad_norm": 11.479755487865775, "learning_rate": 1.0638297872340425e-08, "loss": 1.204, "step": 1 }, { "epoch": 0.010845986984815618, "grad_norm": 11.719854924785, "learning_rate": 5.3191489361702123e-08, "loss": 1.2705, "step": 5 }, { "epoch": 0.021691973969631236, "grad_norm": 12.5013105505897, "learning_rate": 1.0638297872340425e-07, "loss": 1.2569, "step": 10 }, { "epoch": 0.03253796095444685, "grad_norm": 10.664645875250606, "learning_rate": 1.5957446808510638e-07, "loss": 1.2504, "step": 15 }, { "epoch": 0.04338394793926247, "grad_norm": 9.933034533041308, "learning_rate": 2.127659574468085e-07, "loss": 1.2132, "step": 20 }, { "epoch": 0.05422993492407809, "grad_norm": 5.664415659701916, "learning_rate": 2.659574468085106e-07, "loss": 1.1718, "step": 25 }, { "epoch": 0.0650759219088937, "grad_norm": 4.799955074967526, "learning_rate": 3.1914893617021275e-07, "loss": 1.1492, "step": 30 }, { "epoch": 0.07592190889370933, "grad_norm": 3.5056771066313868, "learning_rate": 3.7234042553191484e-07, "loss": 1.1193, "step": 35 }, { "epoch": 0.08676789587852494, "grad_norm": 2.758357067802773, "learning_rate": 4.25531914893617e-07, "loss": 1.0995, "step": 40 }, { "epoch": 0.09761388286334056, "grad_norm": 2.512596535973654, "learning_rate": 4.787234042553192e-07, "loss": 1.0789, "step": 45 }, { "epoch": 0.10845986984815618, "grad_norm": 2.4219547091489244, "learning_rate": 4.999352212103373e-07, "loss": 1.0741, "step": 50 }, { "epoch": 0.1193058568329718, "grad_norm": 2.359757061351952, "learning_rate": 4.995394723941067e-07, "loss": 1.0585, "step": 55 }, { "epoch": 0.1301518438177874, "grad_norm": 2.4328040984593247, "learning_rate": 4.98784531935359e-07, "loss": 1.0568, "step": 60 }, { "epoch": 0.14099783080260303, "grad_norm": 2.3052389483909543, "learning_rate": 4.976714865090826e-07, "loss": 1.0638, "step": 65 }, { "epoch": 0.15184381778741865, "grad_norm": 2.198699363060523, "learning_rate": 4.96201938253052e-07, "loss": 1.0349, "step": 70 }, { "epoch": 0.16268980477223427, "grad_norm": 2.138037840029951, "learning_rate": 4.943780024616802e-07, "loss": 1.034, "step": 75 }, { "epoch": 0.1735357917570499, "grad_norm": 2.1989759782977076, "learning_rate": 4.922023045412265e-07, "loss": 1.0295, "step": 80 }, { "epoch": 0.1843817787418655, "grad_norm": 2.2941860079446235, "learning_rate": 4.896779762307389e-07, "loss": 1.0262, "step": 85 }, { "epoch": 0.19522776572668113, "grad_norm": 2.1481967771932773, "learning_rate": 4.868086510941716e-07, "loss": 1.0154, "step": 90 }, { "epoch": 0.20607375271149675, "grad_norm": 2.160671871794234, "learning_rate": 4.835984592901677e-07, "loss": 1.0468, "step": 95 }, { "epoch": 0.21691973969631237, "grad_norm": 2.1550419124695783, "learning_rate": 4.800520216270341e-07, "loss": 1.0336, "step": 100 }, { "epoch": 0.227765726681128, "grad_norm": 2.2253199590666157, "learning_rate": 4.7617444291146555e-07, "loss": 1.0224, "step": 105 }, { "epoch": 0.2386117136659436, "grad_norm": 2.183801883640947, "learning_rate": 4.7197130460059377e-07, "loss": 1.0139, "step": 110 }, { "epoch": 0.24945770065075923, "grad_norm": 2.177485956300299, "learning_rate": 4.6744865676793666e-07, "loss": 1.0053, "step": 115 }, { "epoch": 0.2603036876355748, "grad_norm": 2.1256298478006856, "learning_rate": 4.6261300939481274e-07, "loss": 0.9914, "step": 120 }, { "epoch": 0.27114967462039047, "grad_norm": 2.1075218318516886, "learning_rate": 4.574713229997563e-07, "loss": 1.0006, "step": 125 }, { "epoch": 0.28199566160520606, "grad_norm": 2.1304778495096457, "learning_rate": 4.520309986194201e-07, "loss": 1.0338, "step": 130 }, { "epoch": 0.2928416485900217, "grad_norm": 2.1650488920006046, "learning_rate": 4.462998671553897e-07, "loss": 1.0017, "step": 135 }, { "epoch": 0.3036876355748373, "grad_norm": 2.214789820606799, "learning_rate": 4.4028617810224115e-07, "loss": 1.0101, "step": 140 }, { "epoch": 0.31453362255965295, "grad_norm": 2.157917041478876, "learning_rate": 4.3399858767306927e-07, "loss": 0.989, "step": 145 }, { "epoch": 0.32537960954446854, "grad_norm": 2.092011368725107, "learning_rate": 4.2744614633957723e-07, "loss": 1.0115, "step": 150 }, { "epoch": 0.3362255965292842, "grad_norm": 2.219786897391391, "learning_rate": 4.206382858046635e-07, "loss": 0.9853, "step": 155 }, { "epoch": 0.3470715835140998, "grad_norm": 2.0979071528204756, "learning_rate": 4.135848054262578e-07, "loss": 1.0163, "step": 160 }, { "epoch": 0.3579175704989154, "grad_norm": 2.1698498942112288, "learning_rate": 4.062958581119472e-07, "loss": 0.9865, "step": 165 }, { "epoch": 0.368763557483731, "grad_norm": 2.132812386449239, "learning_rate": 3.9878193570469743e-07, "loss": 0.9871, "step": 170 }, { "epoch": 0.3796095444685466, "grad_norm": 2.337071820270912, "learning_rate": 3.91053853880703e-07, "loss": 0.984, "step": 175 }, { "epoch": 0.39045553145336226, "grad_norm": 2.1129666341045263, "learning_rate": 3.831227365811074e-07, "loss": 0.9943, "step": 180 }, { "epoch": 0.40130151843817785, "grad_norm": 2.121223339662348, "learning_rate": 3.75e-07, "loss": 0.9918, "step": 185 }, { "epoch": 0.4121475054229935, "grad_norm": 2.285212865590998, "learning_rate": 3.6669733615173965e-07, "loss": 0.9843, "step": 190 }, { "epoch": 0.4229934924078091, "grad_norm": 2.123810421477716, "learning_rate": 3.5822669604125684e-07, "loss": 0.9781, "step": 195 }, { "epoch": 0.43383947939262474, "grad_norm": 2.1627048249065997, "learning_rate": 3.4960027246156036e-07, "loss": 0.9781, "step": 200 }, { "epoch": 0.44468546637744033, "grad_norm": 2.137516375815561, "learning_rate": 3.408304824432103e-07, "loss": 0.9867, "step": 205 }, { "epoch": 0.455531453362256, "grad_norm": 2.058882247220635, "learning_rate": 3.319299493810187e-07, "loss": 0.9776, "step": 210 }, { "epoch": 0.46637744034707157, "grad_norm": 2.1433762495027864, "learning_rate": 3.229114848637062e-07, "loss": 0.9879, "step": 215 }, { "epoch": 0.4772234273318872, "grad_norm": 2.1877299182133734, "learning_rate": 3.13788070232669e-07, "loss": 0.9812, "step": 220 }, { "epoch": 0.4880694143167028, "grad_norm": 2.0613909390774525, "learning_rate": 3.0457283789640036e-07, "loss": 1.0005, "step": 225 }, { "epoch": 0.49891540130151846, "grad_norm": 2.137287960677187, "learning_rate": 2.9527905242746395e-07, "loss": 0.9994, "step": 230 }, { "epoch": 0.5097613882863341, "grad_norm": 2.062311669373047, "learning_rate": 2.85920091469227e-07, "loss": 0.9806, "step": 235 }, { "epoch": 0.5206073752711496, "grad_norm": 2.1009325069210054, "learning_rate": 2.765094264798387e-07, "loss": 0.9905, "step": 240 }, { "epoch": 0.5314533622559653, "grad_norm": 2.252996820494139, "learning_rate": 2.6706060334116775e-07, "loss": 0.9833, "step": 245 }, { "epoch": 0.5422993492407809, "grad_norm": 2.063111058570344, "learning_rate": 2.575872228606156e-07, "loss": 0.9466, "step": 250 }, { "epoch": 0.5531453362255966, "grad_norm": 2.1028294210091794, "learning_rate": 2.4810292119386674e-07, "loss": 0.9556, "step": 255 }, { "epoch": 0.5639913232104121, "grad_norm": 2.121237281068362, "learning_rate": 2.3862135021675915e-07, "loss": 0.9717, "step": 260 }, { "epoch": 0.5748373101952278, "grad_norm": 2.2205226946736905, "learning_rate": 2.2915615787452664e-07, "loss": 0.9638, "step": 265 }, { "epoch": 0.5856832971800434, "grad_norm": 2.0940361008010333, "learning_rate": 2.1972096853669903e-07, "loss": 0.9671, "step": 270 }, { "epoch": 0.596529284164859, "grad_norm": 2.0586787374748576, "learning_rate": 2.1032936338593717e-07, "loss": 0.9773, "step": 275 }, { "epoch": 0.6073752711496746, "grad_norm": 2.072790057342988, "learning_rate": 2.0099486086903294e-07, "loss": 0.9684, "step": 280 }, { "epoch": 0.6182212581344902, "grad_norm": 2.1778824831003147, "learning_rate": 1.9173089723821087e-07, "loss": 0.9667, "step": 285 }, { "epoch": 0.6290672451193059, "grad_norm": 2.115203742538656, "learning_rate": 1.825508072107439e-07, "loss": 0.9919, "step": 290 }, { "epoch": 0.6399132321041214, "grad_norm": 2.9734190506035536, "learning_rate": 1.7346780477471897e-07, "loss": 0.9741, "step": 295 }, { "epoch": 0.6507592190889371, "grad_norm": 2.109405285224148, "learning_rate": 1.6449496416858282e-07, "loss": 0.9815, "step": 300 }, { "epoch": 0.6616052060737527, "grad_norm": 2.0612586919012355, "learning_rate": 1.5564520106184643e-07, "loss": 0.9637, "step": 305 }, { "epoch": 0.6724511930585684, "grad_norm": 2.126282772097656, "learning_rate": 1.4693125396403562e-07, "loss": 0.9576, "step": 310 }, { "epoch": 0.6832971800433839, "grad_norm": 2.151977422363772, "learning_rate": 1.3836566588865e-07, "loss": 0.9871, "step": 315 }, { "epoch": 0.6941431670281996, "grad_norm": 2.10699457923422, "learning_rate": 1.2996076629852112e-07, "loss": 0.9775, "step": 320 }, { "epoch": 0.7049891540130152, "grad_norm": 2.2306202203282366, "learning_rate": 1.2172865335856064e-07, "loss": 0.9869, "step": 325 }, { "epoch": 0.7158351409978309, "grad_norm": 2.1309989319293696, "learning_rate": 1.1368117652144185e-07, "loss": 0.9871, "step": 330 }, { "epoch": 0.7266811279826464, "grad_norm": 2.1871677586329175, "learning_rate": 1.0582991947128323e-07, "loss": 0.983, "step": 335 }, { "epoch": 0.737527114967462, "grad_norm": 2.089837928751529, "learning_rate": 9.818618344988258e-08, "loss": 0.9817, "step": 340 }, { "epoch": 0.7483731019522777, "grad_norm": 2.0529724920050905, "learning_rate": 9.076097098950541e-08, "loss": 0.9581, "step": 345 }, { "epoch": 0.7592190889370932, "grad_norm": 2.077217246150324, "learning_rate": 8.356497007563986e-08, "loss": 0.9845, "step": 350 }, { "epoch": 0.7700650759219089, "grad_norm": 2.1176827367409876, "learning_rate": 7.660853876251683e-08, "loss": 0.9625, "step": 355 }, { "epoch": 0.7809110629067245, "grad_norm": 2.1409566690457433, "learning_rate": 6.990169026353867e-08, "loss": 0.9999, "step": 360 }, { "epoch": 0.7917570498915402, "grad_norm": 2.0254717410055703, "learning_rate": 6.345407853807863e-08, "loss": 0.9642, "step": 365 }, { "epoch": 0.8026030368763557, "grad_norm": 2.140899328755212, "learning_rate": 5.727498439539602e-08, "loss": 0.9717, "step": 370 }, { "epoch": 0.8134490238611713, "grad_norm": 2.0666287819389404, "learning_rate": 5.13733021356714e-08, "loss": 0.9697, "step": 375 }, { "epoch": 0.824295010845987, "grad_norm": 2.0741290971837127, "learning_rate": 4.57575267473895e-08, "loss": 0.9641, "step": 380 }, { "epoch": 0.8351409978308026, "grad_norm": 2.108990519804528, "learning_rate": 4.043574167949892e-08, "loss": 0.9902, "step": 385 }, { "epoch": 0.8459869848156182, "grad_norm": 2.094620559269544, "learning_rate": 3.541560720594869e-08, "loss": 0.9926, "step": 390 }, { "epoch": 0.8568329718004338, "grad_norm": 2.0594257251140586, "learning_rate": 3.0704349399351435e-08, "loss": 0.9614, "step": 395 }, { "epoch": 0.8676789587852495, "grad_norm": 2.1291510379356753, "learning_rate": 2.6308749729643058e-08, "loss": 0.9651, "step": 400 }, { "epoch": 0.8785249457700651, "grad_norm": 2.1035620640567734, "learning_rate": 2.2235135302712092e-08, "loss": 0.9952, "step": 405 }, { "epoch": 0.8893709327548807, "grad_norm": 2.0424482935650614, "learning_rate": 1.8489369753048682e-08, "loss": 0.9631, "step": 410 }, { "epoch": 0.9002169197396963, "grad_norm": 2.0939056533766847, "learning_rate": 1.507684480352292e-08, "loss": 0.9747, "step": 415 }, { "epoch": 0.911062906724512, "grad_norm": 4.429046210547233, "learning_rate": 1.2002472504440807e-08, "loss": 0.9843, "step": 420 }, { "epoch": 0.9219088937093276, "grad_norm": 2.1047314550346297, "learning_rate": 9.270678163050217e-09, "loss": 0.9738, "step": 425 }, { "epoch": 0.9327548806941431, "grad_norm": 2.0998363541587888, "learning_rate": 6.885393973673298e-09, "loss": 0.9509, "step": 430 }, { "epoch": 0.9436008676789588, "grad_norm": 2.138035512894866, "learning_rate": 4.850053357634693e-09, "loss": 0.9799, "step": 435 }, { "epoch": 0.9544468546637744, "grad_norm": 2.053994471901985, "learning_rate": 3.1675860211325954e-09, "loss": 0.9762, "step": 440 }, { "epoch": 0.96529284164859, "grad_norm": 2.054984790757136, "learning_rate": 1.840413738166402e-09, "loss": 0.9499, "step": 445 }, { "epoch": 0.9761388286334056, "grad_norm": 2.0710147404144097, "learning_rate": 8.704468645914787e-10, "loss": 0.9692, "step": 450 }, { "epoch": 0.9869848156182213, "grad_norm": 2.067619347023652, "learning_rate": 2.5908158831811077e-10, "loss": 0.9994, "step": 455 }, { "epoch": 0.9978308026030369, "grad_norm": 2.095822455094207, "learning_rate": 7.197919613455284e-12, "loss": 0.9524, "step": 460 }, { "epoch": 1.0, "eval_loss": 1.0288481712341309, "eval_runtime": 3.4782, "eval_samples_per_second": 74.465, "eval_steps_per_second": 1.438, "step": 461 }, { "epoch": 1.0, "step": 461, "total_flos": 192943352709120.0, "train_loss": 1.0081440616326323, "train_runtime": 5423.7345, "train_samples_per_second": 21.752, "train_steps_per_second": 0.085 } ], "logging_steps": 5, "max_steps": 461, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 192943352709120.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }