| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 454, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.022026431718061675, |
| "grad_norm": 1.2465029954910278, |
| "learning_rate": 2.1052631578947366e-06, |
| "loss": 1.3594, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.04405286343612335, |
| "grad_norm": 0.7338076233863831, |
| "learning_rate": 4.736842105263158e-06, |
| "loss": 1.3607, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06607929515418502, |
| "grad_norm": 0.5697624087333679, |
| "learning_rate": 7.3684210526315784e-06, |
| "loss": 1.3328, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0881057268722467, |
| "grad_norm": 0.661296010017395, |
| "learning_rate": 9.999999999999999e-06, |
| "loss": 1.3393, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11013215859030837, |
| "grad_norm": 0.6124210357666016, |
| "learning_rate": 1.263157894736842e-05, |
| "loss": 1.2621, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.13215859030837004, |
| "grad_norm": 0.5665815472602844, |
| "learning_rate": 1.5263157894736842e-05, |
| "loss": 1.2624, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.15418502202643172, |
| "grad_norm": 0.5574439167976379, |
| "learning_rate": 1.7894736842105264e-05, |
| "loss": 1.2487, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.1762114537444934, |
| "grad_norm": 0.4665200412273407, |
| "learning_rate": 2.0526315789473685e-05, |
| "loss": 1.2335, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.19823788546255505, |
| "grad_norm": 0.4646995961666107, |
| "learning_rate": 2.3157894736842103e-05, |
| "loss": 1.1568, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.22026431718061673, |
| "grad_norm": 0.5178118348121643, |
| "learning_rate": 2.578947368421053e-05, |
| "loss": 1.214, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2422907488986784, |
| "grad_norm": 0.41648760437965393, |
| "learning_rate": 2.8421052631578946e-05, |
| "loss": 1.151, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.2643171806167401, |
| "grad_norm": 0.5988641977310181, |
| "learning_rate": 2.9999745210076202e-05, |
| "loss": 1.1945, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.28634361233480177, |
| "grad_norm": 0.4767996370792389, |
| "learning_rate": 2.9996878922838097e-05, |
| "loss": 1.0975, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.30837004405286345, |
| "grad_norm": 0.547401487827301, |
| "learning_rate": 2.9990828471561044e-05, |
| "loss": 1.1063, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3303964757709251, |
| "grad_norm": 0.5365089178085327, |
| "learning_rate": 2.998159514088762e-05, |
| "loss": 1.1153, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3524229074889868, |
| "grad_norm": 0.5236433148384094, |
| "learning_rate": 2.9969180891255046e-05, |
| "loss": 1.088, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3744493392070485, |
| "grad_norm": 0.6282631754875183, |
| "learning_rate": 2.995358835847891e-05, |
| "loss": 1.0695, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.3964757709251101, |
| "grad_norm": 0.6258669495582581, |
| "learning_rate": 2.9934820853193538e-05, |
| "loss": 1.0281, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.4185022026431718, |
| "grad_norm": 0.6035173535346985, |
| "learning_rate": 2.991288236014907e-05, |
| "loss": 1.0368, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.44052863436123346, |
| "grad_norm": 0.553928554058075, |
| "learning_rate": 2.9887777537365416e-05, |
| "loss": 0.9775, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.46255506607929514, |
| "grad_norm": 0.6714206337928772, |
| "learning_rate": 2.985951171514326e-05, |
| "loss": 0.9667, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.4845814977973568, |
| "grad_norm": 0.640594482421875, |
| "learning_rate": 2.982809089493231e-05, |
| "loss": 0.9989, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5066079295154186, |
| "grad_norm": 0.6225477457046509, |
| "learning_rate": 2.9793521748057065e-05, |
| "loss": 1.011, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5286343612334802, |
| "grad_norm": 0.6979171633720398, |
| "learning_rate": 2.975581161430035e-05, |
| "loss": 0.9919, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5506607929515418, |
| "grad_norm": 0.7126419544219971, |
| "learning_rate": 2.971496850034492e-05, |
| "loss": 0.9232, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5726872246696035, |
| "grad_norm": 0.8123069405555725, |
| "learning_rate": 2.9671001078073453e-05, |
| "loss": 0.932, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5947136563876652, |
| "grad_norm": 0.8383358120918274, |
| "learning_rate": 2.9623918682727355e-05, |
| "loss": 0.9127, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.6167400881057269, |
| "grad_norm": 0.7429929971694946, |
| "learning_rate": 2.957373131092464e-05, |
| "loss": 0.9187, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6387665198237885, |
| "grad_norm": 0.9482147693634033, |
| "learning_rate": 2.9520449618537465e-05, |
| "loss": 0.8848, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6607929515418502, |
| "grad_norm": 0.8874338865280151, |
| "learning_rate": 2.946408491842964e-05, |
| "loss": 0.8802, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6828193832599119, |
| "grad_norm": 0.7921388149261475, |
| "learning_rate": 2.940464917805466e-05, |
| "loss": 0.8105, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.7048458149779736, |
| "grad_norm": 0.823284924030304, |
| "learning_rate": 2.9342155016914772e-05, |
| "loss": 0.8335, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.7268722466960352, |
| "grad_norm": 0.8587915301322937, |
| "learning_rate": 2.927661570388155e-05, |
| "loss": 0.8236, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.748898678414097, |
| "grad_norm": 0.9209868311882019, |
| "learning_rate": 2.920804515437865e-05, |
| "loss": 0.7313, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.7709251101321586, |
| "grad_norm": 0.9900104999542236, |
| "learning_rate": 2.9136457927427254e-05, |
| "loss": 0.8445, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7929515418502202, |
| "grad_norm": 1.1005882024765015, |
| "learning_rate": 2.9061869222554863e-05, |
| "loss": 0.7258, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8149779735682819, |
| "grad_norm": 0.7985159754753113, |
| "learning_rate": 2.898429487656813e-05, |
| "loss": 0.7554, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.8370044052863436, |
| "grad_norm": 1.0106185674667358, |
| "learning_rate": 2.8903751360190327e-05, |
| "loss": 0.7773, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8590308370044053, |
| "grad_norm": 1.0603998899459839, |
| "learning_rate": 2.8820255774564287e-05, |
| "loss": 0.8034, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8810572687224669, |
| "grad_norm": 0.9879089593887329, |
| "learning_rate": 2.8733825847621436e-05, |
| "loss": 0.71, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9030837004405287, |
| "grad_norm": 1.0008033514022827, |
| "learning_rate": 2.8644479930317776e-05, |
| "loss": 0.7837, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.9251101321585903, |
| "grad_norm": 1.0469297170639038, |
| "learning_rate": 2.8552236992737572e-05, |
| "loss": 0.7139, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.947136563876652, |
| "grad_norm": 0.9759002327919006, |
| "learning_rate": 2.8457116620065596e-05, |
| "loss": 0.7207, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.9691629955947136, |
| "grad_norm": 0.907706618309021, |
| "learning_rate": 2.8359139008428758e-05, |
| "loss": 0.7134, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9911894273127754, |
| "grad_norm": 1.022333025932312, |
| "learning_rate": 2.8258324960608043e-05, |
| "loss": 0.669, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.013215859030837, |
| "grad_norm": 1.021018385887146, |
| "learning_rate": 2.815469588162161e-05, |
| "loss": 0.6179, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.0352422907488987, |
| "grad_norm": 1.1175833940505981, |
| "learning_rate": 2.8048273774180043e-05, |
| "loss": 0.5932, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.0572687224669604, |
| "grad_norm": 0.9679650068283081, |
| "learning_rate": 2.7939081234014708e-05, |
| "loss": 0.5915, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.079295154185022, |
| "grad_norm": 1.1451464891433716, |
| "learning_rate": 2.7827141445080196e-05, |
| "loss": 0.5768, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.1013215859030836, |
| "grad_norm": 1.274778127670288, |
| "learning_rate": 2.7712478174631813e-05, |
| "loss": 0.5711, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1233480176211454, |
| "grad_norm": 1.4071186780929565, |
| "learning_rate": 2.759511576817934e-05, |
| "loss": 0.5991, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.145374449339207, |
| "grad_norm": 1.2345290184020996, |
| "learning_rate": 2.747507914431791e-05, |
| "loss": 0.5184, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1674008810572687, |
| "grad_norm": 1.345145344734192, |
| "learning_rate": 2.7352393789437258e-05, |
| "loss": 0.5362, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.1894273127753303, |
| "grad_norm": 1.1527823209762573, |
| "learning_rate": 2.7227085752310413e-05, |
| "loss": 0.543, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.2114537444933922, |
| "grad_norm": 1.1420563459396362, |
| "learning_rate": 2.709918163856295e-05, |
| "loss": 0.5326, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.2334801762114538, |
| "grad_norm": 1.0940356254577637, |
| "learning_rate": 2.696870860502408e-05, |
| "loss": 0.5188, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.2555066079295154, |
| "grad_norm": 1.2769014835357666, |
| "learning_rate": 2.6835694353960623e-05, |
| "loss": 0.5029, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.277533039647577, |
| "grad_norm": 1.3038066625595093, |
| "learning_rate": 2.6700167127195233e-05, |
| "loss": 0.5492, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2995594713656389, |
| "grad_norm": 1.0858526229858398, |
| "learning_rate": 2.6562155700110046e-05, |
| "loss": 0.5278, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.3215859030837005, |
| "grad_norm": 1.1577568054199219, |
| "learning_rate": 2.6421689375537015e-05, |
| "loss": 0.5314, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3436123348017621, |
| "grad_norm": 1.1254470348358154, |
| "learning_rate": 2.6278797977536325e-05, |
| "loss": 0.466, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.3656387665198237, |
| "grad_norm": 1.2429901361465454, |
| "learning_rate": 2.613351184506405e-05, |
| "loss": 0.5185, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.3876651982378854, |
| "grad_norm": 1.3175548315048218, |
| "learning_rate": 2.598586182553056e-05, |
| "loss": 0.5077, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.4096916299559472, |
| "grad_norm": 1.3169467449188232, |
| "learning_rate": 2.5835879268250934e-05, |
| "loss": 0.5124, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.4317180616740088, |
| "grad_norm": 1.2650501728057861, |
| "learning_rate": 2.568359601778881e-05, |
| "loss": 0.4961, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.4537444933920705, |
| "grad_norm": 1.1624271869659424, |
| "learning_rate": 2.5529044407195127e-05, |
| "loss": 0.4552, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.475770925110132, |
| "grad_norm": 1.1693929433822632, |
| "learning_rate": 2.5372257251143056e-05, |
| "loss": 0.4668, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.497797356828194, |
| "grad_norm": 1.105682134628296, |
| "learning_rate": 2.5213267838960772e-05, |
| "loss": 0.4485, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.5198237885462555, |
| "grad_norm": 1.365013599395752, |
| "learning_rate": 2.5052109927563393e-05, |
| "loss": 0.4641, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.5418502202643172, |
| "grad_norm": 1.2356834411621094, |
| "learning_rate": 2.4888817734285657e-05, |
| "loss": 0.4388, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.5638766519823788, |
| "grad_norm": 1.8782477378845215, |
| "learning_rate": 2.472342592961683e-05, |
| "loss": 0.4556, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.5859030837004404, |
| "grad_norm": 1.1085964441299438, |
| "learning_rate": 2.4555969629839393e-05, |
| "loss": 0.4284, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.607929515418502, |
| "grad_norm": 1.3663532733917236, |
| "learning_rate": 2.4386484389573126e-05, |
| "loss": 0.4322, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.6299559471365639, |
| "grad_norm": 1.307347297668457, |
| "learning_rate": 2.421500619422606e-05, |
| "loss": 0.4378, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.6519823788546255, |
| "grad_norm": 1.0821038484573364, |
| "learning_rate": 2.4041571452353982e-05, |
| "loss": 0.4003, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.6740088105726874, |
| "grad_norm": 1.056179404258728, |
| "learning_rate": 2.386621698793015e-05, |
| "loss": 0.4161, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.696035242290749, |
| "grad_norm": 1.2139962911605835, |
| "learning_rate": 2.3688980032526707e-05, |
| "loss": 0.3905, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.7180616740088106, |
| "grad_norm": 1.0942909717559814, |
| "learning_rate": 2.3509898217409645e-05, |
| "loss": 0.4268, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.7400881057268722, |
| "grad_norm": 1.277398705482483, |
| "learning_rate": 2.3329009565548857e-05, |
| "loss": 0.4007, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.7621145374449338, |
| "grad_norm": 1.1945730447769165, |
| "learning_rate": 2.3146352483545026e-05, |
| "loss": 0.3755, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.7841409691629955, |
| "grad_norm": 1.1625131368637085, |
| "learning_rate": 2.2961965753475074e-05, |
| "loss": 0.4081, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.8061674008810573, |
| "grad_norm": 1.1497328281402588, |
| "learning_rate": 2.277588852465788e-05, |
| "loss": 0.3711, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.828193832599119, |
| "grad_norm": 1.1712104082107544, |
| "learning_rate": 2.2588160305342024e-05, |
| "loss": 0.4283, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.8502202643171806, |
| "grad_norm": 1.3465914726257324, |
| "learning_rate": 2.2398820954317342e-05, |
| "loss": 0.3948, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.8722466960352424, |
| "grad_norm": 1.3581944704055786, |
| "learning_rate": 2.220791067245201e-05, |
| "loss": 0.4051, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.894273127753304, |
| "grad_norm": 1.656079649925232, |
| "learning_rate": 2.201546999415704e-05, |
| "loss": 0.3842, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.9162995594713657, |
| "grad_norm": 1.3536030054092407, |
| "learning_rate": 2.182153977877994e-05, |
| "loss": 0.3618, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.9383259911894273, |
| "grad_norm": 1.2806636095046997, |
| "learning_rate": 2.162616120192939e-05, |
| "loss": 0.362, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.960352422907489, |
| "grad_norm": 1.1593424081802368, |
| "learning_rate": 2.142937574673275e-05, |
| "loss": 0.3802, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.9823788546255505, |
| "grad_norm": 1.5513302087783813, |
| "learning_rate": 2.12312251950283e-05, |
| "loss": 0.36, |
| "step": 450 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1135, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.563190790820987e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|