{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4925373134328357, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018656716417910446, "grad_norm": 2.403158187866211, "learning_rate": 4.9998282347929784e-05, "loss": 3.3875, "step": 5 }, { "epoch": 0.03731343283582089, "grad_norm": 2.301710367202759, "learning_rate": 4.99931296277454e-05, "loss": 2.9015, "step": 10 }, { "epoch": 0.055970149253731345, "grad_norm": 1.271048665046692, "learning_rate": 4.998454254749331e-05, "loss": 2.6229, "step": 15 }, { "epoch": 0.07462686567164178, "grad_norm": 1.069893717765808, "learning_rate": 4.997252228714279e-05, "loss": 2.3704, "step": 20 }, { "epoch": 0.09328358208955224, "grad_norm": 0.9044906497001648, "learning_rate": 4.9957070498423854e-05, "loss": 2.3782, "step": 25 }, { "epoch": 0.11194029850746269, "grad_norm": 0.9635376334190369, "learning_rate": 4.993818930460026e-05, "loss": 2.3576, "step": 30 }, { "epoch": 0.13059701492537312, "grad_norm": 0.8513979315757751, "learning_rate": 4.9915881300177725e-05, "loss": 2.4603, "step": 35 }, { "epoch": 0.14925373134328357, "grad_norm": 0.845267117023468, "learning_rate": 4.9890149550547454e-05, "loss": 2.2033, "step": 40 }, { "epoch": 0.16791044776119404, "grad_norm": 0.6632418036460876, "learning_rate": 4.98609975915649e-05, "loss": 2.1851, "step": 45 }, { "epoch": 0.1865671641791045, "grad_norm": 0.6857479810714722, "learning_rate": 4.982842942906386e-05, "loss": 2.3592, "step": 50 }, { "epoch": 0.20522388059701493, "grad_norm": 0.7204287648200989, "learning_rate": 4.979244953830608e-05, "loss": 2.1323, "step": 55 }, { "epoch": 0.22388059701492538, "grad_norm": 0.6864420175552368, "learning_rate": 4.9753062863366276e-05, "loss": 2.2138, "step": 60 }, { "epoch": 0.24253731343283583, "grad_norm": 0.7536088228225708, "learning_rate": 4.971027481645274e-05, "loss": 2.2584, "step": 65 }, { "epoch": 0.26119402985074625, "grad_norm": 0.9708526134490967, "learning_rate": 4.966409127716367e-05, "loss": 2.2669, "step": 70 }, { "epoch": 0.2798507462686567, "grad_norm": 0.7516190409660339, "learning_rate": 4.96145185916792e-05, "loss": 2.2133, "step": 75 }, { "epoch": 0.29850746268656714, "grad_norm": 0.7864778637886047, "learning_rate": 4.95615635718894e-05, "loss": 2.1683, "step": 80 }, { "epoch": 0.31716417910447764, "grad_norm": 0.7846741080284119, "learning_rate": 4.950523349445824e-05, "loss": 2.1274, "step": 85 }, { "epoch": 0.3358208955223881, "grad_norm": 0.816838800907135, "learning_rate": 4.944553609982363e-05, "loss": 2.2033, "step": 90 }, { "epoch": 0.35447761194029853, "grad_norm": 0.7661916017532349, "learning_rate": 4.938247959113386e-05, "loss": 2.1492, "step": 95 }, { "epoch": 0.373134328358209, "grad_norm": 0.8964986205101013, "learning_rate": 4.931607263312032e-05, "loss": 2.0862, "step": 100 }, { "epoch": 0.3917910447761194, "grad_norm": 0.8603547215461731, "learning_rate": 4.924632435090696e-05, "loss": 2.1444, "step": 105 }, { "epoch": 0.41044776119402987, "grad_norm": 0.8611045479774475, "learning_rate": 4.917324432875627e-05, "loss": 2.1202, "step": 110 }, { "epoch": 0.4291044776119403, "grad_norm": 0.9499636888504028, "learning_rate": 4.909684260875235e-05, "loss": 2.1285, "step": 115 }, { "epoch": 0.44776119402985076, "grad_norm": 0.8490393161773682, "learning_rate": 4.9017129689421e-05, "loss": 2.236, "step": 120 }, { "epoch": 0.4664179104477612, "grad_norm": 0.9628555178642273, "learning_rate": 4.893411652428712e-05, "loss": 2.1219, "step": 125 }, { "epoch": 0.48507462686567165, "grad_norm": 1.1119599342346191, "learning_rate": 4.8847814520369475e-05, "loss": 2.2537, "step": 130 }, { "epoch": 0.503731343283582, "grad_norm": 0.9489665627479553, "learning_rate": 4.875823553661334e-05, "loss": 2.1018, "step": 135 }, { "epoch": 0.5223880597014925, "grad_norm": 0.9434083700180054, "learning_rate": 4.8665391882260856e-05, "loss": 2.0809, "step": 140 }, { "epoch": 0.5410447761194029, "grad_norm": 0.8856557607650757, "learning_rate": 4.856929631515964e-05, "loss": 2.0807, "step": 145 }, { "epoch": 0.5597014925373134, "grad_norm": 0.8770031929016113, "learning_rate": 4.846996204000967e-05, "loss": 2.0843, "step": 150 }, { "epoch": 0.5783582089552238, "grad_norm": 0.8374930620193481, "learning_rate": 4.8367402706548805e-05, "loss": 2.1869, "step": 155 }, { "epoch": 0.5970149253731343, "grad_norm": 1.0829132795333862, "learning_rate": 4.8261632407677174e-05, "loss": 2.028, "step": 160 }, { "epoch": 0.6156716417910447, "grad_norm": 0.9735206365585327, "learning_rate": 4.815266567752059e-05, "loss": 2.0966, "step": 165 }, { "epoch": 0.6343283582089553, "grad_norm": 1.087944746017456, "learning_rate": 4.804051748943343e-05, "loss": 2.0863, "step": 170 }, { "epoch": 0.6529850746268657, "grad_norm": 0.8176729083061218, "learning_rate": 4.792520325394111e-05, "loss": 2.1135, "step": 175 }, { "epoch": 0.6716417910447762, "grad_norm": 0.9173070788383484, "learning_rate": 4.780673881662242e-05, "loss": 2.0564, "step": 180 }, { "epoch": 0.6902985074626866, "grad_norm": 0.9463202953338623, "learning_rate": 4.7685140455932267e-05, "loss": 2.1579, "step": 185 }, { "epoch": 0.7089552238805971, "grad_norm": 1.149950385093689, "learning_rate": 4.756042488096471e-05, "loss": 2.1447, "step": 190 }, { "epoch": 0.7276119402985075, "grad_norm": 0.940965473651886, "learning_rate": 4.743260922915701e-05, "loss": 2.0823, "step": 195 }, { "epoch": 0.746268656716418, "grad_norm": 0.9384671449661255, "learning_rate": 4.730171106393466e-05, "loss": 2.1445, "step": 200 }, { "epoch": 0.7649253731343284, "grad_norm": 0.8937250971794128, "learning_rate": 4.716774837229804e-05, "loss": 2.014, "step": 205 }, { "epoch": 0.7835820895522388, "grad_norm": 0.8928058743476868, "learning_rate": 4.7030739562350713e-05, "loss": 2.1882, "step": 210 }, { "epoch": 0.8022388059701493, "grad_norm": 1.0239906311035156, "learning_rate": 4.6890703460769955e-05, "loss": 2.1042, "step": 215 }, { "epoch": 0.8208955223880597, "grad_norm": 1.0555064678192139, "learning_rate": 4.674765931021976e-05, "loss": 2.015, "step": 220 }, { "epoch": 0.8395522388059702, "grad_norm": 1.084709882736206, "learning_rate": 4.6601626766706626e-05, "loss": 2.0603, "step": 225 }, { "epoch": 0.8582089552238806, "grad_norm": 0.9265861511230469, "learning_rate": 4.645262589687861e-05, "loss": 2.1006, "step": 230 }, { "epoch": 0.8768656716417911, "grad_norm": 1.0058296918869019, "learning_rate": 4.6300677175267914e-05, "loss": 2.063, "step": 235 }, { "epoch": 0.8955223880597015, "grad_norm": 1.0766576528549194, "learning_rate": 4.614580148147744e-05, "loss": 2.0781, "step": 240 }, { "epoch": 0.914179104477612, "grad_norm": 1.0215730667114258, "learning_rate": 4.598802009731167e-05, "loss": 2.1774, "step": 245 }, { "epoch": 0.9328358208955224, "grad_norm": 0.9870419502258301, "learning_rate": 4.582735470385229e-05, "loss": 1.9636, "step": 250 }, { "epoch": 0.9514925373134329, "grad_norm": 1.1921675205230713, "learning_rate": 4.5663827378478975e-05, "loss": 2.0141, "step": 255 }, { "epoch": 0.9701492537313433, "grad_norm": 1.0618964433670044, "learning_rate": 4.5497460591835615e-05, "loss": 2.0508, "step": 260 }, { "epoch": 0.9888059701492538, "grad_norm": 0.9723111391067505, "learning_rate": 4.532827720474268e-05, "loss": 2.0312, "step": 265 }, { "epoch": 1.007462686567164, "grad_norm": 0.9339023232460022, "learning_rate": 4.515630046505575e-05, "loss": 2.1107, "step": 270 }, { "epoch": 1.0261194029850746, "grad_norm": 1.0588074922561646, "learning_rate": 4.498155400447107e-05, "loss": 2.0963, "step": 275 }, { "epoch": 1.044776119402985, "grad_norm": 1.0709750652313232, "learning_rate": 4.480406183527823e-05, "loss": 2.0359, "step": 280 }, { "epoch": 1.0634328358208955, "grad_norm": 1.2172249555587769, "learning_rate": 4.462384834706058e-05, "loss": 2.1083, "step": 285 }, { "epoch": 1.0820895522388059, "grad_norm": 1.1719626188278198, "learning_rate": 4.4440938303343804e-05, "loss": 2.1259, "step": 290 }, { "epoch": 1.1007462686567164, "grad_norm": 1.051269292831421, "learning_rate": 4.425535683819312e-05, "loss": 2.0901, "step": 295 }, { "epoch": 1.1194029850746268, "grad_norm": 1.3167760372161865, "learning_rate": 4.406712945275955e-05, "loss": 2.0032, "step": 300 }, { "epoch": 1.1380597014925373, "grad_norm": 1.2565367221832275, "learning_rate": 4.387628201177577e-05, "loss": 2.0148, "step": 305 }, { "epoch": 1.1567164179104479, "grad_norm": 1.1141688823699951, "learning_rate": 4.368284074000193e-05, "loss": 2.0217, "step": 310 }, { "epoch": 1.1753731343283582, "grad_norm": 1.1642612218856812, "learning_rate": 4.348683221862212e-05, "loss": 2.0194, "step": 315 }, { "epoch": 1.1940298507462686, "grad_norm": 1.1613104343414307, "learning_rate": 4.328828338159173e-05, "loss": 1.9371, "step": 320 }, { "epoch": 1.212686567164179, "grad_norm": 1.2319557666778564, "learning_rate": 4.3087221511936434e-05, "loss": 2.0227, "step": 325 }, { "epoch": 1.2313432835820897, "grad_norm": 1.2520420551300049, "learning_rate": 4.288367423800319e-05, "loss": 1.9883, "step": 330 }, { "epoch": 1.25, "grad_norm": 1.0452089309692383, "learning_rate": 4.267766952966369e-05, "loss": 1.9912, "step": 335 }, { "epoch": 1.2686567164179103, "grad_norm": 0.9965611100196838, "learning_rate": 4.2469235694471043e-05, "loss": 1.983, "step": 340 }, { "epoch": 1.287313432835821, "grad_norm": 1.0808607339859009, "learning_rate": 4.225840137376993e-05, "loss": 1.9514, "step": 345 }, { "epoch": 1.3059701492537314, "grad_norm": 1.102575659751892, "learning_rate": 4.204519553876095e-05, "loss": 2.0286, "step": 350 }, { "epoch": 1.3246268656716418, "grad_norm": 1.0246608257293701, "learning_rate": 4.1829647486519596e-05, "loss": 2.0265, "step": 355 }, { "epoch": 1.3432835820895521, "grad_norm": 1.0723367929458618, "learning_rate": 4.161178683597054e-05, "loss": 2.0077, "step": 360 }, { "epoch": 1.3619402985074627, "grad_norm": 1.4298617839813232, "learning_rate": 4.139164352381758e-05, "loss": 2.0898, "step": 365 }, { "epoch": 1.3805970149253732, "grad_norm": 1.1437115669250488, "learning_rate": 4.116924780042997e-05, "loss": 2.024, "step": 370 }, { "epoch": 1.3992537313432836, "grad_norm": 1.326556921005249, "learning_rate": 4.094463022568569e-05, "loss": 2.2252, "step": 375 }, { "epoch": 1.417910447761194, "grad_norm": 1.2549344301223755, "learning_rate": 4.071782166477213e-05, "loss": 1.9777, "step": 380 }, { "epoch": 1.4365671641791045, "grad_norm": 1.1226497888565063, "learning_rate": 4.0488853283944806e-05, "loss": 2.0062, "step": 385 }, { "epoch": 1.455223880597015, "grad_norm": 1.2250981330871582, "learning_rate": 4.0257756546244804e-05, "loss": 1.9147, "step": 390 }, { "epoch": 1.4738805970149254, "grad_norm": 1.3552589416503906, "learning_rate": 4.0024563207175316e-05, "loss": 1.9709, "step": 395 }, { "epoch": 1.4925373134328357, "grad_norm": 1.3661599159240723, "learning_rate": 3.978930531033807e-05, "loss": 1.9748, "step": 400 } ], "logging_steps": 5, "max_steps": 1340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.162472574151557e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }