{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1194029850746268, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018656716417910446, "grad_norm": 2.403158187866211, "learning_rate": 4.9998282347929784e-05, "loss": 3.3875, "step": 5 }, { "epoch": 0.03731343283582089, "grad_norm": 2.301710367202759, "learning_rate": 4.99931296277454e-05, "loss": 2.9015, "step": 10 }, { "epoch": 0.055970149253731345, "grad_norm": 1.271048665046692, "learning_rate": 4.998454254749331e-05, "loss": 2.6229, "step": 15 }, { "epoch": 0.07462686567164178, "grad_norm": 1.069893717765808, "learning_rate": 4.997252228714279e-05, "loss": 2.3704, "step": 20 }, { "epoch": 0.09328358208955224, "grad_norm": 0.9044906497001648, "learning_rate": 4.9957070498423854e-05, "loss": 2.3782, "step": 25 }, { "epoch": 0.11194029850746269, "grad_norm": 0.9635376334190369, "learning_rate": 4.993818930460026e-05, "loss": 2.3576, "step": 30 }, { "epoch": 0.13059701492537312, "grad_norm": 0.8513979315757751, "learning_rate": 4.9915881300177725e-05, "loss": 2.4603, "step": 35 }, { "epoch": 0.14925373134328357, "grad_norm": 0.845267117023468, "learning_rate": 4.9890149550547454e-05, "loss": 2.2033, "step": 40 }, { "epoch": 0.16791044776119404, "grad_norm": 0.6632418036460876, "learning_rate": 4.98609975915649e-05, "loss": 2.1851, "step": 45 }, { "epoch": 0.1865671641791045, "grad_norm": 0.6857479810714722, "learning_rate": 4.982842942906386e-05, "loss": 2.3592, "step": 50 }, { "epoch": 0.20522388059701493, "grad_norm": 0.7204287648200989, "learning_rate": 4.979244953830608e-05, "loss": 2.1323, "step": 55 }, { "epoch": 0.22388059701492538, "grad_norm": 0.6864420175552368, "learning_rate": 4.9753062863366276e-05, "loss": 2.2138, "step": 60 }, { "epoch": 0.24253731343283583, "grad_norm": 0.7536088228225708, "learning_rate": 4.971027481645274e-05, "loss": 2.2584, "step": 65 }, { "epoch": 0.26119402985074625, "grad_norm": 0.9708526134490967, "learning_rate": 4.966409127716367e-05, "loss": 2.2669, "step": 70 }, { "epoch": 0.2798507462686567, "grad_norm": 0.7516190409660339, "learning_rate": 4.96145185916792e-05, "loss": 2.2133, "step": 75 }, { "epoch": 0.29850746268656714, "grad_norm": 0.7864778637886047, "learning_rate": 4.95615635718894e-05, "loss": 2.1683, "step": 80 }, { "epoch": 0.31716417910447764, "grad_norm": 0.7846741080284119, "learning_rate": 4.950523349445824e-05, "loss": 2.1274, "step": 85 }, { "epoch": 0.3358208955223881, "grad_norm": 0.816838800907135, "learning_rate": 4.944553609982363e-05, "loss": 2.2033, "step": 90 }, { "epoch": 0.35447761194029853, "grad_norm": 0.7661916017532349, "learning_rate": 4.938247959113386e-05, "loss": 2.1492, "step": 95 }, { "epoch": 0.373134328358209, "grad_norm": 0.8964986205101013, "learning_rate": 4.931607263312032e-05, "loss": 2.0862, "step": 100 }, { "epoch": 0.3917910447761194, "grad_norm": 0.8603547215461731, "learning_rate": 4.924632435090696e-05, "loss": 2.1444, "step": 105 }, { "epoch": 0.41044776119402987, "grad_norm": 0.8611045479774475, "learning_rate": 4.917324432875627e-05, "loss": 2.1202, "step": 110 }, { "epoch": 0.4291044776119403, "grad_norm": 0.9499636888504028, "learning_rate": 4.909684260875235e-05, "loss": 2.1285, "step": 115 }, { "epoch": 0.44776119402985076, "grad_norm": 0.8490393161773682, "learning_rate": 4.9017129689421e-05, "loss": 2.236, "step": 120 }, { "epoch": 0.4664179104477612, "grad_norm": 0.9628555178642273, "learning_rate": 4.893411652428712e-05, "loss": 2.1219, "step": 125 }, { "epoch": 0.48507462686567165, "grad_norm": 1.1119599342346191, "learning_rate": 4.8847814520369475e-05, "loss": 2.2537, "step": 130 }, { "epoch": 0.503731343283582, "grad_norm": 0.9489665627479553, "learning_rate": 4.875823553661334e-05, "loss": 2.1018, "step": 135 }, { "epoch": 0.5223880597014925, "grad_norm": 0.9434083700180054, "learning_rate": 4.8665391882260856e-05, "loss": 2.0809, "step": 140 }, { "epoch": 0.5410447761194029, "grad_norm": 0.8856557607650757, "learning_rate": 4.856929631515964e-05, "loss": 2.0807, "step": 145 }, { "epoch": 0.5597014925373134, "grad_norm": 0.8770031929016113, "learning_rate": 4.846996204000967e-05, "loss": 2.0843, "step": 150 }, { "epoch": 0.5783582089552238, "grad_norm": 0.8374930620193481, "learning_rate": 4.8367402706548805e-05, "loss": 2.1869, "step": 155 }, { "epoch": 0.5970149253731343, "grad_norm": 1.0829132795333862, "learning_rate": 4.8261632407677174e-05, "loss": 2.028, "step": 160 }, { "epoch": 0.6156716417910447, "grad_norm": 0.9735206365585327, "learning_rate": 4.815266567752059e-05, "loss": 2.0966, "step": 165 }, { "epoch": 0.6343283582089553, "grad_norm": 1.087944746017456, "learning_rate": 4.804051748943343e-05, "loss": 2.0863, "step": 170 }, { "epoch": 0.6529850746268657, "grad_norm": 0.8176729083061218, "learning_rate": 4.792520325394111e-05, "loss": 2.1135, "step": 175 }, { "epoch": 0.6716417910447762, "grad_norm": 0.9173070788383484, "learning_rate": 4.780673881662242e-05, "loss": 2.0564, "step": 180 }, { "epoch": 0.6902985074626866, "grad_norm": 0.9463202953338623, "learning_rate": 4.7685140455932267e-05, "loss": 2.1579, "step": 185 }, { "epoch": 0.7089552238805971, "grad_norm": 1.149950385093689, "learning_rate": 4.756042488096471e-05, "loss": 2.1447, "step": 190 }, { "epoch": 0.7276119402985075, "grad_norm": 0.940965473651886, "learning_rate": 4.743260922915701e-05, "loss": 2.0823, "step": 195 }, { "epoch": 0.746268656716418, "grad_norm": 0.9384671449661255, "learning_rate": 4.730171106393466e-05, "loss": 2.1445, "step": 200 }, { "epoch": 0.7649253731343284, "grad_norm": 0.8937250971794128, "learning_rate": 4.716774837229804e-05, "loss": 2.014, "step": 205 }, { "epoch": 0.7835820895522388, "grad_norm": 0.8928058743476868, "learning_rate": 4.7030739562350713e-05, "loss": 2.1882, "step": 210 }, { "epoch": 0.8022388059701493, "grad_norm": 1.0239906311035156, "learning_rate": 4.6890703460769955e-05, "loss": 2.1042, "step": 215 }, { "epoch": 0.8208955223880597, "grad_norm": 1.0555064678192139, "learning_rate": 4.674765931021976e-05, "loss": 2.015, "step": 220 }, { "epoch": 0.8395522388059702, "grad_norm": 1.084709882736206, "learning_rate": 4.6601626766706626e-05, "loss": 2.0603, "step": 225 }, { "epoch": 0.8582089552238806, "grad_norm": 0.9265861511230469, "learning_rate": 4.645262589687861e-05, "loss": 2.1006, "step": 230 }, { "epoch": 0.8768656716417911, "grad_norm": 1.0058296918869019, "learning_rate": 4.6300677175267914e-05, "loss": 2.063, "step": 235 }, { "epoch": 0.8955223880597015, "grad_norm": 1.0766576528549194, "learning_rate": 4.614580148147744e-05, "loss": 2.0781, "step": 240 }, { "epoch": 0.914179104477612, "grad_norm": 1.0215730667114258, "learning_rate": 4.598802009731167e-05, "loss": 2.1774, "step": 245 }, { "epoch": 0.9328358208955224, "grad_norm": 0.9870419502258301, "learning_rate": 4.582735470385229e-05, "loss": 1.9636, "step": 250 }, { "epoch": 0.9514925373134329, "grad_norm": 1.1921675205230713, "learning_rate": 4.5663827378478975e-05, "loss": 2.0141, "step": 255 }, { "epoch": 0.9701492537313433, "grad_norm": 1.0618964433670044, "learning_rate": 4.5497460591835615e-05, "loss": 2.0508, "step": 260 }, { "epoch": 0.9888059701492538, "grad_norm": 0.9723111391067505, "learning_rate": 4.532827720474268e-05, "loss": 2.0312, "step": 265 }, { "epoch": 1.007462686567164, "grad_norm": 0.9339023232460022, "learning_rate": 4.515630046505575e-05, "loss": 2.1107, "step": 270 }, { "epoch": 1.0261194029850746, "grad_norm": 1.0588074922561646, "learning_rate": 4.498155400447107e-05, "loss": 2.0963, "step": 275 }, { "epoch": 1.044776119402985, "grad_norm": 1.0709750652313232, "learning_rate": 4.480406183527823e-05, "loss": 2.0359, "step": 280 }, { "epoch": 1.0634328358208955, "grad_norm": 1.2172249555587769, "learning_rate": 4.462384834706058e-05, "loss": 2.1083, "step": 285 }, { "epoch": 1.0820895522388059, "grad_norm": 1.1719626188278198, "learning_rate": 4.4440938303343804e-05, "loss": 2.1259, "step": 290 }, { "epoch": 1.1007462686567164, "grad_norm": 1.051269292831421, "learning_rate": 4.425535683819312e-05, "loss": 2.0901, "step": 295 }, { "epoch": 1.1194029850746268, "grad_norm": 1.3167760372161865, "learning_rate": 4.406712945275955e-05, "loss": 2.0032, "step": 300 } ], "logging_steps": 5, "max_steps": 1340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.122292814015365e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }