{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2057062393817929, "eval_steps": 2, "global_step": 30240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.0625, "learning_rate": 4.896848787123102e-07, "loss": 1.9129, "step": 144 }, { "epoch": 0.0, "grad_norm": 3.609375, "learning_rate": 9.793697574246203e-07, "loss": 1.9063, "step": 288 }, { "epoch": 0.0, "grad_norm": 3.734375, "learning_rate": 1.4690546361369305e-06, "loss": 1.8962, "step": 432 }, { "epoch": 0.0, "grad_norm": 3.71875, "learning_rate": 1.9587395148492407e-06, "loss": 1.8796, "step": 576 }, { "epoch": 0.0, "grad_norm": 6.375, "learning_rate": 2.4484243935615507e-06, "loss": 1.9003, "step": 720 }, { "epoch": 0.01, "grad_norm": 3.3125, "learning_rate": 2.938109272273861e-06, "loss": 1.8795, "step": 864 }, { "epoch": 0.01, "grad_norm": 3.953125, "learning_rate": 3.427794150986171e-06, "loss": 1.8969, "step": 1008 }, { "epoch": 0.01, "grad_norm": 3.375, "learning_rate": 3.917479029698481e-06, "loss": 1.8597, "step": 1152 }, { "epoch": 0.01, "grad_norm": 3.203125, "learning_rate": 4.4071639084107905e-06, "loss": 1.8685, "step": 1296 }, { "epoch": 0.01, "grad_norm": 4.0625, "learning_rate": 4.896848787123101e-06, "loss": 1.867, "step": 1440 }, { "epoch": 0.01, "grad_norm": 3.203125, "learning_rate": 5.386533665835411e-06, "loss": 1.8622, "step": 1584 }, { "epoch": 0.01, "grad_norm": 3.109375, "learning_rate": 5.876218544547722e-06, "loss": 1.8697, "step": 1728 }, { "epoch": 0.01, "grad_norm": 3.453125, "learning_rate": 6.365903423260031e-06, "loss": 1.873, "step": 1872 }, { "epoch": 0.01, "grad_norm": 3.4375, "learning_rate": 6.855588301972342e-06, "loss": 1.8698, "step": 2016 }, { "epoch": 0.01, "grad_norm": 3.125, "learning_rate": 7.345273180684652e-06, "loss": 1.8587, "step": 2160 }, { "epoch": 0.02, "grad_norm": 3.21875, "learning_rate": 7.834958059396963e-06, "loss": 1.8685, "step": 2304 }, { "epoch": 0.02, "grad_norm": 2.921875, "learning_rate": 8.324642938109274e-06, "loss": 1.8771, "step": 2448 }, { "epoch": 0.02, "grad_norm": 3.1875, "learning_rate": 8.814327816821581e-06, "loss": 1.8423, "step": 2592 }, { "epoch": 0.02, "grad_norm": 3.078125, "learning_rate": 9.304012695533892e-06, "loss": 1.8542, "step": 2736 }, { "epoch": 0.02, "grad_norm": 3.015625, "learning_rate": 9.793697574246203e-06, "loss": 1.8577, "step": 2880 }, { "epoch": 0.02, "grad_norm": 2.859375, "learning_rate": 1.0283382452958513e-05, "loss": 1.8372, "step": 3024 }, { "epoch": 0.02, "grad_norm": 3.28125, "learning_rate": 1.0773067331670823e-05, "loss": 1.8332, "step": 3168 }, { "epoch": 0.02, "grad_norm": 3.09375, "learning_rate": 1.1262752210383133e-05, "loss": 1.8618, "step": 3312 }, { "epoch": 0.02, "grad_norm": 3.0, "learning_rate": 1.1752437089095444e-05, "loss": 1.8525, "step": 3456 }, { "epoch": 0.02, "grad_norm": 3.0625, "learning_rate": 1.2242121967807753e-05, "loss": 1.8357, "step": 3600 }, { "epoch": 0.03, "grad_norm": 2.96875, "learning_rate": 1.2731806846520062e-05, "loss": 1.8269, "step": 3744 }, { "epoch": 0.03, "grad_norm": 2.875, "learning_rate": 1.3221491725232373e-05, "loss": 1.8274, "step": 3888 }, { "epoch": 0.03, "grad_norm": 3.03125, "learning_rate": 1.3711176603944684e-05, "loss": 1.8343, "step": 4032 }, { "epoch": 0.03, "grad_norm": 3.234375, "learning_rate": 1.4200861482656995e-05, "loss": 1.8468, "step": 4176 }, { "epoch": 0.03, "grad_norm": 3.0625, "learning_rate": 1.4690546361369304e-05, "loss": 1.8347, "step": 4320 }, { "epoch": 0.03, "grad_norm": 2.875, "learning_rate": 1.4999994886948122e-05, "loss": 1.8314, "step": 4464 }, { "epoch": 0.03, "grad_norm": 2.640625, "learning_rate": 1.499992935844023e-05, "loss": 1.832, "step": 4608 }, { "epoch": 0.03, "grad_norm": 2.78125, "learning_rate": 1.4999788341738499e-05, "loss": 1.8244, "step": 4752 }, { "epoch": 0.03, "grad_norm": 2.75, "learning_rate": 1.4999571838262296e-05, "loss": 1.8268, "step": 4896 }, { "epoch": 0.03, "grad_norm": 2.75, "learning_rate": 1.4999279850190762e-05, "loss": 1.8101, "step": 5040 }, { "epoch": 0.04, "grad_norm": 2.625, "learning_rate": 1.4998912380462815e-05, "loss": 1.8298, "step": 5184 }, { "epoch": 0.04, "grad_norm": 2.71875, "learning_rate": 1.4998469432777108e-05, "loss": 1.8275, "step": 5328 }, { "epoch": 0.04, "grad_norm": 2.6875, "learning_rate": 1.4997951011591987e-05, "loss": 1.8109, "step": 5472 }, { "epoch": 0.04, "grad_norm": 2.671875, "learning_rate": 1.4997357122125465e-05, "loss": 1.825, "step": 5616 }, { "epoch": 0.04, "grad_norm": 2.796875, "learning_rate": 1.4996687770355145e-05, "loss": 1.8322, "step": 5760 }, { "epoch": 0.04, "grad_norm": 2.640625, "learning_rate": 1.4995942963018183e-05, "loss": 1.8085, "step": 5904 }, { "epoch": 0.04, "grad_norm": 2.53125, "learning_rate": 1.49951227076112e-05, "loss": 1.819, "step": 6048 }, { "epoch": 0.04, "grad_norm": 2.609375, "learning_rate": 1.4994227012390224e-05, "loss": 1.7942, "step": 6192 }, { "epoch": 0.04, "grad_norm": 2.84375, "learning_rate": 1.4993255886370593e-05, "loss": 1.7975, "step": 6336 }, { "epoch": 0.04, "grad_norm": 3.4375, "learning_rate": 1.4992209339326872e-05, "loss": 1.8219, "step": 6480 }, { "epoch": 0.05, "grad_norm": 2.734375, "learning_rate": 1.4991087381792756e-05, "loss": 1.8119, "step": 6624 }, { "epoch": 0.05, "grad_norm": 2.421875, "learning_rate": 1.4989890025060953e-05, "loss": 1.815, "step": 6768 }, { "epoch": 0.05, "grad_norm": 2.546875, "learning_rate": 1.4988617281183087e-05, "loss": 1.7912, "step": 6912 }, { "epoch": 0.05, "grad_norm": 2.421875, "learning_rate": 1.4987269162969559e-05, "loss": 1.8018, "step": 7056 }, { "epoch": 0.05, "grad_norm": 2.59375, "learning_rate": 1.4985845683989435e-05, "loss": 1.7973, "step": 7200 }, { "epoch": 0.05, "grad_norm": 2.84375, "learning_rate": 1.4984346858570297e-05, "loss": 1.8109, "step": 7344 }, { "epoch": 0.05, "grad_norm": 2.609375, "learning_rate": 1.4982772701798105e-05, "loss": 1.8162, "step": 7488 }, { "epoch": 0.05, "grad_norm": 2.59375, "learning_rate": 1.4981123229517039e-05, "loss": 1.7892, "step": 7632 }, { "epoch": 0.05, "grad_norm": 2.6875, "learning_rate": 1.4979398458329347e-05, "loss": 1.7908, "step": 7776 }, { "epoch": 0.05, "grad_norm": 2.578125, "learning_rate": 1.4977598405595177e-05, "loss": 1.7722, "step": 7920 }, { "epoch": 0.05, "grad_norm": 2.59375, "learning_rate": 1.4975723089432394e-05, "loss": 1.7826, "step": 8064 }, { "epoch": 0.06, "grad_norm": 2.515625, "learning_rate": 1.4973772528716404e-05, "loss": 1.8005, "step": 8208 }, { "epoch": 0.06, "grad_norm": 2.75, "learning_rate": 1.4971746743079972e-05, "loss": 1.7911, "step": 8352 }, { "epoch": 0.06, "grad_norm": 2.75, "learning_rate": 1.4969645752913006e-05, "loss": 1.7798, "step": 8496 }, { "epoch": 0.06, "grad_norm": 2.734375, "learning_rate": 1.4967469579362364e-05, "loss": 1.7806, "step": 8640 }, { "epoch": 0.06, "grad_norm": 3.03125, "learning_rate": 1.4965218244331647e-05, "loss": 1.7927, "step": 8784 }, { "epoch": 0.06, "grad_norm": 2.671875, "learning_rate": 1.4962891770480958e-05, "loss": 1.7869, "step": 8928 }, { "epoch": 0.06, "grad_norm": 2.734375, "learning_rate": 1.4960490181226693e-05, "loss": 1.8041, "step": 9072 }, { "epoch": 0.06, "grad_norm": 2.5, "learning_rate": 1.49580135007413e-05, "loss": 1.7747, "step": 9216 }, { "epoch": 0.06, "grad_norm": 2.75, "learning_rate": 1.495546175395303e-05, "loss": 1.7883, "step": 9360 }, { "epoch": 0.06, "grad_norm": 2.640625, "learning_rate": 1.495283496654569e-05, "loss": 1.7776, "step": 9504 }, { "epoch": 0.07, "grad_norm": 2.671875, "learning_rate": 1.4950133164958392e-05, "loss": 1.7834, "step": 9648 }, { "epoch": 0.07, "grad_norm": 2.8125, "learning_rate": 1.4947356376385275e-05, "loss": 1.7711, "step": 9792 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 1.4944504628775233e-05, "loss": 1.7704, "step": 9936 }, { "epoch": 0.07, "grad_norm": 2.546875, "learning_rate": 1.4941577950831642e-05, "loss": 1.774, "step": 10080 }, { "epoch": 0.07, "grad_norm": 2.953125, "learning_rate": 1.4938576372012062e-05, "loss": 1.7621, "step": 10224 }, { "epoch": 0.07, "grad_norm": 2.515625, "learning_rate": 1.4935499922527949e-05, "loss": 1.7776, "step": 10368 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 1.4932348633344338e-05, "loss": 1.7676, "step": 10512 }, { "epoch": 0.07, "grad_norm": 2.671875, "learning_rate": 1.492912253617955e-05, "loss": 1.7807, "step": 10656 }, { "epoch": 0.07, "grad_norm": 2.515625, "learning_rate": 1.4925821663504856e-05, "loss": 1.7738, "step": 10800 }, { "epoch": 0.07, "grad_norm": 2.625, "learning_rate": 1.4922446048544157e-05, "loss": 1.7791, "step": 10944 }, { "epoch": 0.08, "grad_norm": 2.515625, "learning_rate": 1.4918995725273649e-05, "loss": 1.7673, "step": 11088 }, { "epoch": 0.08, "grad_norm": 2.796875, "learning_rate": 1.4915470728421481e-05, "loss": 1.7691, "step": 11232 }, { "epoch": 0.08, "grad_norm": 2.953125, "learning_rate": 1.4911871093467414e-05, "loss": 1.7634, "step": 11376 }, { "epoch": 0.08, "grad_norm": 2.578125, "learning_rate": 1.4908196856642441e-05, "loss": 1.7649, "step": 11520 }, { "epoch": 0.08, "grad_norm": 2.765625, "learning_rate": 1.4904448054928444e-05, "loss": 1.7783, "step": 11664 }, { "epoch": 0.08, "grad_norm": 2.71875, "learning_rate": 1.4900624726057816e-05, "loss": 1.7363, "step": 11808 }, { "epoch": 0.08, "grad_norm": 2.609375, "learning_rate": 1.489672690851308e-05, "loss": 1.7565, "step": 11952 }, { "epoch": 0.08, "grad_norm": 2.578125, "learning_rate": 1.4892754641526498e-05, "loss": 1.7488, "step": 12096 }, { "epoch": 0.08, "grad_norm": 2.40625, "learning_rate": 1.4888707965079682e-05, "loss": 1.7523, "step": 12240 }, { "epoch": 0.08, "grad_norm": 2.703125, "learning_rate": 1.4884586919903186e-05, "loss": 1.7485, "step": 12384 }, { "epoch": 0.09, "grad_norm": 2.40625, "learning_rate": 1.4880391547476106e-05, "loss": 1.7572, "step": 12528 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 1.4876121890025647e-05, "loss": 1.7678, "step": 12672 }, { "epoch": 0.09, "grad_norm": 2.359375, "learning_rate": 1.4871777990526713e-05, "loss": 1.7655, "step": 12816 }, { "epoch": 0.09, "grad_norm": 2.515625, "learning_rate": 1.4867359892701466e-05, "loss": 1.7677, "step": 12960 }, { "epoch": 0.09, "grad_norm": 2.609375, "learning_rate": 1.4862867641018887e-05, "loss": 1.7575, "step": 13104 }, { "epoch": 0.09, "grad_norm": 2.828125, "learning_rate": 1.4858301280694332e-05, "loss": 1.7721, "step": 13248 }, { "epoch": 0.09, "grad_norm": 2.6875, "learning_rate": 1.4853660857689073e-05, "loss": 1.7307, "step": 13392 }, { "epoch": 0.09, "grad_norm": 2.375, "learning_rate": 1.4848946418709838e-05, "loss": 1.7483, "step": 13536 }, { "epoch": 0.09, "grad_norm": 2.5625, "learning_rate": 1.4844158011208336e-05, "loss": 1.7484, "step": 13680 }, { "epoch": 0.09, "grad_norm": 2.375, "learning_rate": 1.4839295683380785e-05, "loss": 1.7345, "step": 13824 }, { "epoch": 0.1, "grad_norm": 2.515625, "learning_rate": 1.4834359484167429e-05, "loss": 1.7413, "step": 13968 }, { "epoch": 0.1, "grad_norm": 2.6875, "learning_rate": 1.4829349463252034e-05, "loss": 1.7516, "step": 14112 }, { "epoch": 0.1, "grad_norm": 2.96875, "learning_rate": 1.48242656710614e-05, "loss": 1.7295, "step": 14256 }, { "epoch": 0.1, "grad_norm": 3.328125, "learning_rate": 1.4819108158764847e-05, "loss": 1.7659, "step": 14400 }, { "epoch": 0.1, "grad_norm": 2.546875, "learning_rate": 1.4813876978273708e-05, "loss": 1.7535, "step": 14544 }, { "epoch": 0.1, "grad_norm": 2.484375, "learning_rate": 1.4808572182240786e-05, "loss": 1.7379, "step": 14688 }, { "epoch": 0.1, "grad_norm": 2.515625, "learning_rate": 1.4803193824059852e-05, "loss": 1.7487, "step": 14832 }, { "epoch": 0.1, "grad_norm": 2.5625, "learning_rate": 1.4797741957865087e-05, "loss": 1.7694, "step": 14976 }, { "epoch": 0.1, "grad_norm": 2.484375, "learning_rate": 1.4792216638530545e-05, "loss": 1.7527, "step": 15120 }, { "epoch": 0.1, "grad_norm": 2.4375, "learning_rate": 1.4786617921669603e-05, "loss": 1.7529, "step": 15264 }, { "epoch": 0.1, "grad_norm": 2.453125, "learning_rate": 1.478094586363439e-05, "loss": 1.7451, "step": 15408 }, { "epoch": 0.11, "grad_norm": 2.453125, "learning_rate": 1.4775200521515237e-05, "loss": 1.7626, "step": 15552 }, { "epoch": 0.11, "grad_norm": 2.453125, "learning_rate": 1.4769381953140084e-05, "loss": 1.7435, "step": 15696 }, { "epoch": 0.11, "grad_norm": 2.53125, "learning_rate": 1.4763490217073917e-05, "loss": 1.7327, "step": 15840 }, { "epoch": 0.11, "grad_norm": 2.453125, "learning_rate": 1.475752537261816e-05, "loss": 1.7187, "step": 15984 }, { "epoch": 0.11, "grad_norm": 2.65625, "learning_rate": 1.475148747981009e-05, "loss": 1.7366, "step": 16128 }, { "epoch": 0.11, "grad_norm": 2.46875, "learning_rate": 1.4745376599422232e-05, "loss": 1.7298, "step": 16272 }, { "epoch": 0.11, "grad_norm": 2.578125, "learning_rate": 1.4739192792961736e-05, "loss": 1.7565, "step": 16416 }, { "epoch": 0.11, "grad_norm": 2.5, "learning_rate": 1.4732936122669777e-05, "loss": 1.7442, "step": 16560 }, { "epoch": 0.11, "grad_norm": 2.609375, "learning_rate": 1.4726606651520911e-05, "loss": 1.7295, "step": 16704 }, { "epoch": 0.11, "grad_norm": 2.46875, "learning_rate": 1.4720204443222452e-05, "loss": 1.7239, "step": 16848 }, { "epoch": 0.12, "grad_norm": 2.53125, "learning_rate": 1.4713729562213825e-05, "loss": 1.75, "step": 16992 }, { "epoch": 0.12, "grad_norm": 2.59375, "learning_rate": 1.4707182073665921e-05, "loss": 1.7387, "step": 17136 }, { "epoch": 0.12, "grad_norm": 2.375, "learning_rate": 1.470056204348044e-05, "loss": 1.7465, "step": 17280 }, { "epoch": 0.12, "grad_norm": 2.609375, "learning_rate": 1.4693869538289221e-05, "loss": 1.741, "step": 17424 }, { "epoch": 0.12, "grad_norm": 2.484375, "learning_rate": 1.4687104625453587e-05, "loss": 1.7203, "step": 17568 }, { "epoch": 0.12, "grad_norm": 2.734375, "learning_rate": 1.4680267373063651e-05, "loss": 1.7498, "step": 17712 }, { "epoch": 0.12, "grad_norm": 2.46875, "learning_rate": 1.467335784993764e-05, "loss": 1.7519, "step": 17856 }, { "epoch": 0.12, "grad_norm": 2.765625, "learning_rate": 1.4666376125621198e-05, "loss": 1.7386, "step": 18000 }, { "epoch": 0.12, "grad_norm": 2.5625, "learning_rate": 1.4659322270386687e-05, "loss": 1.7228, "step": 18144 }, { "epoch": 0.12, "grad_norm": 2.671875, "learning_rate": 1.4652196355232481e-05, "loss": 1.723, "step": 18288 }, { "epoch": 0.13, "grad_norm": 2.546875, "learning_rate": 1.4644998451882254e-05, "loss": 1.7334, "step": 18432 }, { "epoch": 0.13, "grad_norm": 2.5625, "learning_rate": 1.463772863278425e-05, "loss": 1.7269, "step": 18576 }, { "epoch": 0.13, "grad_norm": 2.515625, "learning_rate": 1.4630386971110556e-05, "loss": 1.7646, "step": 18720 }, { "epoch": 0.13, "grad_norm": 2.46875, "learning_rate": 1.4622973540756377e-05, "loss": 1.7391, "step": 18864 }, { "epoch": 0.13, "grad_norm": 3.125, "learning_rate": 1.4615488416339277e-05, "loss": 1.7336, "step": 19008 }, { "epoch": 0.13, "grad_norm": 2.5, "learning_rate": 1.4607931673198435e-05, "loss": 1.7431, "step": 19152 }, { "epoch": 0.13, "grad_norm": 2.734375, "learning_rate": 1.4600303387393886e-05, "loss": 1.7331, "step": 19296 }, { "epoch": 0.13, "grad_norm": 2.828125, "learning_rate": 1.4592603635705753e-05, "loss": 1.7113, "step": 19440 }, { "epoch": 0.13, "grad_norm": 2.8125, "learning_rate": 1.4584832495633486e-05, "loss": 1.7154, "step": 19584 }, { "epoch": 0.13, "grad_norm": 2.578125, "learning_rate": 1.4576990045395056e-05, "loss": 1.7258, "step": 19728 }, { "epoch": 0.14, "grad_norm": 2.390625, "learning_rate": 1.4569076363926202e-05, "loss": 1.7353, "step": 19872 }, { "epoch": 0.14, "grad_norm": 2.59375, "learning_rate": 1.4561091530879606e-05, "loss": 1.7143, "step": 20016 }, { "epoch": 0.14, "grad_norm": 2.671875, "learning_rate": 1.455303562662411e-05, "loss": 1.7556, "step": 20160 }, { "epoch": 0.14, "grad_norm": 2.578125, "learning_rate": 1.4544908732243898e-05, "loss": 1.7028, "step": 20304 }, { "epoch": 0.14, "grad_norm": 2.75, "learning_rate": 1.4536710929537685e-05, "loss": 1.7214, "step": 20448 }, { "epoch": 0.14, "grad_norm": 2.8125, "learning_rate": 1.4528442301017889e-05, "loss": 1.7206, "step": 20592 }, { "epoch": 0.14, "grad_norm": 2.4375, "learning_rate": 1.4520102929909807e-05, "loss": 1.7248, "step": 20736 }, { "epoch": 0.14, "grad_norm": 2.671875, "learning_rate": 1.4511692900150771e-05, "loss": 1.7353, "step": 20880 }, { "epoch": 0.14, "grad_norm": 2.453125, "learning_rate": 1.4503212296389307e-05, "loss": 1.7482, "step": 21024 }, { "epoch": 0.14, "grad_norm": 2.59375, "learning_rate": 1.4494661203984277e-05, "loss": 1.7368, "step": 21168 }, { "epoch": 0.14, "grad_norm": 2.703125, "learning_rate": 1.4486039709004033e-05, "loss": 1.7314, "step": 21312 }, { "epoch": 0.15, "grad_norm": 4.0, "learning_rate": 1.447734789822553e-05, "loss": 1.727, "step": 21456 }, { "epoch": 0.15, "grad_norm": 2.796875, "learning_rate": 1.4468585859133475e-05, "loss": 1.7388, "step": 21600 }, { "epoch": 0.15, "grad_norm": 2.484375, "learning_rate": 1.4459753679919435e-05, "loss": 1.7324, "step": 21744 }, { "epoch": 0.15, "grad_norm": 2.453125, "learning_rate": 1.4450851449480947e-05, "loss": 1.725, "step": 21888 }, { "epoch": 0.15, "grad_norm": 2.578125, "learning_rate": 1.4441879257420625e-05, "loss": 1.7084, "step": 22032 }, { "epoch": 0.15, "grad_norm": 2.484375, "learning_rate": 1.4432837194045265e-05, "loss": 1.7264, "step": 22176 }, { "epoch": 0.15, "grad_norm": 3.546875, "learning_rate": 1.4423725350364928e-05, "loss": 1.7102, "step": 22320 }, { "epoch": 0.15, "grad_norm": 2.5, "learning_rate": 1.441454381809203e-05, "loss": 1.7565, "step": 22464 }, { "epoch": 0.15, "grad_norm": 2.625, "learning_rate": 1.440529268964041e-05, "loss": 1.7304, "step": 22608 }, { "epoch": 0.15, "grad_norm": 2.640625, "learning_rate": 1.439597205812441e-05, "loss": 1.7348, "step": 22752 }, { "epoch": 0.16, "grad_norm": 2.5625, "learning_rate": 1.438658201735793e-05, "loss": 1.7227, "step": 22896 }, { "epoch": 0.16, "grad_norm": 2.578125, "learning_rate": 1.4377122661853493e-05, "loss": 1.6986, "step": 23040 }, { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.436759408682128e-05, "loss": 1.7251, "step": 23184 }, { "epoch": 0.16, "grad_norm": 2.53125, "learning_rate": 1.4357996388168187e-05, "loss": 1.7317, "step": 23328 }, { "epoch": 0.16, "grad_norm": 2.484375, "learning_rate": 1.4348329662496846e-05, "loss": 1.7185, "step": 23472 }, { "epoch": 0.16, "grad_norm": 2.546875, "learning_rate": 1.4338594007104664e-05, "loss": 1.7359, "step": 23616 }, { "epoch": 0.16, "grad_norm": 2.34375, "learning_rate": 1.4328789519982834e-05, "loss": 1.7192, "step": 23760 }, { "epoch": 0.16, "grad_norm": 2.609375, "learning_rate": 1.4318916299815355e-05, "loss": 1.7156, "step": 23904 }, { "epoch": 0.16, "grad_norm": 2.421875, "learning_rate": 1.4308974445978041e-05, "loss": 1.7172, "step": 24048 }, { "epoch": 0.16, "grad_norm": 2.359375, "learning_rate": 1.4298964058537512e-05, "loss": 1.7061, "step": 24192 }, { "epoch": 0.17, "grad_norm": 2.484375, "learning_rate": 1.4288885238250189e-05, "loss": 1.7198, "step": 24336 }, { "epoch": 0.17, "grad_norm": 2.421875, "learning_rate": 1.4278738086561289e-05, "loss": 1.7154, "step": 24480 }, { "epoch": 0.17, "grad_norm": 2.515625, "learning_rate": 1.4268522705603794e-05, "loss": 1.7287, "step": 24624 }, { "epoch": 0.17, "grad_norm": 2.890625, "learning_rate": 1.4258239198197428e-05, "loss": 1.6973, "step": 24768 }, { "epoch": 0.17, "grad_norm": 2.609375, "learning_rate": 1.4247887667847619e-05, "loss": 1.7118, "step": 24912 }, { "epoch": 0.17, "grad_norm": 2.484375, "learning_rate": 1.4237468218744457e-05, "loss": 1.732, "step": 25056 }, { "epoch": 0.17, "grad_norm": 2.65625, "learning_rate": 1.4226980955761649e-05, "loss": 1.7271, "step": 25200 }, { "epoch": 0.17, "grad_norm": 2.484375, "learning_rate": 1.4216425984455464e-05, "loss": 1.7145, "step": 25344 }, { "epoch": 0.17, "grad_norm": 2.359375, "learning_rate": 1.4205803411063663e-05, "loss": 1.7272, "step": 25488 }, { "epoch": 0.17, "grad_norm": 3.0, "learning_rate": 1.4195113342504438e-05, "loss": 1.6899, "step": 25632 }, { "epoch": 0.18, "grad_norm": 2.59375, "learning_rate": 1.4184355886375332e-05, "loss": 1.7089, "step": 25776 }, { "epoch": 0.18, "grad_norm": 2.4375, "learning_rate": 1.4173531150952153e-05, "loss": 1.7156, "step": 25920 }, { "epoch": 0.18, "grad_norm": 2.5625, "learning_rate": 1.4162639245187894e-05, "loss": 1.7069, "step": 26064 }, { "epoch": 0.18, "grad_norm": 2.578125, "learning_rate": 1.4151680278711626e-05, "loss": 1.7139, "step": 26208 }, { "epoch": 0.18, "grad_norm": 2.546875, "learning_rate": 1.4140654361827399e-05, "loss": 1.7375, "step": 26352 }, { "epoch": 0.18, "grad_norm": 4.8125, "learning_rate": 1.412956160551313e-05, "loss": 1.7372, "step": 26496 }, { "epoch": 0.18, "grad_norm": 2.859375, "learning_rate": 1.411840212141949e-05, "loss": 1.7045, "step": 26640 }, { "epoch": 0.18, "grad_norm": 2.3125, "learning_rate": 1.410717602186878e-05, "loss": 1.7077, "step": 26784 }, { "epoch": 0.18, "grad_norm": 2.546875, "learning_rate": 1.4095883419853792e-05, "loss": 1.7075, "step": 26928 }, { "epoch": 0.18, "grad_norm": 4.34375, "learning_rate": 1.408452442903668e-05, "loss": 1.7091, "step": 27072 }, { "epoch": 0.19, "grad_norm": 2.5, "learning_rate": 1.4073099163747816e-05, "loss": 1.7108, "step": 27216 }, { "epoch": 0.19, "grad_norm": 2.90625, "learning_rate": 1.406160773898463e-05, "loss": 1.7036, "step": 27360 }, { "epoch": 0.19, "grad_norm": 3.125, "learning_rate": 1.4050050270410474e-05, "loss": 1.7362, "step": 27504 }, { "epoch": 0.19, "grad_norm": 2.578125, "learning_rate": 1.4038426874353428e-05, "loss": 1.6984, "step": 27648 }, { "epoch": 0.19, "grad_norm": 2.515625, "learning_rate": 1.402673766780515e-05, "loss": 1.7191, "step": 27792 }, { "epoch": 0.19, "grad_norm": 3.140625, "learning_rate": 1.4014982768419697e-05, "loss": 1.7401, "step": 27936 }, { "epoch": 0.19, "grad_norm": 2.90625, "learning_rate": 1.400316229451233e-05, "loss": 1.7152, "step": 28080 }, { "epoch": 0.19, "grad_norm": 2.671875, "learning_rate": 1.3991276365058336e-05, "loss": 1.7208, "step": 28224 }, { "epoch": 0.19, "grad_norm": 2.421875, "learning_rate": 1.3979325099691819e-05, "loss": 1.707, "step": 28368 }, { "epoch": 0.19, "grad_norm": 2.734375, "learning_rate": 1.3967308618704503e-05, "loss": 1.7274, "step": 28512 }, { "epoch": 0.19, "grad_norm": 3.0, "learning_rate": 1.3955227043044524e-05, "loss": 1.7213, "step": 28656 }, { "epoch": 0.2, "grad_norm": 2.484375, "learning_rate": 1.3943080494315198e-05, "loss": 1.702, "step": 28800 }, { "epoch": 0.2, "grad_norm": 2.625, "learning_rate": 1.393086909477382e-05, "loss": 1.706, "step": 28944 }, { "epoch": 0.2, "grad_norm": 2.46875, "learning_rate": 1.3918592967330407e-05, "loss": 1.7095, "step": 29088 }, { "epoch": 0.2, "grad_norm": 2.328125, "learning_rate": 1.3906252235546486e-05, "loss": 1.6814, "step": 29232 }, { "epoch": 0.2, "grad_norm": 2.40625, "learning_rate": 1.3893847023633831e-05, "loss": 1.7225, "step": 29376 }, { "epoch": 0.2, "grad_norm": 2.671875, "learning_rate": 1.3881377456453227e-05, "loss": 1.7103, "step": 29520 }, { "epoch": 0.2, "grad_norm": 2.78125, "learning_rate": 1.3868843659513198e-05, "loss": 1.6974, "step": 29664 }, { "epoch": 0.2, "grad_norm": 2.75, "learning_rate": 1.3856245758968766e-05, "loss": 1.7212, "step": 29808 }, { "epoch": 0.2, "grad_norm": 2.484375, "learning_rate": 1.3843583881620151e-05, "loss": 1.727, "step": 29952 }, { "epoch": 0.2, "grad_norm": 2.609375, "learning_rate": 1.3830858154911528e-05, "loss": 1.7035, "step": 30096 }, { "epoch": 0.21, "grad_norm": 2.5625, "learning_rate": 1.3818068706929713e-05, "loss": 1.7041, "step": 30240 } ], "logging_steps": 144, "max_steps": 147005, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1440, "total_flos": 1.2669053109629092e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }