{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.12734195771253845, "eval_steps": 2, "global_step": 18720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.921875, "learning_rate": 4.896848787123102e-07, "loss": 1.9119, "step": 144 }, { "epoch": 0.0, "grad_norm": 3.5625, "learning_rate": 9.793697574246203e-07, "loss": 1.9115, "step": 288 }, { "epoch": 0.0, "grad_norm": 5.96875, "learning_rate": 1.4690546361369305e-06, "loss": 1.9003, "step": 432 }, { "epoch": 0.0, "grad_norm": 3.359375, "learning_rate": 1.9587395148492407e-06, "loss": 1.9034, "step": 576 }, { "epoch": 0.0, "grad_norm": 3.390625, "learning_rate": 2.4484243935615507e-06, "loss": 1.9145, "step": 720 }, { "epoch": 0.01, "grad_norm": 3.5, "learning_rate": 2.938109272273861e-06, "loss": 1.8969, "step": 864 }, { "epoch": 0.01, "grad_norm": 3.234375, "learning_rate": 3.427794150986171e-06, "loss": 1.8644, "step": 1008 }, { "epoch": 0.01, "grad_norm": 3.328125, "learning_rate": 3.917479029698481e-06, "loss": 1.8823, "step": 1152 }, { "epoch": 0.01, "grad_norm": 3.421875, "learning_rate": 4.4071639084107905e-06, "loss": 1.8635, "step": 1296 }, { "epoch": 0.01, "grad_norm": 3.15625, "learning_rate": 4.896848787123101e-06, "loss": 1.8545, "step": 1440 }, { "epoch": 0.01, "grad_norm": 3.890625, "learning_rate": 5.386533665835411e-06, "loss": 1.871, "step": 1584 }, { "epoch": 0.01, "grad_norm": 3.046875, "learning_rate": 5.876218544547722e-06, "loss": 1.8609, "step": 1728 }, { "epoch": 0.01, "grad_norm": 3.40625, "learning_rate": 6.365903423260031e-06, "loss": 1.8788, "step": 1872 }, { "epoch": 0.01, "grad_norm": 3.140625, "learning_rate": 6.855588301972342e-06, "loss": 1.8672, "step": 2016 }, { "epoch": 0.01, "grad_norm": 3.03125, "learning_rate": 7.345273180684652e-06, "loss": 1.8723, "step": 2160 }, { "epoch": 0.02, "grad_norm": 3.078125, "learning_rate": 7.834958059396963e-06, "loss": 1.879, "step": 2304 }, { "epoch": 0.02, "grad_norm": 3.359375, "learning_rate": 8.324642938109274e-06, "loss": 1.8622, "step": 2448 }, { "epoch": 0.02, "grad_norm": 3.03125, "learning_rate": 8.814327816821581e-06, "loss": 1.8533, "step": 2592 }, { "epoch": 0.02, "grad_norm": 3.125, "learning_rate": 9.304012695533892e-06, "loss": 1.8503, "step": 2736 }, { "epoch": 0.02, "grad_norm": 3.1875, "learning_rate": 9.793697574246203e-06, "loss": 1.8314, "step": 2880 }, { "epoch": 0.02, "grad_norm": 2.96875, "learning_rate": 1.0283382452958513e-05, "loss": 1.835, "step": 3024 }, { "epoch": 0.02, "grad_norm": 3.546875, "learning_rate": 1.0773067331670823e-05, "loss": 1.8526, "step": 3168 }, { "epoch": 0.02, "grad_norm": 2.96875, "learning_rate": 1.1262752210383133e-05, "loss": 1.8434, "step": 3312 }, { "epoch": 0.02, "grad_norm": 3.03125, "learning_rate": 1.1752437089095444e-05, "loss": 1.8381, "step": 3456 }, { "epoch": 0.02, "grad_norm": 2.984375, "learning_rate": 1.2242121967807753e-05, "loss": 1.848, "step": 3600 }, { "epoch": 0.03, "grad_norm": 2.828125, "learning_rate": 1.2731806846520062e-05, "loss": 1.8476, "step": 3744 }, { "epoch": 0.03, "grad_norm": 2.984375, "learning_rate": 1.3221491725232373e-05, "loss": 1.8307, "step": 3888 }, { "epoch": 0.03, "grad_norm": 2.953125, "learning_rate": 1.3711176603944684e-05, "loss": 1.8328, "step": 4032 }, { "epoch": 0.03, "grad_norm": 2.875, "learning_rate": 1.4200861482656995e-05, "loss": 1.833, "step": 4176 }, { "epoch": 0.03, "grad_norm": 2.765625, "learning_rate": 1.4690546361369304e-05, "loss": 1.8559, "step": 4320 }, { "epoch": 0.03, "grad_norm": 2.765625, "learning_rate": 1.4999994886948122e-05, "loss": 1.8258, "step": 4464 }, { "epoch": 0.03, "grad_norm": 2.765625, "learning_rate": 1.499992935844023e-05, "loss": 1.8256, "step": 4608 }, { "epoch": 0.03, "grad_norm": 3.15625, "learning_rate": 1.4999788341738499e-05, "loss": 1.8315, "step": 4752 }, { "epoch": 0.03, "grad_norm": 2.796875, "learning_rate": 1.4999571838262296e-05, "loss": 1.8583, "step": 4896 }, { "epoch": 0.03, "grad_norm": 2.71875, "learning_rate": 1.4999279850190762e-05, "loss": 1.8287, "step": 5040 }, { "epoch": 0.04, "grad_norm": 2.953125, "learning_rate": 1.4998912380462815e-05, "loss": 1.8134, "step": 5184 }, { "epoch": 0.04, "grad_norm": 2.59375, "learning_rate": 1.4998469432777108e-05, "loss": 1.8111, "step": 5328 }, { "epoch": 0.04, "grad_norm": 3.015625, "learning_rate": 1.4997951011591987e-05, "loss": 1.8199, "step": 5472 }, { "epoch": 0.04, "grad_norm": 2.6875, "learning_rate": 1.4997357122125465e-05, "loss": 1.8188, "step": 5616 }, { "epoch": 0.04, "grad_norm": 2.625, "learning_rate": 1.4996687770355145e-05, "loss": 1.8023, "step": 5760 }, { "epoch": 0.04, "grad_norm": 3.015625, "learning_rate": 1.4995942963018183e-05, "loss": 1.8131, "step": 5904 }, { "epoch": 0.04, "grad_norm": 2.71875, "learning_rate": 1.49951227076112e-05, "loss": 1.8056, "step": 6048 }, { "epoch": 0.04, "grad_norm": 2.578125, "learning_rate": 1.4994227012390224e-05, "loss": 1.8034, "step": 6192 }, { "epoch": 0.04, "grad_norm": 2.8125, "learning_rate": 1.4993255886370593e-05, "loss": 1.8221, "step": 6336 }, { "epoch": 0.04, "grad_norm": 3.0, "learning_rate": 1.4992209339326872e-05, "loss": 1.7937, "step": 6480 }, { "epoch": 0.05, "grad_norm": 2.453125, "learning_rate": 1.4991087381792756e-05, "loss": 1.7936, "step": 6624 }, { "epoch": 0.05, "grad_norm": 2.59375, "learning_rate": 1.4989890025060953e-05, "loss": 1.804, "step": 6768 }, { "epoch": 0.05, "grad_norm": 2.671875, "learning_rate": 1.4988617281183087e-05, "loss": 1.779, "step": 6912 }, { "epoch": 0.05, "grad_norm": 2.671875, "learning_rate": 1.4987269162969559e-05, "loss": 1.7907, "step": 7056 }, { "epoch": 0.05, "grad_norm": 2.65625, "learning_rate": 1.4985845683989435e-05, "loss": 1.7945, "step": 7200 }, { "epoch": 0.05, "grad_norm": 2.6875, "learning_rate": 1.4984346858570297e-05, "loss": 1.7887, "step": 7344 }, { "epoch": 0.05, "grad_norm": 2.9375, "learning_rate": 1.4982772701798105e-05, "loss": 1.7657, "step": 7488 }, { "epoch": 0.05, "grad_norm": 2.703125, "learning_rate": 1.4981123229517039e-05, "loss": 1.7914, "step": 7632 }, { "epoch": 0.05, "grad_norm": 2.546875, "learning_rate": 1.4979398458329347e-05, "loss": 1.7716, "step": 7776 }, { "epoch": 0.05, "grad_norm": 2.46875, "learning_rate": 1.4977598405595177e-05, "loss": 1.7979, "step": 7920 }, { "epoch": 0.05, "grad_norm": 2.59375, "learning_rate": 1.4975723089432394e-05, "loss": 1.7751, "step": 8064 }, { "epoch": 0.06, "grad_norm": 2.5625, "learning_rate": 1.4973772528716404e-05, "loss": 1.7868, "step": 8208 }, { "epoch": 0.06, "grad_norm": 2.78125, "learning_rate": 1.4971746743079972e-05, "loss": 1.7974, "step": 8352 }, { "epoch": 0.06, "grad_norm": 2.59375, "learning_rate": 1.4969645752913006e-05, "loss": 1.7895, "step": 8496 }, { "epoch": 0.06, "grad_norm": 2.578125, "learning_rate": 1.4967469579362364e-05, "loss": 1.7919, "step": 8640 }, { "epoch": 0.06, "grad_norm": 2.484375, "learning_rate": 1.4965218244331647e-05, "loss": 1.7856, "step": 8784 }, { "epoch": 0.06, "grad_norm": 2.578125, "learning_rate": 1.4962891770480958e-05, "loss": 1.7777, "step": 8928 }, { "epoch": 0.06, "grad_norm": 2.5, "learning_rate": 1.4960490181226693e-05, "loss": 1.7796, "step": 9072 }, { "epoch": 0.06, "grad_norm": 2.421875, "learning_rate": 1.49580135007413e-05, "loss": 1.7769, "step": 9216 }, { "epoch": 0.06, "grad_norm": 2.59375, "learning_rate": 1.495546175395303e-05, "loss": 1.7843, "step": 9360 }, { "epoch": 0.06, "grad_norm": 2.640625, "learning_rate": 1.495283496654569e-05, "loss": 1.7763, "step": 9504 }, { "epoch": 0.07, "grad_norm": 2.765625, "learning_rate": 1.4950133164958392e-05, "loss": 1.7709, "step": 9648 }, { "epoch": 0.07, "grad_norm": 2.4375, "learning_rate": 1.4947356376385275e-05, "loss": 1.781, "step": 9792 }, { "epoch": 0.07, "grad_norm": 2.5625, "learning_rate": 1.4944504628775233e-05, "loss": 1.7831, "step": 9936 }, { "epoch": 0.07, "grad_norm": 2.71875, "learning_rate": 1.4941577950831642e-05, "loss": 1.7835, "step": 10080 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 1.4938576372012062e-05, "loss": 1.7648, "step": 10224 }, { "epoch": 0.07, "grad_norm": 2.703125, "learning_rate": 1.4935499922527949e-05, "loss": 1.774, "step": 10368 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 1.4932348633344338e-05, "loss": 1.7989, "step": 10512 }, { "epoch": 0.07, "grad_norm": 2.5, "learning_rate": 1.492912253617955e-05, "loss": 1.7624, "step": 10656 }, { "epoch": 0.07, "grad_norm": 2.4375, "learning_rate": 1.4925821663504856e-05, "loss": 1.7584, "step": 10800 }, { "epoch": 0.07, "grad_norm": 2.640625, "learning_rate": 1.4922446048544157e-05, "loss": 1.7719, "step": 10944 }, { "epoch": 0.08, "grad_norm": 2.703125, "learning_rate": 1.4918995725273649e-05, "loss": 1.7416, "step": 11088 }, { "epoch": 0.08, "grad_norm": 2.59375, "learning_rate": 1.4915470728421481e-05, "loss": 1.7587, "step": 11232 }, { "epoch": 0.08, "grad_norm": 2.609375, "learning_rate": 1.4911871093467414e-05, "loss": 1.7562, "step": 11376 }, { "epoch": 0.08, "grad_norm": 2.90625, "learning_rate": 1.4908196856642441e-05, "loss": 1.7579, "step": 11520 }, { "epoch": 0.08, "grad_norm": 2.625, "learning_rate": 1.4904448054928444e-05, "loss": 1.7734, "step": 11664 }, { "epoch": 0.08, "grad_norm": 2.734375, "learning_rate": 1.4900624726057816e-05, "loss": 1.7781, "step": 11808 }, { "epoch": 0.08, "grad_norm": 2.640625, "learning_rate": 1.489672690851308e-05, "loss": 1.7576, "step": 11952 }, { "epoch": 0.08, "grad_norm": 2.65625, "learning_rate": 1.4892754641526498e-05, "loss": 1.7561, "step": 12096 }, { "epoch": 0.08, "grad_norm": 2.375, "learning_rate": 1.4888707965079682e-05, "loss": 1.7602, "step": 12240 }, { "epoch": 0.08, "grad_norm": 2.5625, "learning_rate": 1.4884586919903186e-05, "loss": 1.7676, "step": 12384 }, { "epoch": 0.09, "grad_norm": 2.515625, "learning_rate": 1.4880391547476106e-05, "loss": 1.7546, "step": 12528 }, { "epoch": 0.09, "grad_norm": 2.234375, "learning_rate": 1.4876121890025647e-05, "loss": 1.7484, "step": 12672 }, { "epoch": 0.09, "grad_norm": 3.765625, "learning_rate": 1.4871777990526713e-05, "loss": 1.7425, "step": 12816 }, { "epoch": 0.09, "grad_norm": 2.703125, "learning_rate": 1.4867359892701466e-05, "loss": 1.7495, "step": 12960 }, { "epoch": 0.09, "grad_norm": 2.578125, "learning_rate": 1.4862867641018887e-05, "loss": 1.7478, "step": 13104 }, { "epoch": 0.09, "grad_norm": 2.453125, "learning_rate": 1.4858301280694332e-05, "loss": 1.7648, "step": 13248 }, { "epoch": 0.09, "grad_norm": 2.546875, "learning_rate": 1.4853660857689073e-05, "loss": 1.7779, "step": 13392 }, { "epoch": 0.09, "grad_norm": 6.0, "learning_rate": 1.4848946418709838e-05, "loss": 1.746, "step": 13536 }, { "epoch": 0.09, "grad_norm": 11.1875, "learning_rate": 1.4844158011208336e-05, "loss": 1.7482, "step": 13680 }, { "epoch": 0.09, "grad_norm": 2.4375, "learning_rate": 1.4839295683380785e-05, "loss": 1.7614, "step": 13824 }, { "epoch": 0.1, "grad_norm": 2.625, "learning_rate": 1.4834359484167429e-05, "loss": 1.7394, "step": 13968 }, { "epoch": 0.1, "grad_norm": 2.65625, "learning_rate": 1.4829349463252034e-05, "loss": 1.7529, "step": 14112 }, { "epoch": 0.1, "grad_norm": 2.578125, "learning_rate": 1.48242656710614e-05, "loss": 1.741, "step": 14256 }, { "epoch": 0.1, "grad_norm": 2.484375, "learning_rate": 1.4819108158764847e-05, "loss": 1.7496, "step": 14400 }, { "epoch": 0.1, "grad_norm": 2.40625, "learning_rate": 1.4813876978273708e-05, "loss": 1.7537, "step": 14544 }, { "epoch": 0.1, "grad_norm": 2.46875, "learning_rate": 1.4808572182240786e-05, "loss": 1.7435, "step": 14688 }, { "epoch": 0.1, "grad_norm": 2.703125, "learning_rate": 1.4803193824059852e-05, "loss": 1.7665, "step": 14832 }, { "epoch": 0.1, "grad_norm": 2.546875, "learning_rate": 1.4797741957865087e-05, "loss": 1.7547, "step": 14976 }, { "epoch": 0.1, "grad_norm": 2.59375, "learning_rate": 1.4792216638530545e-05, "loss": 1.7356, "step": 15120 }, { "epoch": 0.1, "grad_norm": 2.703125, "learning_rate": 1.4786617921669603e-05, "loss": 1.7397, "step": 15264 }, { "epoch": 0.1, "grad_norm": 2.5625, "learning_rate": 1.478094586363439e-05, "loss": 1.7239, "step": 15408 }, { "epoch": 0.11, "grad_norm": 2.96875, "learning_rate": 1.4775200521515237e-05, "loss": 1.7428, "step": 15552 }, { "epoch": 0.11, "grad_norm": 2.765625, "learning_rate": 1.4769381953140084e-05, "loss": 1.745, "step": 15696 }, { "epoch": 0.11, "grad_norm": 2.453125, "learning_rate": 1.4763490217073917e-05, "loss": 1.7382, "step": 15840 }, { "epoch": 0.11, "grad_norm": 2.5, "learning_rate": 1.475752537261816e-05, "loss": 1.7524, "step": 15984 }, { "epoch": 0.11, "grad_norm": 2.5, "learning_rate": 1.475148747981009e-05, "loss": 1.7288, "step": 16128 }, { "epoch": 0.11, "grad_norm": 2.484375, "learning_rate": 1.4745376599422232e-05, "loss": 1.7336, "step": 16272 }, { "epoch": 0.11, "grad_norm": 2.78125, "learning_rate": 1.4739192792961736e-05, "loss": 1.7313, "step": 16416 }, { "epoch": 0.11, "grad_norm": 2.53125, "learning_rate": 1.4732936122669777e-05, "loss": 1.7537, "step": 16560 }, { "epoch": 0.11, "grad_norm": 2.671875, "learning_rate": 1.4726606651520911e-05, "loss": 1.7428, "step": 16704 }, { "epoch": 0.11, "grad_norm": 2.90625, "learning_rate": 1.4720204443222452e-05, "loss": 1.7299, "step": 16848 }, { "epoch": 0.12, "grad_norm": 2.4375, "learning_rate": 1.4713729562213825e-05, "loss": 1.7276, "step": 16992 }, { "epoch": 0.12, "grad_norm": 2.5, "learning_rate": 1.4707182073665921e-05, "loss": 1.7312, "step": 17136 }, { "epoch": 0.12, "grad_norm": 2.71875, "learning_rate": 1.470056204348044e-05, "loss": 1.7477, "step": 17280 }, { "epoch": 0.12, "grad_norm": 2.453125, "learning_rate": 1.4693869538289221e-05, "loss": 1.7296, "step": 17424 }, { "epoch": 0.12, "grad_norm": 2.890625, "learning_rate": 1.4687104625453587e-05, "loss": 1.7353, "step": 17568 }, { "epoch": 0.12, "grad_norm": 2.359375, "learning_rate": 1.4680267373063651e-05, "loss": 1.7265, "step": 17712 }, { "epoch": 0.12, "grad_norm": 2.421875, "learning_rate": 1.467335784993764e-05, "loss": 1.7406, "step": 17856 }, { "epoch": 0.12, "grad_norm": 2.828125, "learning_rate": 1.4666376125621198e-05, "loss": 1.7313, "step": 18000 }, { "epoch": 0.12, "grad_norm": 2.46875, "learning_rate": 1.4659322270386687e-05, "loss": 1.7614, "step": 18144 }, { "epoch": 0.12, "grad_norm": 2.609375, "learning_rate": 1.4652196355232481e-05, "loss": 1.7313, "step": 18288 }, { "epoch": 0.13, "grad_norm": 2.5, "learning_rate": 1.4644998451882254e-05, "loss": 1.7307, "step": 18432 }, { "epoch": 0.13, "grad_norm": 2.546875, "learning_rate": 1.463772863278425e-05, "loss": 1.7291, "step": 18576 }, { "epoch": 0.13, "grad_norm": 2.359375, "learning_rate": 1.4630386971110556e-05, "loss": 1.7315, "step": 18720 } ], "logging_steps": 144, "max_steps": 147005, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1440, "total_flos": 7.842747163103724e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }