{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9970326409495549, "eval_steps": 42, "global_step": 168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.0981353223323822, "learning_rate": 2e-05, "loss": 0.641, "step": 1 }, { "epoch": 0.01, "eval_loss": 0.6416735053062439, "eval_runtime": 21.4326, "eval_samples_per_second": 46.331, "eval_steps_per_second": 11.618, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.09748291969299316, "learning_rate": 4e-05, "loss": 0.6396, "step": 2 }, { "epoch": 0.02, "grad_norm": 0.09947647899389267, "learning_rate": 6e-05, "loss": 0.6397, "step": 3 }, { "epoch": 0.02, "grad_norm": 0.09976381808519363, "learning_rate": 8e-05, "loss": 0.6371, "step": 4 }, { "epoch": 0.03, "grad_norm": 0.10493721067905426, "learning_rate": 0.0001, "loss": 0.6491, "step": 5 }, { "epoch": 0.04, "grad_norm": 0.1144007071852684, "learning_rate": 0.00012, "loss": 0.6218, "step": 6 }, { "epoch": 0.04, "grad_norm": 0.08536222577095032, "learning_rate": 0.00014, "loss": 0.6177, "step": 7 }, { "epoch": 0.05, "grad_norm": 0.11926598846912384, "learning_rate": 0.00016, "loss": 0.5861, "step": 8 }, { "epoch": 0.05, "grad_norm": 0.15648387372493744, "learning_rate": 0.00018, "loss": 0.6006, "step": 9 }, { "epoch": 0.06, "grad_norm": 0.12172720581293106, "learning_rate": 0.0002, "loss": 0.5845, "step": 10 }, { "epoch": 0.07, "grad_norm": 0.09348208457231522, "learning_rate": 0.0001999988739622358, "loss": 0.5471, "step": 11 }, { "epoch": 0.07, "grad_norm": 0.07471276819705963, "learning_rate": 0.00019999549587430254, "loss": 0.578, "step": 12 }, { "epoch": 0.08, "grad_norm": 0.07200929522514343, "learning_rate": 0.00019998986581227718, "loss": 0.5328, "step": 13 }, { "epoch": 0.08, "grad_norm": 0.07460763305425644, "learning_rate": 0.000199981983902953, "loss": 0.5651, "step": 14 }, { "epoch": 0.09, "grad_norm": 0.07441641390323639, "learning_rate": 0.00019997185032383664, "loss": 0.5589, "step": 15 }, { "epoch": 0.09, "grad_norm": 0.07513019442558289, "learning_rate": 0.00019995946530314385, "loss": 0.5736, "step": 16 }, { "epoch": 0.1, "grad_norm": 0.06902395933866501, "learning_rate": 0.00019994482911979468, "loss": 0.556, "step": 17 }, { "epoch": 0.11, "grad_norm": 0.07314619421958923, "learning_rate": 0.00019992794210340706, "loss": 0.5469, "step": 18 }, { "epoch": 0.11, "grad_norm": 0.06833848357200623, "learning_rate": 0.00019990880463428937, "loss": 0.5448, "step": 19 }, { "epoch": 0.12, "grad_norm": 0.07301248610019684, "learning_rate": 0.00019988741714343177, "loss": 0.5612, "step": 20 }, { "epoch": 0.12, "grad_norm": 0.07063400000333786, "learning_rate": 0.0001998637801124968, "loss": 0.5446, "step": 21 }, { "epoch": 0.13, "grad_norm": 0.06935883313417435, "learning_rate": 0.00019983789407380828, "loss": 0.5223, "step": 22 }, { "epoch": 0.14, "grad_norm": 0.06576420366764069, "learning_rate": 0.00019980975961033924, "loss": 0.5351, "step": 23 }, { "epoch": 0.14, "grad_norm": 0.07276671379804611, "learning_rate": 0.00019977937735569915, "loss": 0.5423, "step": 24 }, { "epoch": 0.15, "grad_norm": 0.0756976306438446, "learning_rate": 0.00019974674799411925, "loss": 0.5344, "step": 25 }, { "epoch": 0.15, "grad_norm": 0.06945928931236267, "learning_rate": 0.00019971187226043745, "loss": 0.5198, "step": 26 }, { "epoch": 0.16, "grad_norm": 0.06587155908346176, "learning_rate": 0.0001996747509400816, "loss": 0.5175, "step": 27 }, { "epoch": 0.17, "grad_norm": 0.0752682164311409, "learning_rate": 0.0001996353848690519, "loss": 0.5068, "step": 28 }, { "epoch": 0.17, "grad_norm": 0.0740601122379303, "learning_rate": 0.00019959377493390196, "loss": 0.535, "step": 29 }, { "epoch": 0.18, "grad_norm": 0.07076304405927658, "learning_rate": 0.00019954992207171898, "loss": 0.5079, "step": 30 }, { "epoch": 0.18, "grad_norm": 0.0776033028960228, "learning_rate": 0.00019950382727010254, "loss": 0.5124, "step": 31 }, { "epoch": 0.19, "grad_norm": 0.0779872015118599, "learning_rate": 0.00019945549156714234, "loss": 0.5146, "step": 32 }, { "epoch": 0.2, "grad_norm": 0.08037945628166199, "learning_rate": 0.00019940491605139498, "loss": 0.5189, "step": 33 }, { "epoch": 0.2, "grad_norm": 0.06880298256874084, "learning_rate": 0.0001993521018618592, "loss": 0.506, "step": 34 }, { "epoch": 0.21, "grad_norm": 0.0755767747759819, "learning_rate": 0.00019929705018795053, "loss": 0.4997, "step": 35 }, { "epoch": 0.21, "grad_norm": 0.07505559921264648, "learning_rate": 0.00019923976226947417, "loss": 0.502, "step": 36 }, { "epoch": 0.22, "grad_norm": 0.07533205300569534, "learning_rate": 0.00019918023939659733, "loss": 0.5093, "step": 37 }, { "epoch": 0.23, "grad_norm": 0.0748637244105339, "learning_rate": 0.0001991184829098201, "loss": 0.4976, "step": 38 }, { "epoch": 0.23, "grad_norm": 0.076931431889534, "learning_rate": 0.00019905449419994518, "loss": 0.4992, "step": 39 }, { "epoch": 0.24, "grad_norm": 0.07511387020349503, "learning_rate": 0.0001989882747080466, "loss": 0.5069, "step": 40 }, { "epoch": 0.24, "grad_norm": 0.0723625123500824, "learning_rate": 0.00019891982592543746, "loss": 0.4952, "step": 41 }, { "epoch": 0.25, "grad_norm": 0.07320375740528107, "learning_rate": 0.00019884914939363588, "loss": 0.5093, "step": 42 }, { "epoch": 0.25, "eval_loss": 0.5259941220283508, "eval_runtime": 21.4684, "eval_samples_per_second": 46.254, "eval_steps_per_second": 11.598, "step": 42 }, { "epoch": 0.26, "grad_norm": 0.07251272350549698, "learning_rate": 0.00019877624670433086, "loss": 0.4931, "step": 43 }, { "epoch": 0.26, "grad_norm": 0.07731667906045914, "learning_rate": 0.00019870111949934599, "loss": 0.4879, "step": 44 }, { "epoch": 0.27, "grad_norm": 0.074358269572258, "learning_rate": 0.00019862376947060264, "loss": 0.5049, "step": 45 }, { "epoch": 0.27, "grad_norm": 0.0808371901512146, "learning_rate": 0.0001985441983600819, "loss": 0.517, "step": 46 }, { "epoch": 0.28, "grad_norm": 0.07559769600629807, "learning_rate": 0.00019846240795978528, "loss": 0.4834, "step": 47 }, { "epoch": 0.28, "grad_norm": 0.07425505667924881, "learning_rate": 0.00019837840011169438, "loss": 0.5138, "step": 48 }, { "epoch": 0.29, "grad_norm": 0.07782939821481705, "learning_rate": 0.00019829217670772935, "loss": 0.4858, "step": 49 }, { "epoch": 0.3, "grad_norm": 0.0754002034664154, "learning_rate": 0.00019820373968970642, "loss": 0.4941, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.07364428788423538, "learning_rate": 0.000198113091049294, "loss": 0.4835, "step": 51 }, { "epoch": 0.31, "grad_norm": 0.08309967815876007, "learning_rate": 0.00019802023282796796, "loss": 0.5237, "step": 52 }, { "epoch": 0.31, "grad_norm": 0.07548778504133224, "learning_rate": 0.00019792516711696556, "loss": 0.4923, "step": 53 }, { "epoch": 0.32, "grad_norm": 0.07607278972864151, "learning_rate": 0.0001978278960572384, "loss": 0.4971, "step": 54 }, { "epoch": 0.33, "grad_norm": 0.07432844489812851, "learning_rate": 0.00019772842183940422, "loss": 0.4874, "step": 55 }, { "epoch": 0.33, "grad_norm": 0.077260322868824, "learning_rate": 0.00019762674670369755, "loss": 0.5067, "step": 56 }, { "epoch": 0.34, "grad_norm": 0.08594146370887756, "learning_rate": 0.00019752287293991927, "loss": 0.4804, "step": 57 }, { "epoch": 0.34, "grad_norm": 0.075816310942173, "learning_rate": 0.00019741680288738492, "loss": 0.4738, "step": 58 }, { "epoch": 0.35, "grad_norm": 0.07784326374530792, "learning_rate": 0.00019730853893487228, "loss": 0.4768, "step": 59 }, { "epoch": 0.36, "grad_norm": 0.08903329074382782, "learning_rate": 0.00019719808352056724, "loss": 0.4773, "step": 60 }, { "epoch": 0.36, "grad_norm": 0.07911587506532669, "learning_rate": 0.00019708543913200924, "loss": 0.4672, "step": 61 }, { "epoch": 0.37, "grad_norm": 0.07881385087966919, "learning_rate": 0.00019697060830603494, "loss": 0.4824, "step": 62 }, { "epoch": 0.37, "grad_norm": 0.08292945474386215, "learning_rate": 0.00019685359362872125, "loss": 0.4814, "step": 63 }, { "epoch": 0.38, "grad_norm": 0.08237861096858978, "learning_rate": 0.00019673439773532713, "loss": 0.486, "step": 64 }, { "epoch": 0.39, "grad_norm": 0.07958442717790604, "learning_rate": 0.0001966130233102341, "loss": 0.4913, "step": 65 }, { "epoch": 0.39, "grad_norm": 0.07969169318675995, "learning_rate": 0.00019648947308688593, "loss": 0.4781, "step": 66 }, { "epoch": 0.4, "grad_norm": 0.08310563862323761, "learning_rate": 0.00019636374984772692, "loss": 0.4811, "step": 67 }, { "epoch": 0.4, "grad_norm": 0.07763976603746414, "learning_rate": 0.00019623585642413938, "loss": 0.4809, "step": 68 }, { "epoch": 0.41, "grad_norm": 0.0927213802933693, "learning_rate": 0.00019610579569637982, "loss": 0.5019, "step": 69 }, { "epoch": 0.42, "grad_norm": 0.08405344933271408, "learning_rate": 0.000195973570593514, "loss": 0.5001, "step": 70 }, { "epoch": 0.42, "grad_norm": 0.07862479984760284, "learning_rate": 0.0001958391840933512, "loss": 0.4894, "step": 71 }, { "epoch": 0.43, "grad_norm": 0.07815296947956085, "learning_rate": 0.00019570263922237687, "loss": 0.4676, "step": 72 }, { "epoch": 0.43, "grad_norm": 0.07999672740697861, "learning_rate": 0.00019556393905568458, "loss": 0.4857, "step": 73 }, { "epoch": 0.44, "grad_norm": 0.08266247063875198, "learning_rate": 0.0001954230867169069, "loss": 0.4842, "step": 74 }, { "epoch": 0.45, "grad_norm": 0.08117777854204178, "learning_rate": 0.00019528008537814486, "loss": 0.4602, "step": 75 }, { "epoch": 0.45, "grad_norm": 0.08203484117984772, "learning_rate": 0.00019513493825989664, "loss": 0.4761, "step": 76 }, { "epoch": 0.46, "grad_norm": 0.07647153735160828, "learning_rate": 0.00019498764863098495, "loss": 0.4839, "step": 77 }, { "epoch": 0.46, "grad_norm": 0.0811714455485344, "learning_rate": 0.00019483821980848347, "loss": 0.4803, "step": 78 }, { "epoch": 0.47, "grad_norm": 0.08266978710889816, "learning_rate": 0.00019468665515764215, "loss": 0.4665, "step": 79 }, { "epoch": 0.47, "grad_norm": 0.07869689911603928, "learning_rate": 0.00019453295809181143, "loss": 0.4857, "step": 80 }, { "epoch": 0.48, "grad_norm": 0.08934654295444489, "learning_rate": 0.00019437713207236525, "loss": 0.4825, "step": 81 }, { "epoch": 0.49, "grad_norm": 0.07842836529016495, "learning_rate": 0.00019421918060862333, "loss": 0.4609, "step": 82 }, { "epoch": 0.49, "grad_norm": 0.08244986832141876, "learning_rate": 0.0001940591072577719, "loss": 0.4688, "step": 83 }, { "epoch": 0.5, "grad_norm": 0.07819854468107224, "learning_rate": 0.00019389691562478374, "loss": 0.4665, "step": 84 }, { "epoch": 0.5, "eval_loss": 0.5117939114570618, "eval_runtime": 21.4742, "eval_samples_per_second": 46.242, "eval_steps_per_second": 11.595, "step": 84 }, { "epoch": 0.5, "grad_norm": 0.0837428942322731, "learning_rate": 0.0001937326093623369, "loss": 0.4952, "step": 85 }, { "epoch": 0.51, "grad_norm": 0.07781701534986496, "learning_rate": 0.00019356619217073253, "loss": 0.467, "step": 86 }, { "epoch": 0.52, "grad_norm": 0.08447270840406418, "learning_rate": 0.00019339766779781145, "loss": 0.4838, "step": 87 }, { "epoch": 0.52, "grad_norm": 0.08231997489929199, "learning_rate": 0.00019322704003886987, "loss": 0.4611, "step": 88 }, { "epoch": 0.53, "grad_norm": 0.08507382869720459, "learning_rate": 0.00019305431273657374, "loss": 0.4757, "step": 89 }, { "epoch": 0.53, "grad_norm": 0.08521989732980728, "learning_rate": 0.0001928794897808724, "loss": 0.4854, "step": 90 }, { "epoch": 0.54, "grad_norm": 0.0963786169886589, "learning_rate": 0.00019270257510891082, "loss": 0.4505, "step": 91 }, { "epoch": 0.55, "grad_norm": 0.08671442419290543, "learning_rate": 0.0001925235727049411, "loss": 0.4766, "step": 92 }, { "epoch": 0.55, "grad_norm": 0.09087081998586655, "learning_rate": 0.0001923424866002325, "loss": 0.4966, "step": 93 }, { "epoch": 0.56, "grad_norm": 0.07899381965398788, "learning_rate": 0.00019215932087298092, "loss": 0.4638, "step": 94 }, { "epoch": 0.56, "grad_norm": 0.09070860594511032, "learning_rate": 0.00019197407964821684, "loss": 0.4847, "step": 95 }, { "epoch": 0.57, "grad_norm": 0.0885949656367302, "learning_rate": 0.00019178676709771258, "loss": 0.4648, "step": 96 }, { "epoch": 0.58, "grad_norm": 0.09253839403390884, "learning_rate": 0.00019159738743988825, "loss": 0.459, "step": 97 }, { "epoch": 0.58, "grad_norm": 0.08571318536996841, "learning_rate": 0.00019140594493971674, "loss": 0.4797, "step": 98 }, { "epoch": 0.59, "grad_norm": 0.07787954807281494, "learning_rate": 0.0001912124439086278, "loss": 0.4547, "step": 99 }, { "epoch": 0.59, "grad_norm": 0.08822935819625854, "learning_rate": 0.00019101688870441078, "loss": 0.4511, "step": 100 }, { "epoch": 0.6, "grad_norm": 0.08409956842660904, "learning_rate": 0.0001908192837311166, "loss": 0.4631, "step": 101 }, { "epoch": 0.61, "grad_norm": 0.08279416710138321, "learning_rate": 0.00019061963343895846, "loss": 0.4696, "step": 102 }, { "epoch": 0.61, "grad_norm": 0.09696204960346222, "learning_rate": 0.00019041794232421176, "loss": 0.4862, "step": 103 }, { "epoch": 0.62, "grad_norm": 0.08494329452514648, "learning_rate": 0.00019021421492911272, "loss": 0.4557, "step": 104 }, { "epoch": 0.62, "grad_norm": 0.08702557533979416, "learning_rate": 0.00019000845584175616, "loss": 0.4693, "step": 105 }, { "epoch": 0.63, "grad_norm": 0.09048158675432205, "learning_rate": 0.00018980066969599216, "loss": 0.4714, "step": 106 }, { "epoch": 0.64, "grad_norm": 0.08462114632129669, "learning_rate": 0.0001895908611713216, "loss": 0.4632, "step": 107 }, { "epoch": 0.64, "grad_norm": 0.09956546127796173, "learning_rate": 0.00018937903499279102, "loss": 0.4638, "step": 108 }, { "epoch": 0.65, "grad_norm": 0.08630617707967758, "learning_rate": 0.00018916519593088584, "loss": 0.4499, "step": 109 }, { "epoch": 0.65, "grad_norm": 0.08207620680332184, "learning_rate": 0.0001889493488014233, "loss": 0.4603, "step": 110 }, { "epoch": 0.66, "grad_norm": 0.08473565429449081, "learning_rate": 0.00018873149846544376, "loss": 0.4571, "step": 111 }, { "epoch": 0.66, "grad_norm": 0.08818928152322769, "learning_rate": 0.00018851164982910135, "loss": 0.4489, "step": 112 }, { "epoch": 0.67, "grad_norm": 0.08116699010133743, "learning_rate": 0.00018828980784355338, "loss": 0.4578, "step": 113 }, { "epoch": 0.68, "grad_norm": 0.08832226693630219, "learning_rate": 0.00018806597750484897, "loss": 0.4719, "step": 114 }, { "epoch": 0.68, "grad_norm": 0.08624406903982162, "learning_rate": 0.0001878401638538163, "loss": 0.4628, "step": 115 }, { "epoch": 0.69, "grad_norm": 0.08936543017625809, "learning_rate": 0.00018761237197594945, "loss": 0.4533, "step": 116 }, { "epoch": 0.69, "grad_norm": 0.08579661697149277, "learning_rate": 0.00018738260700129354, "loss": 0.4772, "step": 117 }, { "epoch": 0.7, "grad_norm": 0.08271288126707077, "learning_rate": 0.0001871508741043293, "loss": 0.4773, "step": 118 }, { "epoch": 0.71, "grad_norm": 0.08224964886903763, "learning_rate": 0.0001869171785038566, "loss": 0.4635, "step": 119 }, { "epoch": 0.71, "grad_norm": 0.087012380361557, "learning_rate": 0.00018668152546287686, "loss": 0.4559, "step": 120 }, { "epoch": 0.72, "grad_norm": 0.08352699875831604, "learning_rate": 0.00018644392028847458, "loss": 0.4485, "step": 121 }, { "epoch": 0.72, "grad_norm": 0.08281444013118744, "learning_rate": 0.00018620436833169772, "loss": 0.4393, "step": 122 }, { "epoch": 0.73, "grad_norm": 0.08376545459032059, "learning_rate": 0.00018596287498743732, "loss": 0.4525, "step": 123 }, { "epoch": 0.74, "grad_norm": 0.08526434749364853, "learning_rate": 0.0001857194456943058, "loss": 0.4456, "step": 124 }, { "epoch": 0.74, "grad_norm": 0.08151934295892715, "learning_rate": 0.0001854740859345148, "loss": 0.4576, "step": 125 }, { "epoch": 0.75, "grad_norm": 0.08793777972459793, "learning_rate": 0.0001852268012337514, "loss": 0.4431, "step": 126 }, { "epoch": 0.75, "eval_loss": 0.5042669773101807, "eval_runtime": 21.4592, "eval_samples_per_second": 46.274, "eval_steps_per_second": 11.603, "step": 126 }, { "epoch": 0.75, "grad_norm": 0.08135095983743668, "learning_rate": 0.00018497759716105377, "loss": 0.4384, "step": 127 }, { "epoch": 0.76, "grad_norm": 0.0917576476931572, "learning_rate": 0.0001847264793286859, "loss": 0.4687, "step": 128 }, { "epoch": 0.77, "grad_norm": 0.08832691609859467, "learning_rate": 0.00018447345339201102, "loss": 0.4386, "step": 129 }, { "epoch": 0.77, "grad_norm": 0.08340886980295181, "learning_rate": 0.00018421852504936438, "loss": 0.4512, "step": 130 }, { "epoch": 0.78, "grad_norm": 0.08589499443769455, "learning_rate": 0.00018396170004192475, "loss": 0.4387, "step": 131 }, { "epoch": 0.78, "grad_norm": 0.08753557503223419, "learning_rate": 0.00018370298415358526, "loss": 0.4615, "step": 132 }, { "epoch": 0.79, "grad_norm": 0.08406232297420502, "learning_rate": 0.00018344238321082315, "loss": 0.4465, "step": 133 }, { "epoch": 0.8, "grad_norm": 0.08514856547117233, "learning_rate": 0.0001831799030825685, "loss": 0.4516, "step": 134 }, { "epoch": 0.8, "grad_norm": 0.09259331226348877, "learning_rate": 0.000182915549680072, "loss": 0.4387, "step": 135 }, { "epoch": 0.81, "grad_norm": 0.08862275630235672, "learning_rate": 0.00018264932895677193, "loss": 0.4434, "step": 136 }, { "epoch": 0.81, "grad_norm": 0.08515379577875137, "learning_rate": 0.0001823812469081601, "loss": 0.4425, "step": 137 }, { "epoch": 0.82, "grad_norm": 0.09041007608175278, "learning_rate": 0.00018211130957164668, "loss": 0.4607, "step": 138 }, { "epoch": 0.82, "grad_norm": 0.08312032371759415, "learning_rate": 0.0001818395230264244, "loss": 0.442, "step": 139 }, { "epoch": 0.83, "grad_norm": 0.08981412649154663, "learning_rate": 0.00018156589339333152, "loss": 0.4608, "step": 140 }, { "epoch": 0.84, "grad_norm": 0.08991118520498276, "learning_rate": 0.00018129042683471402, "loss": 0.451, "step": 141 }, { "epoch": 0.84, "grad_norm": 0.08628728240728378, "learning_rate": 0.00018101312955428692, "loss": 0.4453, "step": 142 }, { "epoch": 0.85, "grad_norm": 0.08730859309434891, "learning_rate": 0.00018073400779699435, "loss": 0.4485, "step": 143 }, { "epoch": 0.85, "grad_norm": 0.08489865809679031, "learning_rate": 0.0001804530678488691, "loss": 0.4592, "step": 144 }, { "epoch": 0.86, "grad_norm": 0.08439410477876663, "learning_rate": 0.00018017031603689102, "loss": 0.4326, "step": 145 }, { "epoch": 0.87, "grad_norm": 0.09346488118171692, "learning_rate": 0.0001798857587288445, "loss": 0.4484, "step": 146 }, { "epoch": 0.87, "grad_norm": 0.09130821377038956, "learning_rate": 0.00017959940233317498, "loss": 0.4502, "step": 147 }, { "epoch": 0.88, "grad_norm": 0.08846256881952286, "learning_rate": 0.0001793112532988448, "loss": 0.4322, "step": 148 }, { "epoch": 0.88, "grad_norm": 0.09061886370182037, "learning_rate": 0.00017902131811518786, "loss": 0.4437, "step": 149 }, { "epoch": 0.89, "grad_norm": 0.09259927272796631, "learning_rate": 0.00017872960331176345, "loss": 0.4545, "step": 150 }, { "epoch": 0.9, "grad_norm": 0.09632189571857452, "learning_rate": 0.00017843611545820926, "loss": 0.4515, "step": 151 }, { "epoch": 0.9, "grad_norm": 0.08714065700769424, "learning_rate": 0.00017814086116409348, "loss": 0.4602, "step": 152 }, { "epoch": 0.91, "grad_norm": 0.09537078440189362, "learning_rate": 0.00017784384707876576, "loss": 0.4482, "step": 153 }, { "epoch": 0.91, "grad_norm": 0.09175322949886322, "learning_rate": 0.00017754507989120764, "loss": 0.4681, "step": 154 }, { "epoch": 0.92, "grad_norm": 0.08962789177894592, "learning_rate": 0.00017724456632988187, "loss": 0.4304, "step": 155 }, { "epoch": 0.93, "grad_norm": 0.09643880277872086, "learning_rate": 0.00017694231316258077, "loss": 0.4532, "step": 156 }, { "epoch": 0.93, "grad_norm": 0.08335065096616745, "learning_rate": 0.00017663832719627402, "loss": 0.4504, "step": 157 }, { "epoch": 0.94, "grad_norm": 0.087184838950634, "learning_rate": 0.0001763326152769551, "loss": 0.4752, "step": 158 }, { "epoch": 0.94, "grad_norm": 0.08858635276556015, "learning_rate": 0.0001760251842894874, "loss": 0.4413, "step": 159 }, { "epoch": 0.95, "grad_norm": 0.08101391792297363, "learning_rate": 0.00017571604115744892, "loss": 0.4465, "step": 160 }, { "epoch": 0.96, "grad_norm": 0.08623132854700089, "learning_rate": 0.0001754051928429765, "loss": 0.4673, "step": 161 }, { "epoch": 0.96, "grad_norm": 0.0922100692987442, "learning_rate": 0.00017509264634660895, "loss": 0.4587, "step": 162 }, { "epoch": 0.97, "grad_norm": 0.08243449032306671, "learning_rate": 0.00017477840870712945, "loss": 0.4368, "step": 163 }, { "epoch": 0.97, "grad_norm": 0.0845554992556572, "learning_rate": 0.00017446248700140693, "loss": 0.4209, "step": 164 }, { "epoch": 0.98, "grad_norm": 0.08277452737092972, "learning_rate": 0.00017414488834423687, "loss": 0.4397, "step": 165 }, { "epoch": 0.99, "grad_norm": 0.0826331302523613, "learning_rate": 0.00017382561988818086, "loss": 0.4333, "step": 166 }, { "epoch": 0.99, "grad_norm": 0.08441821485757828, "learning_rate": 0.0001735046888234057, "loss": 0.4496, "step": 167 }, { "epoch": 1.0, "grad_norm": 0.08665426075458527, "learning_rate": 0.00017318210237752136, "loss": 0.4523, "step": 168 }, { "epoch": 1.0, "eval_loss": 0.4984985589981079, "eval_runtime": 21.4662, "eval_samples_per_second": 46.259, "eval_steps_per_second": 11.6, "step": 168 } ], "logging_steps": 1, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 168, "total_flos": 5.013190238546166e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }