{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.944596361907027, "eval_steps": 500, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059445963619070265, "grad_norm": 2.0723960399627686, "learning_rate": 4.985138509095233e-05, "loss": 3.6788, "step": 500 }, { "epoch": 0.11889192723814053, "grad_norm": 1.9278995990753174, "learning_rate": 4.970277018190465e-05, "loss": 3.4742, "step": 1000 }, { "epoch": 0.1783378908572108, "grad_norm": 1.4848977327346802, "learning_rate": 4.955415527285698e-05, "loss": 3.3942, "step": 1500 }, { "epoch": 0.23778385447628106, "grad_norm": 1.3492341041564941, "learning_rate": 4.94055403638093e-05, "loss": 3.3358, "step": 2000 }, { "epoch": 0.2972298180953513, "grad_norm": 1.212128758430481, "learning_rate": 4.925692545476163e-05, "loss": 3.2851, "step": 2500 }, { "epoch": 0.3566757817144216, "grad_norm": 1.1597293615341187, "learning_rate": 4.9108310545713945e-05, "loss": 3.2331, "step": 3000 }, { "epoch": 0.41612174533349183, "grad_norm": 0.9653922319412231, "learning_rate": 4.8959695636666275e-05, "loss": 3.2339, "step": 3500 }, { "epoch": 0.4755677089525621, "grad_norm": 1.0085793733596802, "learning_rate": 4.88110807276186e-05, "loss": 3.1856, "step": 4000 }, { "epoch": 0.5350136725716323, "grad_norm": 1.0556505918502808, "learning_rate": 4.866246581857092e-05, "loss": 3.1748, "step": 4500 }, { "epoch": 0.5944596361907026, "grad_norm": 0.9526228904724121, "learning_rate": 4.851385090952324e-05, "loss": 3.1529, "step": 5000 }, { "epoch": 0.6539055998097729, "grad_norm": 0.984980046749115, "learning_rate": 4.836523600047557e-05, "loss": 3.1378, "step": 5500 }, { "epoch": 0.7133515634288432, "grad_norm": 1.0135027170181274, "learning_rate": 4.8216621091427895e-05, "loss": 3.0848, "step": 6000 }, { "epoch": 0.7727975270479135, "grad_norm": 0.9454924464225769, "learning_rate": 4.806800618238022e-05, "loss": 3.0916, "step": 6500 }, { "epoch": 0.8322434906669837, "grad_norm": 0.9793129563331604, "learning_rate": 4.791939127333254e-05, "loss": 3.0642, "step": 7000 }, { "epoch": 0.891689454286054, "grad_norm": 0.9016062617301941, "learning_rate": 4.777077636428487e-05, "loss": 3.0657, "step": 7500 }, { "epoch": 0.9511354179051242, "grad_norm": 0.8690605163574219, "learning_rate": 4.762216145523719e-05, "loss": 3.0281, "step": 8000 }, { "epoch": 1.0105813815241944, "grad_norm": 0.891808271408081, "learning_rate": 4.7473546546189516e-05, "loss": 3.0155, "step": 8500 }, { "epoch": 1.0700273451432647, "grad_norm": 0.9521974325180054, "learning_rate": 4.732493163714184e-05, "loss": 2.9713, "step": 9000 }, { "epoch": 1.129473308762335, "grad_norm": 0.9132643938064575, "learning_rate": 4.717631672809417e-05, "loss": 2.9663, "step": 9500 }, { "epoch": 1.1889192723814053, "grad_norm": 0.909182608127594, "learning_rate": 4.702770181904649e-05, "loss": 2.9616, "step": 10000 }, { "epoch": 1.2483652360004756, "grad_norm": 0.912726104259491, "learning_rate": 4.687908690999881e-05, "loss": 2.9653, "step": 10500 }, { "epoch": 1.3078111996195458, "grad_norm": 0.8568936586380005, "learning_rate": 4.6730472000951136e-05, "loss": 2.9486, "step": 11000 }, { "epoch": 1.3672571632386161, "grad_norm": 0.9120291471481323, "learning_rate": 4.6581857091903465e-05, "loss": 2.932, "step": 11500 }, { "epoch": 1.4267031268576864, "grad_norm": 0.981961190700531, "learning_rate": 4.643324218285579e-05, "loss": 2.9345, "step": 12000 }, { "epoch": 1.4861490904767567, "grad_norm": 
0.9763424396514893, "learning_rate": 4.628462727380811e-05, "loss": 2.9193, "step": 12500 }, { "epoch": 1.545595054095827, "grad_norm": 0.8868328332901001, "learning_rate": 4.6136012364760434e-05, "loss": 2.9164, "step": 13000 }, { "epoch": 1.605041017714897, "grad_norm": 0.9175488352775574, "learning_rate": 4.598739745571276e-05, "loss": 2.8932, "step": 13500 }, { "epoch": 1.6644869813339676, "grad_norm": 0.890186607837677, "learning_rate": 4.583878254666508e-05, "loss": 2.8933, "step": 14000 }, { "epoch": 1.7239329449530376, "grad_norm": 0.9198343753814697, "learning_rate": 4.569016763761741e-05, "loss": 2.881, "step": 14500 }, { "epoch": 1.783378908572108, "grad_norm": 0.9706104397773743, "learning_rate": 4.554155272856973e-05, "loss": 2.8705, "step": 15000 }, { "epoch": 1.8428248721911782, "grad_norm": 0.9355807304382324, "learning_rate": 4.539293781952206e-05, "loss": 2.8601, "step": 15500 }, { "epoch": 1.9022708358102485, "grad_norm": 0.8972137570381165, "learning_rate": 4.524432291047438e-05, "loss": 2.8632, "step": 16000 }, { "epoch": 1.9617167994293188, "grad_norm": 0.8553013801574707, "learning_rate": 4.5095708001426706e-05, "loss": 2.8696, "step": 16500 }, { "epoch": 2.021162763048389, "grad_norm": 0.8952363133430481, "learning_rate": 4.494709309237903e-05, "loss": 2.8541, "step": 17000 }, { "epoch": 2.0806087266674593, "grad_norm": 0.8947279453277588, "learning_rate": 4.479847818333135e-05, "loss": 2.8203, "step": 17500 }, { "epoch": 2.1400546902865294, "grad_norm": 0.8680304884910583, "learning_rate": 4.4649863274283674e-05, "loss": 2.8088, "step": 18000 }, { "epoch": 2.1995006539056, "grad_norm": 0.8425644040107727, "learning_rate": 4.4501248365236004e-05, "loss": 2.8064, "step": 18500 }, { "epoch": 2.25894661752467, "grad_norm": 0.9474213719367981, "learning_rate": 4.4352633456188327e-05, "loss": 2.7851, "step": 19000 }, { "epoch": 2.3183925811437405, "grad_norm": 0.9292487502098083, "learning_rate": 4.420401854714065e-05, "loss": 2.8062, "step": 19500 }, { "epoch": 2.3778385447628105, "grad_norm": 0.8527488708496094, "learning_rate": 4.405540363809297e-05, "loss": 2.7851, "step": 20000 }, { "epoch": 2.437284508381881, "grad_norm": 0.9439261555671692, "learning_rate": 4.39067887290453e-05, "loss": 2.7873, "step": 20500 }, { "epoch": 2.496730472000951, "grad_norm": 0.9343836903572083, "learning_rate": 4.3758173819997624e-05, "loss": 2.7611, "step": 21000 }, { "epoch": 2.5561764356200216, "grad_norm": 0.9050599932670593, "learning_rate": 4.360955891094995e-05, "loss": 2.767, "step": 21500 }, { "epoch": 2.6156223992390917, "grad_norm": 0.9053699374198914, "learning_rate": 4.346094400190227e-05, "loss": 2.7873, "step": 22000 }, { "epoch": 2.6750683628581617, "grad_norm": 0.9282116293907166, "learning_rate": 4.33123290928546e-05, "loss": 2.7607, "step": 22500 }, { "epoch": 2.7345143264772322, "grad_norm": 0.9617480635643005, "learning_rate": 4.316371418380692e-05, "loss": 2.7678, "step": 23000 }, { "epoch": 2.7939602900963023, "grad_norm": 0.9725137948989868, "learning_rate": 4.3015099274759244e-05, "loss": 2.7665, "step": 23500 }, { "epoch": 2.853406253715373, "grad_norm": 0.9514666199684143, "learning_rate": 4.286648436571157e-05, "loss": 2.7534, "step": 24000 }, { "epoch": 2.912852217334443, "grad_norm": 0.9485461115837097, "learning_rate": 4.27178694566639e-05, "loss": 2.7306, "step": 24500 }, { "epoch": 2.9722981809535134, "grad_norm": 1.014106035232544, "learning_rate": 4.256925454761622e-05, "loss": 2.736, "step": 25000 }, { "epoch": 3.0317441445725835, "grad_norm": 
0.9117903113365173, "learning_rate": 4.242063963856854e-05, "loss": 2.7278, "step": 25500 }, { "epoch": 3.091190108191654, "grad_norm": 0.8904880881309509, "learning_rate": 4.2272024729520865e-05, "loss": 2.7156, "step": 26000 }, { "epoch": 3.150636071810724, "grad_norm": 0.8653568625450134, "learning_rate": 4.2123409820473194e-05, "loss": 2.7137, "step": 26500 }, { "epoch": 3.210082035429794, "grad_norm": 0.9386480450630188, "learning_rate": 4.197479491142551e-05, "loss": 2.7021, "step": 27000 }, { "epoch": 3.2695279990488646, "grad_norm": 1.0122427940368652, "learning_rate": 4.182618000237784e-05, "loss": 2.699, "step": 27500 }, { "epoch": 3.3289739626679347, "grad_norm": 0.9319558143615723, "learning_rate": 4.167756509333017e-05, "loss": 2.689, "step": 28000 }, { "epoch": 3.388419926287005, "grad_norm": 0.9281746745109558, "learning_rate": 4.152895018428249e-05, "loss": 2.7027, "step": 28500 }, { "epoch": 3.4478658899060752, "grad_norm": 0.9750462770462036, "learning_rate": 4.1380335275234815e-05, "loss": 2.6947, "step": 29000 }, { "epoch": 3.5073118535251457, "grad_norm": 0.8887720704078674, "learning_rate": 4.123172036618714e-05, "loss": 2.6864, "step": 29500 }, { "epoch": 3.566757817144216, "grad_norm": 0.9884176254272461, "learning_rate": 4.108310545713947e-05, "loss": 2.6893, "step": 30000 }, { "epoch": 3.6262037807632863, "grad_norm": 0.9995080828666687, "learning_rate": 4.093449054809178e-05, "loss": 2.6734, "step": 30500 }, { "epoch": 3.6856497443823564, "grad_norm": 1.0068608522415161, "learning_rate": 4.078587563904411e-05, "loss": 2.6766, "step": 31000 }, { "epoch": 3.7450957080014264, "grad_norm": 1.0225422382354736, "learning_rate": 4.0637260729996435e-05, "loss": 2.6757, "step": 31500 }, { "epoch": 3.804541671620497, "grad_norm": 0.9354658126831055, "learning_rate": 4.0488645820948765e-05, "loss": 2.6593, "step": 32000 }, { "epoch": 3.8639876352395675, "grad_norm": 0.9209592938423157, "learning_rate": 4.034003091190108e-05, "loss": 2.6547, "step": 32500 }, { "epoch": 3.9234335988586375, "grad_norm": 0.8945015668869019, "learning_rate": 4.019141600285341e-05, "loss": 2.6719, "step": 33000 }, { "epoch": 3.9828795624777076, "grad_norm": 0.9823748469352722, "learning_rate": 4.004280109380573e-05, "loss": 2.6781, "step": 33500 }, { "epoch": 4.042325526096778, "grad_norm": 1.0186822414398193, "learning_rate": 3.989418618475806e-05, "loss": 2.6469, "step": 34000 }, { "epoch": 4.101771489715849, "grad_norm": 0.9255732297897339, "learning_rate": 3.974557127571038e-05, "loss": 2.6296, "step": 34500 }, { "epoch": 4.161217453334919, "grad_norm": 1.0235294103622437, "learning_rate": 3.959695636666271e-05, "loss": 2.6358, "step": 35000 }, { "epoch": 4.220663416953989, "grad_norm": 0.911547064781189, "learning_rate": 3.944834145761503e-05, "loss": 2.6354, "step": 35500 }, { "epoch": 4.280109380573059, "grad_norm": 1.0124516487121582, "learning_rate": 3.929972654856735e-05, "loss": 2.6416, "step": 36000 }, { "epoch": 4.33955534419213, "grad_norm": 1.0222316980361938, "learning_rate": 3.9151111639519676e-05, "loss": 2.6188, "step": 36500 }, { "epoch": 4.3990013078112, "grad_norm": 0.9710135459899902, "learning_rate": 3.9002496730472005e-05, "loss": 2.6228, "step": 37000 }, { "epoch": 4.45844727143027, "grad_norm": 1.0287182331085205, "learning_rate": 3.885388182142433e-05, "loss": 2.6067, "step": 37500 }, { "epoch": 4.51789323504934, "grad_norm": 0.9699456095695496, "learning_rate": 3.870526691237665e-05, "loss": 2.6385, "step": 38000 }, { "epoch": 4.57733919866841, "grad_norm": 
0.9066009521484375, "learning_rate": 3.855665200332897e-05, "loss": 2.6284, "step": 38500 }, { "epoch": 4.636785162287481, "grad_norm": 0.8537769317626953, "learning_rate": 3.84080370942813e-05, "loss": 2.6135, "step": 39000 }, { "epoch": 4.696231125906551, "grad_norm": 1.0666980743408203, "learning_rate": 3.8259422185233626e-05, "loss": 2.6312, "step": 39500 }, { "epoch": 4.755677089525621, "grad_norm": 1.0641474723815918, "learning_rate": 3.811080727618595e-05, "loss": 2.6127, "step": 40000 }, { "epoch": 4.815123053144691, "grad_norm": 1.076323390007019, "learning_rate": 3.796219236713827e-05, "loss": 2.6184, "step": 40500 }, { "epoch": 4.874569016763762, "grad_norm": 0.8963558077812195, "learning_rate": 3.78135774580906e-05, "loss": 2.6165, "step": 41000 }, { "epoch": 4.934014980382832, "grad_norm": 0.968908429145813, "learning_rate": 3.766496254904292e-05, "loss": 2.6009, "step": 41500 }, { "epoch": 4.993460944001902, "grad_norm": 0.9362033605575562, "learning_rate": 3.7516347639995246e-05, "loss": 2.5956, "step": 42000 }, { "epoch": 5.052906907620972, "grad_norm": 1.1101199388504028, "learning_rate": 3.736773273094757e-05, "loss": 2.5755, "step": 42500 }, { "epoch": 5.112352871240043, "grad_norm": 1.2178868055343628, "learning_rate": 3.72191178218999e-05, "loss": 2.5724, "step": 43000 }, { "epoch": 5.171798834859113, "grad_norm": 1.0143418312072754, "learning_rate": 3.707050291285222e-05, "loss": 2.5834, "step": 43500 }, { "epoch": 5.231244798478183, "grad_norm": 0.9720271825790405, "learning_rate": 3.6921888003804544e-05, "loss": 2.586, "step": 44000 }, { "epoch": 5.290690762097253, "grad_norm": 0.8847070932388306, "learning_rate": 3.6773273094756866e-05, "loss": 2.5953, "step": 44500 }, { "epoch": 5.3501367257163235, "grad_norm": 0.9654759764671326, "learning_rate": 3.6624658185709196e-05, "loss": 2.5777, "step": 45000 }, { "epoch": 5.409582689335394, "grad_norm": 0.9272730350494385, "learning_rate": 3.647604327666151e-05, "loss": 2.5774, "step": 45500 }, { "epoch": 5.4690286529544645, "grad_norm": 0.9674676656723022, "learning_rate": 3.632742836761384e-05, "loss": 2.5779, "step": 46000 }, { "epoch": 5.528474616573535, "grad_norm": 1.0238367319107056, "learning_rate": 3.6178813458566164e-05, "loss": 2.5683, "step": 46500 }, { "epoch": 5.587920580192605, "grad_norm": 1.1663753986358643, "learning_rate": 3.603019854951849e-05, "loss": 2.5802, "step": 47000 }, { "epoch": 5.647366543811675, "grad_norm": 0.8961432576179504, "learning_rate": 3.588158364047081e-05, "loss": 2.5726, "step": 47500 }, { "epoch": 5.706812507430746, "grad_norm": 1.1115467548370361, "learning_rate": 3.573296873142314e-05, "loss": 2.5719, "step": 48000 }, { "epoch": 5.766258471049816, "grad_norm": 1.00434148311615, "learning_rate": 3.558435382237546e-05, "loss": 2.556, "step": 48500 }, { "epoch": 5.825704434668886, "grad_norm": 1.1120518445968628, "learning_rate": 3.5435738913327784e-05, "loss": 2.5627, "step": 49000 }, { "epoch": 5.885150398287957, "grad_norm": 0.9611983299255371, "learning_rate": 3.528712400428011e-05, "loss": 2.5568, "step": 49500 }, { "epoch": 5.944596361907027, "grad_norm": 1.1176481246948242, "learning_rate": 3.5138509095232436e-05, "loss": 2.5634, "step": 50000 } ], "logging_steps": 500, "max_steps": 168220, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 2.090205609984e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }
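Note: the object above is a standard Hugging Face Transformers trainer_state.json, logging training loss, learning rate, and gradient norm every 500 steps through step 50,000 of 168,220 (roughly 5.9 of 20 planned epochs). The snippet below is a minimal sketch, not part of the checkpoint itself, showing one way to load the log_history entries and plot the loss and learning-rate curves. The input filename "trainer_state.json", the output filename "trainer_state_curves.png", and the use of matplotlib are assumptions for illustration.

# Minimal sketch: plot loss and learning rate from a trainer_state.json.
# Assumes the JSON above is saved as "trainer_state.json" and matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # hypothetical path
    state = json.load(f)

# Each training log entry carries step, epoch, loss, learning_rate, grad_norm.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("trainer_state_curves.png")  # hypothetical output path

Run from the directory containing the checkpoint (or adjust the path); the left panel shows the loss falling from about 3.68 to about 2.56 over the logged steps, and the right panel shows the linear learning-rate decay recorded in the log.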