{ "best_metric": 0.21579746901988983, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.16359918200409, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008179959100204499, "grad_norm": 0.4496954381465912, "learning_rate": 1.013e-05, "loss": 0.8848, "step": 1 }, { "epoch": 0.0008179959100204499, "eval_loss": 0.5526403784751892, "eval_runtime": 164.1948, "eval_samples_per_second": 3.137, "eval_steps_per_second": 0.786, "step": 1 }, { "epoch": 0.0016359918200408998, "grad_norm": 0.38289597630500793, "learning_rate": 2.026e-05, "loss": 0.6338, "step": 2 }, { "epoch": 0.00245398773006135, "grad_norm": 0.678950309753418, "learning_rate": 3.039e-05, "loss": 1.371, "step": 3 }, { "epoch": 0.0032719836400817996, "grad_norm": 0.5606029629707336, "learning_rate": 4.052e-05, "loss": 1.0274, "step": 4 }, { "epoch": 0.00408997955010225, "grad_norm": 0.5244286060333252, "learning_rate": 5.065e-05, "loss": 0.7953, "step": 5 }, { "epoch": 0.0049079754601227, "grad_norm": 0.49360060691833496, "learning_rate": 6.078e-05, "loss": 0.8689, "step": 6 }, { "epoch": 0.0057259713701431495, "grad_norm": 1.8924438953399658, "learning_rate": 7.091e-05, "loss": 1.2078, "step": 7 }, { "epoch": 0.006543967280163599, "grad_norm": 0.60125732421875, "learning_rate": 8.104e-05, "loss": 0.951, "step": 8 }, { "epoch": 0.007361963190184049, "grad_norm": 0.5913432836532593, "learning_rate": 9.117e-05, "loss": 0.9561, "step": 9 }, { "epoch": 0.0081799591002045, "grad_norm": 0.6411980390548706, "learning_rate": 0.0001013, "loss": 0.9956, "step": 10 }, { "epoch": 0.00899795501022495, "grad_norm": 0.6919670701026917, "learning_rate": 0.00010076684210526316, "loss": 0.8319, "step": 11 }, { "epoch": 0.0098159509202454, "grad_norm": 0.6590487360954285, "learning_rate": 0.0001002336842105263, "loss": 0.7345, "step": 12 }, { "epoch": 0.01063394683026585, "grad_norm": 0.82487952709198, "learning_rate": 9.970052631578946e-05, "loss": 0.8608, "step": 13 }, { "epoch": 0.011451942740286299, "grad_norm": 0.7508109211921692, "learning_rate": 9.916736842105263e-05, "loss": 0.9751, "step": 14 }, { "epoch": 0.012269938650306749, "grad_norm": 0.7014967203140259, "learning_rate": 9.863421052631579e-05, "loss": 0.9172, "step": 15 }, { "epoch": 0.013087934560327199, "grad_norm": 0.6674103140830994, "learning_rate": 9.810105263157895e-05, "loss": 0.5077, "step": 16 }, { "epoch": 0.013905930470347648, "grad_norm": 0.8871914744377136, "learning_rate": 9.756789473684211e-05, "loss": 1.1221, "step": 17 }, { "epoch": 0.014723926380368098, "grad_norm": 0.8045795559883118, "learning_rate": 9.703473684210525e-05, "loss": 0.9631, "step": 18 }, { "epoch": 0.015541922290388548, "grad_norm": 0.8533051013946533, "learning_rate": 9.650157894736842e-05, "loss": 0.9807, "step": 19 }, { "epoch": 0.016359918200409, "grad_norm": 0.5677836537361145, "learning_rate": 9.596842105263158e-05, "loss": 0.4352, "step": 20 }, { "epoch": 0.01717791411042945, "grad_norm": 1.4102709293365479, "learning_rate": 9.543526315789474e-05, "loss": 1.2272, "step": 21 }, { "epoch": 0.0179959100204499, "grad_norm": 1.0736172199249268, "learning_rate": 9.49021052631579e-05, "loss": 1.1928, "step": 22 }, { "epoch": 0.01881390593047035, "grad_norm": 1.2938551902770996, "learning_rate": 9.436894736842105e-05, "loss": 0.9907, "step": 23 }, { "epoch": 0.0196319018404908, "grad_norm": 0.9903382658958435, "learning_rate": 9.38357894736842e-05, "loss": 0.8166, "step": 24 }, { "epoch": 0.02044989775051125, "grad_norm": 0.8400692939758301, "learning_rate": 9.330263157894737e-05, "loss": 0.9552, "step": 25 }, { "epoch": 0.0212678936605317, "grad_norm": 0.6418869495391846, "learning_rate": 9.276947368421051e-05, "loss": 0.4748, "step": 26 }, { "epoch": 0.022085889570552148, "grad_norm": 0.7171022295951843, "learning_rate": 9.223631578947369e-05, "loss": 0.6061, "step": 27 }, { "epoch": 0.022903885480572598, "grad_norm": 0.9005898237228394, "learning_rate": 9.170315789473684e-05, "loss": 0.7294, "step": 28 }, { "epoch": 0.023721881390593048, "grad_norm": 0.853922426700592, "learning_rate": 9.117e-05, "loss": 0.8019, "step": 29 }, { "epoch": 0.024539877300613498, "grad_norm": 0.6506239175796509, "learning_rate": 9.063684210526316e-05, "loss": 0.5485, "step": 30 }, { "epoch": 0.025357873210633947, "grad_norm": 0.8468363285064697, "learning_rate": 9.010368421052632e-05, "loss": 0.8216, "step": 31 }, { "epoch": 0.026175869120654397, "grad_norm": 1.2255985736846924, "learning_rate": 8.957052631578946e-05, "loss": 0.499, "step": 32 }, { "epoch": 0.026993865030674847, "grad_norm": 0.7245408296585083, "learning_rate": 8.903736842105263e-05, "loss": 0.4545, "step": 33 }, { "epoch": 0.027811860940695297, "grad_norm": 0.5149531960487366, "learning_rate": 8.850421052631579e-05, "loss": 0.3527, "step": 34 }, { "epoch": 0.028629856850715747, "grad_norm": 0.7418886423110962, "learning_rate": 8.797105263157895e-05, "loss": 0.6099, "step": 35 }, { "epoch": 0.029447852760736196, "grad_norm": 0.3611685037612915, "learning_rate": 8.743789473684211e-05, "loss": 0.152, "step": 36 }, { "epoch": 0.030265848670756646, "grad_norm": 0.2011786550283432, "learning_rate": 8.690473684210526e-05, "loss": 0.0166, "step": 37 }, { "epoch": 0.031083844580777096, "grad_norm": 0.09635209292173386, "learning_rate": 8.637157894736842e-05, "loss": 0.0076, "step": 38 }, { "epoch": 0.03190184049079755, "grad_norm": 0.08170394599437714, "learning_rate": 8.583842105263158e-05, "loss": 0.0055, "step": 39 }, { "epoch": 0.032719836400818, "grad_norm": 0.10278013348579407, "learning_rate": 8.530526315789472e-05, "loss": 0.0038, "step": 40 }, { "epoch": 0.03353783231083845, "grad_norm": 0.22206488251686096, "learning_rate": 8.47721052631579e-05, "loss": 0.0034, "step": 41 }, { "epoch": 0.0343558282208589, "grad_norm": 0.06311889737844467, "learning_rate": 8.423894736842105e-05, "loss": 0.0029, "step": 42 }, { "epoch": 0.03517382413087935, "grad_norm": 0.12246158719062805, "learning_rate": 8.37057894736842e-05, "loss": 0.0043, "step": 43 }, { "epoch": 0.0359918200408998, "grad_norm": 0.08296216279268265, "learning_rate": 8.317263157894737e-05, "loss": 0.0021, "step": 44 }, { "epoch": 0.03680981595092025, "grad_norm": 0.053368937224149704, "learning_rate": 8.263947368421053e-05, "loss": 0.001, "step": 45 }, { "epoch": 0.0376278118609407, "grad_norm": 0.0612197071313858, "learning_rate": 8.210631578947368e-05, "loss": 0.001, "step": 46 }, { "epoch": 0.03844580777096115, "grad_norm": 0.03648490086197853, "learning_rate": 8.157315789473684e-05, "loss": 0.001, "step": 47 }, { "epoch": 0.0392638036809816, "grad_norm": 0.020957158878445625, "learning_rate": 8.104e-05, "loss": 0.0004, "step": 48 }, { "epoch": 0.04008179959100205, "grad_norm": 0.31076347827911377, "learning_rate": 8.050684210526316e-05, "loss": 0.0053, "step": 49 }, { "epoch": 0.0408997955010225, "grad_norm": 0.009556309320032597, "learning_rate": 7.997368421052632e-05, "loss": 0.0002, "step": 50 }, { "epoch": 0.0408997955010225, "eval_loss": 0.31697726249694824, "eval_runtime": 164.5433, "eval_samples_per_second": 3.13, "eval_steps_per_second": 0.784, "step": 50 }, { "epoch": 0.04171779141104295, "grad_norm": 1.0068371295928955, "learning_rate": 7.944052631578947e-05, "loss": 0.9885, "step": 51 }, { "epoch": 0.0425357873210634, "grad_norm": 0.724956214427948, "learning_rate": 7.890736842105263e-05, "loss": 0.8083, "step": 52 }, { "epoch": 0.043353783231083846, "grad_norm": 0.5116149187088013, "learning_rate": 7.837421052631579e-05, "loss": 0.7497, "step": 53 }, { "epoch": 0.044171779141104296, "grad_norm": 0.5890445709228516, "learning_rate": 7.784105263157893e-05, "loss": 1.0557, "step": 54 }, { "epoch": 0.044989775051124746, "grad_norm": 0.6366136074066162, "learning_rate": 7.730789473684211e-05, "loss": 1.0897, "step": 55 }, { "epoch": 0.045807770961145196, "grad_norm": 0.503930926322937, "learning_rate": 7.677473684210526e-05, "loss": 0.909, "step": 56 }, { "epoch": 0.046625766871165646, "grad_norm": 0.3874259293079376, "learning_rate": 7.624157894736842e-05, "loss": 0.6214, "step": 57 }, { "epoch": 0.047443762781186095, "grad_norm": 0.46610915660858154, "learning_rate": 7.570842105263158e-05, "loss": 0.5465, "step": 58 }, { "epoch": 0.048261758691206545, "grad_norm": 0.38099950551986694, "learning_rate": 7.517526315789474e-05, "loss": 0.3863, "step": 59 }, { "epoch": 0.049079754601226995, "grad_norm": 0.7457447648048401, "learning_rate": 7.464210526315789e-05, "loss": 0.4988, "step": 60 }, { "epoch": 0.049897750511247445, "grad_norm": 0.48318201303482056, "learning_rate": 7.410894736842106e-05, "loss": 0.6316, "step": 61 }, { "epoch": 0.050715746421267895, "grad_norm": 0.41538336873054504, "learning_rate": 7.35757894736842e-05, "loss": 0.3852, "step": 62 }, { "epoch": 0.051533742331288344, "grad_norm": 0.40224334597587585, "learning_rate": 7.304263157894737e-05, "loss": 0.4715, "step": 63 }, { "epoch": 0.052351738241308794, "grad_norm": 0.9810552597045898, "learning_rate": 7.250947368421053e-05, "loss": 0.6195, "step": 64 }, { "epoch": 0.053169734151329244, "grad_norm": 0.930385172367096, "learning_rate": 7.197631578947368e-05, "loss": 0.6228, "step": 65 }, { "epoch": 0.053987730061349694, "grad_norm": 0.5943039655685425, "learning_rate": 7.144315789473684e-05, "loss": 0.7253, "step": 66 }, { "epoch": 0.054805725971370144, "grad_norm": 0.9984376430511475, "learning_rate": 7.091e-05, "loss": 0.9143, "step": 67 }, { "epoch": 0.05562372188139059, "grad_norm": 0.7382859587669373, "learning_rate": 7.037684210526316e-05, "loss": 0.9344, "step": 68 }, { "epoch": 0.05644171779141104, "grad_norm": 0.5910463333129883, "learning_rate": 6.984368421052632e-05, "loss": 0.5557, "step": 69 }, { "epoch": 0.05725971370143149, "grad_norm": 0.5349307060241699, "learning_rate": 6.931052631578947e-05, "loss": 0.6016, "step": 70 }, { "epoch": 0.05807770961145194, "grad_norm": 0.5327895283699036, "learning_rate": 6.877736842105263e-05, "loss": 0.5221, "step": 71 }, { "epoch": 0.05889570552147239, "grad_norm": 0.6906758546829224, "learning_rate": 6.824421052631579e-05, "loss": 0.8609, "step": 72 }, { "epoch": 0.05971370143149284, "grad_norm": 2.961278200149536, "learning_rate": 6.771105263157895e-05, "loss": 0.707, "step": 73 }, { "epoch": 0.06053169734151329, "grad_norm": 0.5922215580940247, "learning_rate": 6.71778947368421e-05, "loss": 0.6751, "step": 74 }, { "epoch": 0.06134969325153374, "grad_norm": 0.7490167617797852, "learning_rate": 6.664473684210527e-05, "loss": 0.7576, "step": 75 }, { "epoch": 0.06216768916155419, "grad_norm": 0.7223999500274658, "learning_rate": 6.611157894736842e-05, "loss": 0.6025, "step": 76 }, { "epoch": 0.06298568507157465, "grad_norm": 0.6571590304374695, "learning_rate": 6.557842105263158e-05, "loss": 0.7033, "step": 77 }, { "epoch": 0.0638036809815951, "grad_norm": 0.5444533824920654, "learning_rate": 6.504526315789474e-05, "loss": 0.5138, "step": 78 }, { "epoch": 0.06462167689161555, "grad_norm": 0.5655301213264465, "learning_rate": 6.451210526315789e-05, "loss": 0.4987, "step": 79 }, { "epoch": 0.065439672801636, "grad_norm": 0.632288932800293, "learning_rate": 6.397894736842105e-05, "loss": 0.4189, "step": 80 }, { "epoch": 0.06625766871165645, "grad_norm": 0.7677326202392578, "learning_rate": 6.344578947368421e-05, "loss": 0.3, "step": 81 }, { "epoch": 0.0670756646216769, "grad_norm": 0.4775019884109497, "learning_rate": 6.291263157894737e-05, "loss": 0.3159, "step": 82 }, { "epoch": 0.06789366053169735, "grad_norm": 0.41117969155311584, "learning_rate": 6.237947368421053e-05, "loss": 0.317, "step": 83 }, { "epoch": 0.0687116564417178, "grad_norm": 0.1944044977426529, "learning_rate": 6.184631578947368e-05, "loss": 0.074, "step": 84 }, { "epoch": 0.06952965235173825, "grad_norm": 0.267262727022171, "learning_rate": 6.131315789473684e-05, "loss": 0.1004, "step": 85 }, { "epoch": 0.0703476482617587, "grad_norm": 0.4343488812446594, "learning_rate": 6.078e-05, "loss": 0.1652, "step": 86 }, { "epoch": 0.07116564417177915, "grad_norm": 0.016880128532648087, "learning_rate": 6.024684210526315e-05, "loss": 0.0009, "step": 87 }, { "epoch": 0.0719836400817996, "grad_norm": 0.22920526564121246, "learning_rate": 5.9713684210526305e-05, "loss": 0.0066, "step": 88 }, { "epoch": 0.07280163599182005, "grad_norm": 0.1432953178882599, "learning_rate": 5.918052631578947e-05, "loss": 0.002, "step": 89 }, { "epoch": 0.0736196319018405, "grad_norm": 0.015287825837731361, "learning_rate": 5.8647368421052634e-05, "loss": 0.0007, "step": 90 }, { "epoch": 0.07443762781186095, "grad_norm": 0.15962694585323334, "learning_rate": 5.811421052631579e-05, "loss": 0.0084, "step": 91 }, { "epoch": 0.0752556237218814, "grad_norm": 0.0835452675819397, "learning_rate": 5.758105263157894e-05, "loss": 0.0026, "step": 92 }, { "epoch": 0.07607361963190185, "grad_norm": 0.3013435900211334, "learning_rate": 5.70478947368421e-05, "loss": 0.0046, "step": 93 }, { "epoch": 0.0768916155419223, "grad_norm": 0.007186358794569969, "learning_rate": 5.6514736842105256e-05, "loss": 0.0004, "step": 94 }, { "epoch": 0.07770961145194274, "grad_norm": 0.0493503175675869, "learning_rate": 5.5981578947368424e-05, "loss": 0.0019, "step": 95 }, { "epoch": 0.0785276073619632, "grad_norm": 0.09706468880176544, "learning_rate": 5.544842105263158e-05, "loss": 0.0017, "step": 96 }, { "epoch": 0.07934560327198364, "grad_norm": 0.0098764318972826, "learning_rate": 5.491526315789474e-05, "loss": 0.0004, "step": 97 }, { "epoch": 0.0801635991820041, "grad_norm": 0.16427290439605713, "learning_rate": 5.438210526315789e-05, "loss": 0.0069, "step": 98 }, { "epoch": 0.08098159509202454, "grad_norm": 0.02355344407260418, "learning_rate": 5.384894736842105e-05, "loss": 0.0006, "step": 99 }, { "epoch": 0.081799591002045, "grad_norm": 0.022215725854039192, "learning_rate": 5.331578947368421e-05, "loss": 0.0007, "step": 100 }, { "epoch": 0.081799591002045, "eval_loss": 0.2767239809036255, "eval_runtime": 164.6277, "eval_samples_per_second": 3.128, "eval_steps_per_second": 0.784, "step": 100 }, { "epoch": 0.08261758691206544, "grad_norm": 2.208616256713867, "learning_rate": 5.278263157894736e-05, "loss": 0.8554, "step": 101 }, { "epoch": 0.0834355828220859, "grad_norm": 0.6053175330162048, "learning_rate": 5.224947368421053e-05, "loss": 0.8102, "step": 102 }, { "epoch": 0.08425357873210634, "grad_norm": 0.443279892206192, "learning_rate": 5.171631578947368e-05, "loss": 0.6613, "step": 103 }, { "epoch": 0.0850715746421268, "grad_norm": 0.5657853484153748, "learning_rate": 5.1183157894736844e-05, "loss": 0.904, "step": 104 }, { "epoch": 0.08588957055214724, "grad_norm": 0.5122092366218567, "learning_rate": 5.065e-05, "loss": 0.7585, "step": 105 }, { "epoch": 0.08670756646216769, "grad_norm": 0.4323250353336334, "learning_rate": 5.011684210526315e-05, "loss": 0.6443, "step": 106 }, { "epoch": 0.08752556237218814, "grad_norm": 0.4903777539730072, "learning_rate": 4.958368421052631e-05, "loss": 0.8381, "step": 107 }, { "epoch": 0.08834355828220859, "grad_norm": 0.49713271856307983, "learning_rate": 4.9050526315789473e-05, "loss": 0.6974, "step": 108 }, { "epoch": 0.08916155419222904, "grad_norm": 0.5440729260444641, "learning_rate": 4.851736842105263e-05, "loss": 0.7326, "step": 109 }, { "epoch": 0.08997955010224949, "grad_norm": 0.5971400737762451, "learning_rate": 4.798421052631579e-05, "loss": 0.8564, "step": 110 }, { "epoch": 0.09079754601226994, "grad_norm": 0.4081849157810211, "learning_rate": 4.745105263157895e-05, "loss": 0.5119, "step": 111 }, { "epoch": 0.09161554192229039, "grad_norm": 0.42180880904197693, "learning_rate": 4.69178947368421e-05, "loss": 0.479, "step": 112 }, { "epoch": 0.09243353783231084, "grad_norm": 0.48688748478889465, "learning_rate": 4.638473684210526e-05, "loss": 0.5893, "step": 113 }, { "epoch": 0.09325153374233129, "grad_norm": 0.3098672926425934, "learning_rate": 4.585157894736842e-05, "loss": 0.3024, "step": 114 }, { "epoch": 0.09406952965235174, "grad_norm": 0.3973608911037445, "learning_rate": 4.531842105263158e-05, "loss": 0.4749, "step": 115 }, { "epoch": 0.09488752556237219, "grad_norm": 0.45003950595855713, "learning_rate": 4.478526315789473e-05, "loss": 0.5641, "step": 116 }, { "epoch": 0.09570552147239264, "grad_norm": 0.4103372395038605, "learning_rate": 4.425210526315789e-05, "loss": 0.3926, "step": 117 }, { "epoch": 0.09652351738241309, "grad_norm": 0.5700619220733643, "learning_rate": 4.3718947368421054e-05, "loss": 0.7772, "step": 118 }, { "epoch": 0.09734151329243354, "grad_norm": 0.5302966833114624, "learning_rate": 4.318578947368421e-05, "loss": 0.5547, "step": 119 }, { "epoch": 0.09815950920245399, "grad_norm": 0.5432780981063843, "learning_rate": 4.265263157894736e-05, "loss": 0.7294, "step": 120 }, { "epoch": 0.09897750511247444, "grad_norm": 0.5698531866073608, "learning_rate": 4.211947368421052e-05, "loss": 0.6319, "step": 121 }, { "epoch": 0.09979550102249489, "grad_norm": 0.4369616210460663, "learning_rate": 4.1586315789473684e-05, "loss": 0.4439, "step": 122 }, { "epoch": 0.10061349693251534, "grad_norm": 0.5331543684005737, "learning_rate": 4.105315789473684e-05, "loss": 0.5258, "step": 123 }, { "epoch": 0.10143149284253579, "grad_norm": 0.5208017230033875, "learning_rate": 4.052e-05, "loss": 0.5332, "step": 124 }, { "epoch": 0.10224948875255624, "grad_norm": 0.6325660943984985, "learning_rate": 3.998684210526316e-05, "loss": 0.7877, "step": 125 }, { "epoch": 0.10306748466257669, "grad_norm": 0.8065600991249084, "learning_rate": 3.945368421052631e-05, "loss": 0.8469, "step": 126 }, { "epoch": 0.10388548057259714, "grad_norm": 0.5705050230026245, "learning_rate": 3.892052631578947e-05, "loss": 0.5351, "step": 127 }, { "epoch": 0.10470347648261759, "grad_norm": 0.60113525390625, "learning_rate": 3.838736842105263e-05, "loss": 0.5405, "step": 128 }, { "epoch": 0.10552147239263804, "grad_norm": 0.5440464019775391, "learning_rate": 3.785421052631579e-05, "loss": 0.5673, "step": 129 }, { "epoch": 0.10633946830265849, "grad_norm": 0.6333761215209961, "learning_rate": 3.732105263157894e-05, "loss": 0.7063, "step": 130 }, { "epoch": 0.10715746421267894, "grad_norm": 0.3166744112968445, "learning_rate": 3.67878947368421e-05, "loss": 0.2672, "step": 131 }, { "epoch": 0.10797546012269939, "grad_norm": 0.47239211201667786, "learning_rate": 3.6254736842105264e-05, "loss": 0.2944, "step": 132 }, { "epoch": 0.10879345603271984, "grad_norm": 0.6278908252716064, "learning_rate": 3.572157894736842e-05, "loss": 0.7175, "step": 133 }, { "epoch": 0.10961145194274029, "grad_norm": 6.725371837615967, "learning_rate": 3.518842105263158e-05, "loss": 0.5899, "step": 134 }, { "epoch": 0.11042944785276074, "grad_norm": 0.3569982945919037, "learning_rate": 3.465526315789473e-05, "loss": 0.2153, "step": 135 }, { "epoch": 0.11124744376278119, "grad_norm": 0.8054752945899963, "learning_rate": 3.4122105263157894e-05, "loss": 0.6044, "step": 136 }, { "epoch": 0.11206543967280164, "grad_norm": 0.37957316637039185, "learning_rate": 3.358894736842105e-05, "loss": 0.2575, "step": 137 }, { "epoch": 0.11288343558282209, "grad_norm": 0.33962732553482056, "learning_rate": 3.305578947368421e-05, "loss": 0.1472, "step": 138 }, { "epoch": 0.11370143149284254, "grad_norm": 0.1770872324705124, "learning_rate": 3.252263157894737e-05, "loss": 0.008, "step": 139 }, { "epoch": 0.11451942740286299, "grad_norm": 0.09861530363559723, "learning_rate": 3.198947368421052e-05, "loss": 0.0033, "step": 140 }, { "epoch": 0.11533742331288344, "grad_norm": 0.1032843068242073, "learning_rate": 3.1456315789473684e-05, "loss": 0.0027, "step": 141 }, { "epoch": 0.11615541922290389, "grad_norm": 0.11989390105009079, "learning_rate": 3.092315789473684e-05, "loss": 0.0018, "step": 142 }, { "epoch": 0.11697341513292434, "grad_norm": 0.01104150153696537, "learning_rate": 3.039e-05, "loss": 0.0004, "step": 143 }, { "epoch": 0.11779141104294479, "grad_norm": 0.01930282451212406, "learning_rate": 2.9856842105263153e-05, "loss": 0.0005, "step": 144 }, { "epoch": 0.11860940695296524, "grad_norm": 0.15893378853797913, "learning_rate": 2.9323684210526317e-05, "loss": 0.0031, "step": 145 }, { "epoch": 0.11942740286298568, "grad_norm": 0.03184283524751663, "learning_rate": 2.879052631578947e-05, "loss": 0.0009, "step": 146 }, { "epoch": 0.12024539877300613, "grad_norm": 0.06102335825562477, "learning_rate": 2.8257368421052628e-05, "loss": 0.0013, "step": 147 }, { "epoch": 0.12106339468302658, "grad_norm": 0.0802997276186943, "learning_rate": 2.772421052631579e-05, "loss": 0.0015, "step": 148 }, { "epoch": 0.12188139059304703, "grad_norm": 0.02622400037944317, "learning_rate": 2.7191052631578946e-05, "loss": 0.0005, "step": 149 }, { "epoch": 0.12269938650306748, "grad_norm": 0.019685884937644005, "learning_rate": 2.6657894736842104e-05, "loss": 0.0006, "step": 150 }, { "epoch": 0.12269938650306748, "eval_loss": 0.22924812138080597, "eval_runtime": 164.6274, "eval_samples_per_second": 3.128, "eval_steps_per_second": 0.784, "step": 150 }, { "epoch": 0.12351738241308793, "grad_norm": 0.42591241002082825, "learning_rate": 2.6124736842105265e-05, "loss": 0.847, "step": 151 }, { "epoch": 0.12433537832310838, "grad_norm": 0.37749016284942627, "learning_rate": 2.5591578947368422e-05, "loss": 0.643, "step": 152 }, { "epoch": 0.12515337423312883, "grad_norm": 7.373504638671875, "learning_rate": 2.5058421052631576e-05, "loss": 1.1166, "step": 153 }, { "epoch": 0.1259713701431493, "grad_norm": 0.4218708276748657, "learning_rate": 2.4525263157894737e-05, "loss": 0.5926, "step": 154 }, { "epoch": 0.12678936605316973, "grad_norm": 0.415558397769928, "learning_rate": 2.3992105263157894e-05, "loss": 0.539, "step": 155 }, { "epoch": 0.1276073619631902, "grad_norm": 0.367865651845932, "learning_rate": 2.345894736842105e-05, "loss": 0.4419, "step": 156 }, { "epoch": 0.12842535787321063, "grad_norm": 0.47910165786743164, "learning_rate": 2.292578947368421e-05, "loss": 0.72, "step": 157 }, { "epoch": 0.1292433537832311, "grad_norm": 0.547726035118103, "learning_rate": 2.2392631578947366e-05, "loss": 0.7997, "step": 158 }, { "epoch": 0.13006134969325153, "grad_norm": 0.49771469831466675, "learning_rate": 2.1859473684210527e-05, "loss": 0.7018, "step": 159 }, { "epoch": 0.130879345603272, "grad_norm": 0.6116258502006531, "learning_rate": 2.132631578947368e-05, "loss": 0.907, "step": 160 }, { "epoch": 0.13169734151329243, "grad_norm": 0.5866420865058899, "learning_rate": 2.0793157894736842e-05, "loss": 0.6216, "step": 161 }, { "epoch": 0.1325153374233129, "grad_norm": 0.6702650785446167, "learning_rate": 2.026e-05, "loss": 0.6452, "step": 162 }, { "epoch": 0.13333333333333333, "grad_norm": 0.5496453046798706, "learning_rate": 1.9726842105263157e-05, "loss": 0.6332, "step": 163 }, { "epoch": 0.1341513292433538, "grad_norm": 0.37603631615638733, "learning_rate": 1.9193684210526314e-05, "loss": 0.3876, "step": 164 }, { "epoch": 0.13496932515337423, "grad_norm": 0.4126095473766327, "learning_rate": 1.866052631578947e-05, "loss": 0.4168, "step": 165 }, { "epoch": 0.1357873210633947, "grad_norm": 0.6646726131439209, "learning_rate": 1.8127368421052632e-05, "loss": 0.8142, "step": 166 }, { "epoch": 0.13660531697341513, "grad_norm": 0.3889711797237396, "learning_rate": 1.759421052631579e-05, "loss": 0.4465, "step": 167 }, { "epoch": 0.1374233128834356, "grad_norm": 0.44813233613967896, "learning_rate": 1.7061052631578947e-05, "loss": 0.4387, "step": 168 }, { "epoch": 0.13824130879345603, "grad_norm": 0.4522460699081421, "learning_rate": 1.6527894736842104e-05, "loss": 0.4947, "step": 169 }, { "epoch": 0.1390593047034765, "grad_norm": 0.5248112082481384, "learning_rate": 1.599473684210526e-05, "loss": 0.5836, "step": 170 }, { "epoch": 0.13987730061349693, "grad_norm": 0.6201620697975159, "learning_rate": 1.546157894736842e-05, "loss": 0.8913, "step": 171 }, { "epoch": 0.1406952965235174, "grad_norm": 0.565233051776886, "learning_rate": 1.4928421052631576e-05, "loss": 0.5188, "step": 172 }, { "epoch": 0.14151329243353783, "grad_norm": 0.4735608994960785, "learning_rate": 1.4395263157894735e-05, "loss": 0.3935, "step": 173 }, { "epoch": 0.1423312883435583, "grad_norm": 0.6751230955123901, "learning_rate": 1.3862105263157895e-05, "loss": 0.7061, "step": 174 }, { "epoch": 0.14314928425357873, "grad_norm": 0.5896326303482056, "learning_rate": 1.3328947368421052e-05, "loss": 0.6753, "step": 175 }, { "epoch": 0.1439672801635992, "grad_norm": 0.46921586990356445, "learning_rate": 1.2795789473684211e-05, "loss": 0.4023, "step": 176 }, { "epoch": 0.14478527607361963, "grad_norm": 0.8884191513061523, "learning_rate": 1.2262631578947368e-05, "loss": 0.5684, "step": 177 }, { "epoch": 0.1456032719836401, "grad_norm": 0.34956350922584534, "learning_rate": 1.1729473684210526e-05, "loss": 0.1983, "step": 178 }, { "epoch": 0.14642126789366053, "grad_norm": 0.6150217056274414, "learning_rate": 1.1196315789473683e-05, "loss": 0.6407, "step": 179 }, { "epoch": 0.147239263803681, "grad_norm": 0.41867491602897644, "learning_rate": 1.066315789473684e-05, "loss": 0.3322, "step": 180 }, { "epoch": 0.14805725971370143, "grad_norm": 0.6983818411827087, "learning_rate": 1.013e-05, "loss": 0.7427, "step": 181 }, { "epoch": 0.1488752556237219, "grad_norm": 0.7040908336639404, "learning_rate": 9.596842105263157e-06, "loss": 0.6962, "step": 182 }, { "epoch": 0.14969325153374233, "grad_norm": 0.6211279630661011, "learning_rate": 9.063684210526316e-06, "loss": 0.5084, "step": 183 }, { "epoch": 0.1505112474437628, "grad_norm": 0.4377440810203552, "learning_rate": 8.530526315789473e-06, "loss": 0.226, "step": 184 }, { "epoch": 0.15132924335378323, "grad_norm": 0.1414121389389038, "learning_rate": 7.99736842105263e-06, "loss": 0.052, "step": 185 }, { "epoch": 0.1521472392638037, "grad_norm": 0.06823990494012833, "learning_rate": 7.464210526315788e-06, "loss": 0.0031, "step": 186 }, { "epoch": 0.15296523517382413, "grad_norm": 0.013092203065752983, "learning_rate": 6.931052631578947e-06, "loss": 0.0006, "step": 187 }, { "epoch": 0.1537832310838446, "grad_norm": 0.19664902985095978, "learning_rate": 6.3978947368421055e-06, "loss": 0.0032, "step": 188 }, { "epoch": 0.15460122699386503, "grad_norm": 0.016594722867012024, "learning_rate": 5.864736842105263e-06, "loss": 0.0006, "step": 189 }, { "epoch": 0.1554192229038855, "grad_norm": 0.009124848060309887, "learning_rate": 5.33157894736842e-06, "loss": 0.0005, "step": 190 }, { "epoch": 0.15623721881390593, "grad_norm": 0.10051782429218292, "learning_rate": 4.7984210526315785e-06, "loss": 0.0042, "step": 191 }, { "epoch": 0.1570552147239264, "grad_norm": 0.0568082332611084, "learning_rate": 4.265263157894737e-06, "loss": 0.0029, "step": 192 }, { "epoch": 0.15787321063394683, "grad_norm": 0.09655116498470306, "learning_rate": 3.732105263157894e-06, "loss": 0.0036, "step": 193 }, { "epoch": 0.1586912065439673, "grad_norm": 0.17335928976535797, "learning_rate": 3.1989473684210527e-06, "loss": 0.007, "step": 194 }, { "epoch": 0.15950920245398773, "grad_norm": 0.008331399410963058, "learning_rate": 2.66578947368421e-06, "loss": 0.0005, "step": 195 }, { "epoch": 0.1603271983640082, "grad_norm": 0.1623714417219162, "learning_rate": 2.1326315789473684e-06, "loss": 0.0043, "step": 196 }, { "epoch": 0.16114519427402862, "grad_norm": 0.05319036543369293, "learning_rate": 1.5994736842105264e-06, "loss": 0.0018, "step": 197 }, { "epoch": 0.1619631901840491, "grad_norm": 0.006065657362341881, "learning_rate": 1.0663157894736842e-06, "loss": 0.0004, "step": 198 }, { "epoch": 0.16278118609406952, "grad_norm": 0.010582847520709038, "learning_rate": 5.331578947368421e-07, "loss": 0.0005, "step": 199 }, { "epoch": 0.16359918200409, "grad_norm": 0.026805628091096878, "learning_rate": 0.0, "loss": 0.001, "step": 200 }, { "epoch": 0.16359918200409, "eval_loss": 0.21579746901988983, "eval_runtime": 165.9203, "eval_samples_per_second": 3.104, "eval_steps_per_second": 0.777, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.484049111154688e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }