{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.026314635323012148, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.771545107670716e-05, "grad_norm": 28.291993022356824, "learning_rate": 4.385964912280702e-08, "loss": 0.9764, "step": 1 }, { "epoch": 0.00017543090215341433, "grad_norm": 11.00431285069151, "learning_rate": 8.771929824561404e-08, "loss": 0.7373, "step": 2 }, { "epoch": 0.0002631463532301215, "grad_norm": 19.575902791602918, "learning_rate": 1.3157894736842107e-07, "loss": 0.92, "step": 3 }, { "epoch": 0.00035086180430682866, "grad_norm": 28.862884630243123, "learning_rate": 1.7543859649122808e-07, "loss": 0.9196, "step": 4 }, { "epoch": 0.0004385772553835358, "grad_norm": 15.982248327528751, "learning_rate": 2.192982456140351e-07, "loss": 0.8366, "step": 5 }, { "epoch": 0.000526292706460243, "grad_norm": 31.85723876161732, "learning_rate": 2.6315789473684213e-07, "loss": 0.9335, "step": 6 }, { "epoch": 0.0006140081575369502, "grad_norm": 21.310207454796295, "learning_rate": 3.070175438596491e-07, "loss": 0.8362, "step": 7 }, { "epoch": 0.0007017236086136573, "grad_norm": 20.052830776823505, "learning_rate": 3.5087719298245616e-07, "loss": 0.8015, "step": 8 }, { "epoch": 0.0007894390596903645, "grad_norm": 16.06788143210757, "learning_rate": 3.9473684210526315e-07, "loss": 0.8729, "step": 9 }, { "epoch": 0.0008771545107670716, "grad_norm": 29.100726513914584, "learning_rate": 4.385964912280702e-07, "loss": 0.9058, "step": 10 }, { "epoch": 0.0009648699618437788, "grad_norm": 13.993390572028792, "learning_rate": 4.824561403508772e-07, "loss": 0.7093, "step": 11 }, { "epoch": 0.001052585412920486, "grad_norm": 21.107935511000072, "learning_rate": 5.263157894736843e-07, "loss": 0.8955, "step": 12 }, { "epoch": 0.0011403008639971931, "grad_norm": 13.66193898339087, "learning_rate": 5.701754385964912e-07, "loss": 0.7219, "step": 13 }, { "epoch": 0.0012280163150739003, "grad_norm": 10.537203866107753, "learning_rate": 6.140350877192982e-07, "loss": 0.8429, "step": 14 }, { "epoch": 0.0013157317661506075, "grad_norm": 12.393106853157317, "learning_rate": 6.578947368421053e-07, "loss": 0.6708, "step": 15 }, { "epoch": 0.0014034472172273146, "grad_norm": 8.734604355126535, "learning_rate": 7.017543859649123e-07, "loss": 0.6507, "step": 16 }, { "epoch": 0.0014911626683040218, "grad_norm": 9.124362491394539, "learning_rate": 7.456140350877194e-07, "loss": 0.838, "step": 17 }, { "epoch": 0.001578878119380729, "grad_norm": 8.958389642999963, "learning_rate": 7.894736842105263e-07, "loss": 0.6849, "step": 18 }, { "epoch": 0.0016665935704574361, "grad_norm": 11.542677492312867, "learning_rate": 8.333333333333333e-07, "loss": 0.6926, "step": 19 }, { "epoch": 0.0017543090215341433, "grad_norm": 8.045066225626593, "learning_rate": 8.771929824561404e-07, "loss": 0.7006, "step": 20 }, { "epoch": 0.0018420244726108505, "grad_norm": 8.146906074379428, "learning_rate": 9.210526315789474e-07, "loss": 0.6737, "step": 21 }, { "epoch": 0.0019297399236875576, "grad_norm": 6.502955757535831, "learning_rate": 9.649122807017545e-07, "loss": 0.7495, "step": 22 }, { "epoch": 0.002017455374764265, "grad_norm": 8.736982858234592, "learning_rate": 1.0087719298245615e-06, "loss": 0.7324, "step": 23 }, { "epoch": 0.002105170825840972, "grad_norm": 7.851959741269017, "learning_rate": 1.0526315789473685e-06, "loss": 0.6686, "step": 24 }, { "epoch": 0.002192886276917679, "grad_norm": 8.594840793358543, "learning_rate": 1.0964912280701756e-06, "loss": 0.8064, "step": 25 }, { "epoch": 0.0022806017279943863, "grad_norm": 8.935665287337994, "learning_rate": 1.1403508771929824e-06, "loss": 0.6751, "step": 26 }, { "epoch": 0.0023683171790710934, "grad_norm": 11.146850280588064, "learning_rate": 1.1842105263157894e-06, "loss": 0.7884, "step": 27 }, { "epoch": 0.0024560326301478006, "grad_norm": 6.917869007862471, "learning_rate": 1.2280701754385965e-06, "loss": 0.8772, "step": 28 }, { "epoch": 0.0025437480812245078, "grad_norm": 9.32145567192897, "learning_rate": 1.2719298245614037e-06, "loss": 0.6486, "step": 29 }, { "epoch": 0.002631463532301215, "grad_norm": 7.83399807213587, "learning_rate": 1.3157894736842106e-06, "loss": 0.7793, "step": 30 }, { "epoch": 0.002719178983377922, "grad_norm": 5.701851482721999, "learning_rate": 1.3596491228070178e-06, "loss": 0.6418, "step": 31 }, { "epoch": 0.0028068944344546293, "grad_norm": 6.357569510522249, "learning_rate": 1.4035087719298246e-06, "loss": 0.7803, "step": 32 }, { "epoch": 0.0028946098855313364, "grad_norm": 6.1458878660724, "learning_rate": 1.4473684210526317e-06, "loss": 0.6075, "step": 33 }, { "epoch": 0.0029823253366080436, "grad_norm": 5.258525934759675, "learning_rate": 1.4912280701754387e-06, "loss": 0.7558, "step": 34 }, { "epoch": 0.0030700407876847508, "grad_norm": 5.96497463401995, "learning_rate": 1.5350877192982458e-06, "loss": 0.5807, "step": 35 }, { "epoch": 0.003157756238761458, "grad_norm": 9.97378904781871, "learning_rate": 1.5789473684210526e-06, "loss": 0.6766, "step": 36 }, { "epoch": 0.003245471689838165, "grad_norm": 10.558130153122322, "learning_rate": 1.6228070175438598e-06, "loss": 0.6318, "step": 37 }, { "epoch": 0.0033331871409148723, "grad_norm": 7.730592682668347, "learning_rate": 1.6666666666666667e-06, "loss": 0.5723, "step": 38 }, { "epoch": 0.0034209025919915794, "grad_norm": 6.513997535111305, "learning_rate": 1.710526315789474e-06, "loss": 0.7381, "step": 39 }, { "epoch": 0.0035086180430682866, "grad_norm": 6.4186997859745185, "learning_rate": 1.7543859649122807e-06, "loss": 0.676, "step": 40 }, { "epoch": 0.0035963334941449938, "grad_norm": 4.789756704738587, "learning_rate": 1.798245614035088e-06, "loss": 0.8106, "step": 41 }, { "epoch": 0.003684048945221701, "grad_norm": 8.552415866186008, "learning_rate": 1.8421052631578948e-06, "loss": 0.7834, "step": 42 }, { "epoch": 0.003771764396298408, "grad_norm": 5.104236885105078, "learning_rate": 1.8859649122807019e-06, "loss": 0.6694, "step": 43 }, { "epoch": 0.0038594798473751152, "grad_norm": 6.998642641947579, "learning_rate": 1.929824561403509e-06, "loss": 0.7184, "step": 44 }, { "epoch": 0.003947195298451822, "grad_norm": 6.754484565741454, "learning_rate": 1.973684210526316e-06, "loss": 0.7682, "step": 45 }, { "epoch": 0.00403491074952853, "grad_norm": 5.702466747706841, "learning_rate": 2.017543859649123e-06, "loss": 0.7167, "step": 46 }, { "epoch": 0.004122626200605236, "grad_norm": 7.038100758557257, "learning_rate": 2.06140350877193e-06, "loss": 0.6709, "step": 47 }, { "epoch": 0.004210341651681944, "grad_norm": 8.659378609826204, "learning_rate": 2.105263157894737e-06, "loss": 0.6508, "step": 48 }, { "epoch": 0.004298057102758651, "grad_norm": 9.315174303463822, "learning_rate": 2.149122807017544e-06, "loss": 0.6168, "step": 49 }, { "epoch": 0.004385772553835358, "grad_norm": 7.447716885721135, "learning_rate": 2.192982456140351e-06, "loss": 0.6738, "step": 50 }, { "epoch": 0.004473488004912065, "grad_norm": 5.600770404460154, "learning_rate": 2.236842105263158e-06, "loss": 0.6311, "step": 51 }, { "epoch": 0.004561203455988773, "grad_norm": 7.059691201242354, "learning_rate": 2.280701754385965e-06, "loss": 0.7204, "step": 52 }, { "epoch": 0.004648918907065479, "grad_norm": 5.589092290239263, "learning_rate": 2.324561403508772e-06, "loss": 0.7266, "step": 53 }, { "epoch": 0.004736634358142187, "grad_norm": 5.801762781587569, "learning_rate": 2.368421052631579e-06, "loss": 0.5336, "step": 54 }, { "epoch": 0.004824349809218894, "grad_norm": 5.599754768073974, "learning_rate": 2.412280701754386e-06, "loss": 0.6338, "step": 55 }, { "epoch": 0.004912065260295601, "grad_norm": 5.66437398031977, "learning_rate": 2.456140350877193e-06, "loss": 0.7813, "step": 56 }, { "epoch": 0.004999780711372308, "grad_norm": 6.32022790188225, "learning_rate": 2.5e-06, "loss": 0.6613, "step": 57 }, { "epoch": 0.0050874961624490156, "grad_norm": 8.01474270706056, "learning_rate": 2.5438596491228075e-06, "loss": 0.6451, "step": 58 }, { "epoch": 0.005175211613525722, "grad_norm": 6.586182462850705, "learning_rate": 2.5877192982456147e-06, "loss": 0.6984, "step": 59 }, { "epoch": 0.00526292706460243, "grad_norm": 5.61553252576188, "learning_rate": 2.631578947368421e-06, "loss": 0.5773, "step": 60 }, { "epoch": 0.005350642515679137, "grad_norm": 5.5274818204706895, "learning_rate": 2.6754385964912284e-06, "loss": 0.6083, "step": 61 }, { "epoch": 0.005438357966755844, "grad_norm": 3.8762804528384254, "learning_rate": 2.7192982456140356e-06, "loss": 0.7174, "step": 62 }, { "epoch": 0.005526073417832551, "grad_norm": 5.248404081335598, "learning_rate": 2.7631578947368424e-06, "loss": 0.7066, "step": 63 }, { "epoch": 0.0056137888689092585, "grad_norm": 7.214109517049078, "learning_rate": 2.8070175438596493e-06, "loss": 0.692, "step": 64 }, { "epoch": 0.005701504319985965, "grad_norm": 5.429278596290352, "learning_rate": 2.8508771929824565e-06, "loss": 0.6145, "step": 65 }, { "epoch": 0.005789219771062673, "grad_norm": 17.638205100824422, "learning_rate": 2.8947368421052634e-06, "loss": 0.7677, "step": 66 }, { "epoch": 0.00587693522213938, "grad_norm": 5.677374136021176, "learning_rate": 2.9385964912280706e-06, "loss": 0.6779, "step": 67 }, { "epoch": 0.005964650673216087, "grad_norm": 5.453107411280262, "learning_rate": 2.9824561403508774e-06, "loss": 0.6428, "step": 68 }, { "epoch": 0.006052366124292794, "grad_norm": 5.888626008478417, "learning_rate": 3.0263157894736843e-06, "loss": 0.6342, "step": 69 }, { "epoch": 0.0061400815753695015, "grad_norm": 5.3185045733144225, "learning_rate": 3.0701754385964915e-06, "loss": 0.5644, "step": 70 }, { "epoch": 0.006227797026446208, "grad_norm": 4.902919731780363, "learning_rate": 3.1140350877192988e-06, "loss": 0.709, "step": 71 }, { "epoch": 0.006315512477522916, "grad_norm": 8.773622618503456, "learning_rate": 3.157894736842105e-06, "loss": 0.6674, "step": 72 }, { "epoch": 0.006403227928599623, "grad_norm": 6.7570883776978174, "learning_rate": 3.2017543859649124e-06, "loss": 0.6918, "step": 73 }, { "epoch": 0.00649094337967633, "grad_norm": 5.597179964370573, "learning_rate": 3.2456140350877197e-06, "loss": 0.7119, "step": 74 }, { "epoch": 0.006578658830753037, "grad_norm": 5.4824260737552795, "learning_rate": 3.289473684210527e-06, "loss": 0.5667, "step": 75 }, { "epoch": 0.0066663742818297445, "grad_norm": 6.083422094529157, "learning_rate": 3.3333333333333333e-06, "loss": 0.5972, "step": 76 }, { "epoch": 0.006754089732906451, "grad_norm": 6.688559230122185, "learning_rate": 3.3771929824561406e-06, "loss": 0.6079, "step": 77 }, { "epoch": 0.006841805183983159, "grad_norm": 4.675152512564395, "learning_rate": 3.421052631578948e-06, "loss": 0.6431, "step": 78 }, { "epoch": 0.006929520635059866, "grad_norm": 6.61824094926871, "learning_rate": 3.464912280701755e-06, "loss": 0.7219, "step": 79 }, { "epoch": 0.007017236086136573, "grad_norm": 4.3090639659166685, "learning_rate": 3.5087719298245615e-06, "loss": 0.6267, "step": 80 }, { "epoch": 0.00710495153721328, "grad_norm": 5.908526205124108, "learning_rate": 3.5526315789473687e-06, "loss": 0.5598, "step": 81 }, { "epoch": 0.0071926669882899875, "grad_norm": 4.954945711406169, "learning_rate": 3.596491228070176e-06, "loss": 0.6251, "step": 82 }, { "epoch": 0.007280382439366694, "grad_norm": 6.403352381905709, "learning_rate": 3.640350877192983e-06, "loss": 0.6921, "step": 83 }, { "epoch": 0.007368097890443402, "grad_norm": 5.8960340556018505, "learning_rate": 3.6842105263157896e-06, "loss": 0.5803, "step": 84 }, { "epoch": 0.007455813341520109, "grad_norm": 5.5832723717085795, "learning_rate": 3.728070175438597e-06, "loss": 0.7109, "step": 85 }, { "epoch": 0.007543528792596816, "grad_norm": 6.9538610646678425, "learning_rate": 3.7719298245614037e-06, "loss": 0.57, "step": 86 }, { "epoch": 0.007631244243673523, "grad_norm": 4.9040721673618615, "learning_rate": 3.815789473684211e-06, "loss": 0.6681, "step": 87 }, { "epoch": 0.0077189596947502305, "grad_norm": 4.367227562952691, "learning_rate": 3.859649122807018e-06, "loss": 0.5881, "step": 88 }, { "epoch": 0.007806675145826937, "grad_norm": 6.135869823936115, "learning_rate": 3.903508771929825e-06, "loss": 0.6333, "step": 89 }, { "epoch": 0.007894390596903644, "grad_norm": 5.26232269598073, "learning_rate": 3.947368421052632e-06, "loss": 0.6228, "step": 90 }, { "epoch": 0.007982106047980352, "grad_norm": 5.478510766614749, "learning_rate": 3.991228070175439e-06, "loss": 0.6889, "step": 91 }, { "epoch": 0.00806982149905706, "grad_norm": 7.252221492478827, "learning_rate": 4.035087719298246e-06, "loss": 0.6726, "step": 92 }, { "epoch": 0.008157536950133767, "grad_norm": 6.810323867433885, "learning_rate": 4.078947368421053e-06, "loss": 0.6186, "step": 93 }, { "epoch": 0.008245252401210473, "grad_norm": 5.1477310672971965, "learning_rate": 4.12280701754386e-06, "loss": 0.6739, "step": 94 }, { "epoch": 0.00833296785228718, "grad_norm": 4.455009313283226, "learning_rate": 4.166666666666667e-06, "loss": 0.6676, "step": 95 }, { "epoch": 0.008420683303363888, "grad_norm": 4.854476484535793, "learning_rate": 4.210526315789474e-06, "loss": 0.624, "step": 96 }, { "epoch": 0.008508398754440595, "grad_norm": 8.775528791539337, "learning_rate": 4.254385964912281e-06, "loss": 0.7236, "step": 97 }, { "epoch": 0.008596114205517301, "grad_norm": 4.656928105654083, "learning_rate": 4.298245614035088e-06, "loss": 0.4853, "step": 98 }, { "epoch": 0.008683829656594009, "grad_norm": 6.1151229878888795, "learning_rate": 4.342105263157895e-06, "loss": 0.6611, "step": 99 }, { "epoch": 0.008771545107670716, "grad_norm": 4.846266795088099, "learning_rate": 4.385964912280702e-06, "loss": 0.6899, "step": 100 }, { "epoch": 0.008859260558747424, "grad_norm": 5.63076019856985, "learning_rate": 4.429824561403509e-06, "loss": 0.7394, "step": 101 }, { "epoch": 0.00894697600982413, "grad_norm": 6.152211661702361, "learning_rate": 4.473684210526316e-06, "loss": 0.6366, "step": 102 }, { "epoch": 0.009034691460900838, "grad_norm": 5.271237730819475, "learning_rate": 4.517543859649123e-06, "loss": 0.6776, "step": 103 }, { "epoch": 0.009122406911977545, "grad_norm": 6.150704296921181, "learning_rate": 4.56140350877193e-06, "loss": 0.7287, "step": 104 }, { "epoch": 0.009210122363054253, "grad_norm": 5.511353295743786, "learning_rate": 4.605263157894737e-06, "loss": 0.7156, "step": 105 }, { "epoch": 0.009297837814130959, "grad_norm": 5.651321362023493, "learning_rate": 4.649122807017544e-06, "loss": 0.5971, "step": 106 }, { "epoch": 0.009385553265207666, "grad_norm": 4.521052312786367, "learning_rate": 4.692982456140351e-06, "loss": 0.662, "step": 107 }, { "epoch": 0.009473268716284374, "grad_norm": 6.5893774516601775, "learning_rate": 4.736842105263158e-06, "loss": 0.6838, "step": 108 }, { "epoch": 0.009560984167361081, "grad_norm": 7.413604525506308, "learning_rate": 4.780701754385965e-06, "loss": 0.6798, "step": 109 }, { "epoch": 0.009648699618437787, "grad_norm": 5.258683042524991, "learning_rate": 4.824561403508772e-06, "loss": 0.7137, "step": 110 }, { "epoch": 0.009736415069514495, "grad_norm": 3.56629655229689, "learning_rate": 4.8684210526315795e-06, "loss": 0.5524, "step": 111 }, { "epoch": 0.009824130520591202, "grad_norm": 7.972594797604, "learning_rate": 4.912280701754386e-06, "loss": 0.7946, "step": 112 }, { "epoch": 0.00991184597166791, "grad_norm": 5.9169587346561965, "learning_rate": 4.956140350877193e-06, "loss": 0.6985, "step": 113 }, { "epoch": 0.009999561422744616, "grad_norm": 4.9028768240583895, "learning_rate": 5e-06, "loss": 0.7471, "step": 114 }, { "epoch": 0.010087276873821324, "grad_norm": 4.952040118758915, "learning_rate": 4.999999903143301e-06, "loss": 0.6645, "step": 115 }, { "epoch": 0.010174992324898031, "grad_norm": 5.307375041926707, "learning_rate": 4.999999612573212e-06, "loss": 0.6568, "step": 116 }, { "epoch": 0.010262707775974739, "grad_norm": 4.417210142946582, "learning_rate": 4.9999991282897545e-06, "loss": 0.6633, "step": 117 }, { "epoch": 0.010350423227051445, "grad_norm": 6.813103500844099, "learning_rate": 4.999998450292966e-06, "loss": 0.7479, "step": 118 }, { "epoch": 0.010438138678128152, "grad_norm": 5.220452049535287, "learning_rate": 4.9999975785829e-06, "loss": 0.5982, "step": 119 }, { "epoch": 0.01052585412920486, "grad_norm": 6.470241976711781, "learning_rate": 4.999996513159624e-06, "loss": 0.5915, "step": 120 }, { "epoch": 0.010613569580281567, "grad_norm": 5.236784827517624, "learning_rate": 4.99999525402322e-06, "loss": 0.665, "step": 121 }, { "epoch": 0.010701285031358273, "grad_norm": 5.5322906674158565, "learning_rate": 4.999993801173785e-06, "loss": 0.473, "step": 122 }, { "epoch": 0.01078900048243498, "grad_norm": 5.643434680672429, "learning_rate": 4.999992154611433e-06, "loss": 0.5802, "step": 123 }, { "epoch": 0.010876715933511688, "grad_norm": 4.909123022379139, "learning_rate": 4.9999903143362905e-06, "loss": 0.6103, "step": 124 }, { "epoch": 0.010964431384588396, "grad_norm": 7.046173121098522, "learning_rate": 4.999988280348501e-06, "loss": 0.6601, "step": 125 }, { "epoch": 0.011052146835665102, "grad_norm": 5.567754476589664, "learning_rate": 4.99998605264822e-06, "loss": 0.7144, "step": 126 }, { "epoch": 0.01113986228674181, "grad_norm": 6.670512866037107, "learning_rate": 4.999983631235623e-06, "loss": 0.5034, "step": 127 }, { "epoch": 0.011227577737818517, "grad_norm": 5.068760146843144, "learning_rate": 4.999981016110896e-06, "loss": 0.5965, "step": 128 }, { "epoch": 0.011315293188895225, "grad_norm": 5.493410339028754, "learning_rate": 4.999978207274243e-06, "loss": 0.6697, "step": 129 }, { "epoch": 0.01140300863997193, "grad_norm": 5.662089015796081, "learning_rate": 4.999975204725879e-06, "loss": 0.7182, "step": 130 }, { "epoch": 0.011490724091048638, "grad_norm": 3.734356064938746, "learning_rate": 4.999972008466039e-06, "loss": 0.632, "step": 131 }, { "epoch": 0.011578439542125346, "grad_norm": 4.29907687663725, "learning_rate": 4.99996861849497e-06, "loss": 0.6321, "step": 132 }, { "epoch": 0.011666154993202053, "grad_norm": 5.292963722155827, "learning_rate": 4.999965034812934e-06, "loss": 0.5768, "step": 133 }, { "epoch": 0.01175387044427876, "grad_norm": 4.564589196086129, "learning_rate": 4.99996125742021e-06, "loss": 0.5991, "step": 134 }, { "epoch": 0.011841585895355467, "grad_norm": 5.889974426321806, "learning_rate": 4.99995728631709e-06, "loss": 0.568, "step": 135 }, { "epoch": 0.011929301346432174, "grad_norm": 4.903556688362067, "learning_rate": 4.999953121503881e-06, "loss": 0.6221, "step": 136 }, { "epoch": 0.012017016797508882, "grad_norm": 4.652137494582458, "learning_rate": 4.999948762980906e-06, "loss": 0.6499, "step": 137 }, { "epoch": 0.012104732248585588, "grad_norm": 7.2681565015460965, "learning_rate": 4.999944210748504e-06, "loss": 0.7997, "step": 138 }, { "epoch": 0.012192447699662295, "grad_norm": 4.498966830496647, "learning_rate": 4.999939464807027e-06, "loss": 0.7033, "step": 139 }, { "epoch": 0.012280163150739003, "grad_norm": 5.658829625864849, "learning_rate": 4.999934525156842e-06, "loss": 0.6234, "step": 140 }, { "epoch": 0.01236787860181571, "grad_norm": 6.170987539440289, "learning_rate": 4.9999293917983325e-06, "loss": 0.7359, "step": 141 }, { "epoch": 0.012455594052892417, "grad_norm": 4.889450035742974, "learning_rate": 4.999924064731896e-06, "loss": 0.6418, "step": 142 }, { "epoch": 0.012543309503969124, "grad_norm": 5.565665252735285, "learning_rate": 4.9999185439579445e-06, "loss": 0.8114, "step": 143 }, { "epoch": 0.012631024955045832, "grad_norm": 5.009655972578068, "learning_rate": 4.9999128294769075e-06, "loss": 0.7307, "step": 144 }, { "epoch": 0.01271874040612254, "grad_norm": 5.011444448419762, "learning_rate": 4.999906921289227e-06, "loss": 0.6434, "step": 145 }, { "epoch": 0.012806455857199245, "grad_norm": 5.91290249112379, "learning_rate": 4.999900819395361e-06, "loss": 0.7576, "step": 146 }, { "epoch": 0.012894171308275953, "grad_norm": 5.291827066915767, "learning_rate": 4.9998945237957814e-06, "loss": 0.717, "step": 147 }, { "epoch": 0.01298188675935266, "grad_norm": 6.889695918810895, "learning_rate": 4.9998880344909765e-06, "loss": 0.6566, "step": 148 }, { "epoch": 0.013069602210429368, "grad_norm": 4.139725258131711, "learning_rate": 4.999881351481449e-06, "loss": 0.6139, "step": 149 }, { "epoch": 0.013157317661506074, "grad_norm": 5.041147601092224, "learning_rate": 4.999874474767718e-06, "loss": 0.7046, "step": 150 }, { "epoch": 0.013245033112582781, "grad_norm": 4.850191233243735, "learning_rate": 4.999867404350315e-06, "loss": 0.6494, "step": 151 }, { "epoch": 0.013332748563659489, "grad_norm": 5.608814210289025, "learning_rate": 4.999860140229788e-06, "loss": 0.8654, "step": 152 }, { "epoch": 0.013420464014736197, "grad_norm": 4.097824317856954, "learning_rate": 4.9998526824067e-06, "loss": 0.6889, "step": 153 }, { "epoch": 0.013508179465812903, "grad_norm": 6.425927321695068, "learning_rate": 4.999845030881629e-06, "loss": 0.5837, "step": 154 }, { "epoch": 0.01359589491688961, "grad_norm": 7.686652681051417, "learning_rate": 4.999837185655168e-06, "loss": 0.6869, "step": 155 }, { "epoch": 0.013683610367966318, "grad_norm": 6.199666417167642, "learning_rate": 4.9998291467279245e-06, "loss": 0.7371, "step": 156 }, { "epoch": 0.013771325819043024, "grad_norm": 6.797879751043678, "learning_rate": 4.999820914100522e-06, "loss": 0.6912, "step": 157 }, { "epoch": 0.013859041270119731, "grad_norm": 9.837640179642968, "learning_rate": 4.999812487773597e-06, "loss": 0.8045, "step": 158 }, { "epoch": 0.013946756721196439, "grad_norm": 6.620454193744729, "learning_rate": 4.9998038677478044e-06, "loss": 0.6018, "step": 159 }, { "epoch": 0.014034472172273146, "grad_norm": 4.952380418390811, "learning_rate": 4.99979505402381e-06, "loss": 0.5851, "step": 160 }, { "epoch": 0.014122187623349852, "grad_norm": 4.571346505498035, "learning_rate": 4.999786046602299e-06, "loss": 0.6633, "step": 161 }, { "epoch": 0.01420990307442656, "grad_norm": 6.745466717777739, "learning_rate": 4.999776845483968e-06, "loss": 0.714, "step": 162 }, { "epoch": 0.014297618525503267, "grad_norm": 4.888639355192875, "learning_rate": 4.999767450669531e-06, "loss": 0.5328, "step": 163 }, { "epoch": 0.014385333976579975, "grad_norm": 5.263414218540685, "learning_rate": 4.999757862159713e-06, "loss": 0.6746, "step": 164 }, { "epoch": 0.014473049427656681, "grad_norm": 5.8723140369149895, "learning_rate": 4.99974807995526e-06, "loss": 0.7101, "step": 165 }, { "epoch": 0.014560764878733388, "grad_norm": 4.125348885535371, "learning_rate": 4.999738104056931e-06, "loss": 0.6418, "step": 166 }, { "epoch": 0.014648480329810096, "grad_norm": 5.079939786355144, "learning_rate": 4.999727934465495e-06, "loss": 0.6757, "step": 167 }, { "epoch": 0.014736195780886804, "grad_norm": 4.436648943550616, "learning_rate": 4.999717571181742e-06, "loss": 0.6878, "step": 168 }, { "epoch": 0.01482391123196351, "grad_norm": 4.6070293178483706, "learning_rate": 4.999707014206475e-06, "loss": 0.6882, "step": 169 }, { "epoch": 0.014911626683040217, "grad_norm": 4.337658765605819, "learning_rate": 4.999696263540513e-06, "loss": 0.6418, "step": 170 }, { "epoch": 0.014999342134116925, "grad_norm": 5.834498841218243, "learning_rate": 4.999685319184688e-06, "loss": 0.6367, "step": 171 }, { "epoch": 0.015087057585193632, "grad_norm": 6.027148776110112, "learning_rate": 4.999674181139848e-06, "loss": 0.7505, "step": 172 }, { "epoch": 0.015174773036270338, "grad_norm": 4.712652033599274, "learning_rate": 4.999662849406855e-06, "loss": 0.7515, "step": 173 }, { "epoch": 0.015262488487347046, "grad_norm": 5.325275991673836, "learning_rate": 4.99965132398659e-06, "loss": 0.7871, "step": 174 }, { "epoch": 0.015350203938423753, "grad_norm": 5.006048437293231, "learning_rate": 4.999639604879943e-06, "loss": 0.6038, "step": 175 }, { "epoch": 0.015437919389500461, "grad_norm": 4.692976251794895, "learning_rate": 4.999627692087824e-06, "loss": 0.7106, "step": 176 }, { "epoch": 0.015525634840577167, "grad_norm": 6.484912012474024, "learning_rate": 4.999615585611156e-06, "loss": 0.6456, "step": 177 }, { "epoch": 0.015613350291653874, "grad_norm": 7.072312221146792, "learning_rate": 4.999603285450875e-06, "loss": 0.6986, "step": 178 }, { "epoch": 0.015701065742730582, "grad_norm": 5.072158684292459, "learning_rate": 4.999590791607936e-06, "loss": 0.6386, "step": 179 }, { "epoch": 0.015788781193807288, "grad_norm": 5.674801641765509, "learning_rate": 4.999578104083307e-06, "loss": 0.6512, "step": 180 }, { "epoch": 0.015876496644883997, "grad_norm": 6.011232915930249, "learning_rate": 4.9995652228779715e-06, "loss": 0.6166, "step": 181 }, { "epoch": 0.015964212095960703, "grad_norm": 7.067996556252431, "learning_rate": 4.999552147992926e-06, "loss": 0.8316, "step": 182 }, { "epoch": 0.01605192754703741, "grad_norm": 6.191586224655665, "learning_rate": 4.999538879429183e-06, "loss": 0.7167, "step": 183 }, { "epoch": 0.01613964299811412, "grad_norm": 5.40861794404673, "learning_rate": 4.999525417187774e-06, "loss": 0.6604, "step": 184 }, { "epoch": 0.016227358449190824, "grad_norm": 5.619694849325643, "learning_rate": 4.999511761269739e-06, "loss": 0.7141, "step": 185 }, { "epoch": 0.016315073900267534, "grad_norm": 7.467663008400906, "learning_rate": 4.999497911676138e-06, "loss": 0.6086, "step": 186 }, { "epoch": 0.01640278935134424, "grad_norm": 4.645589903763359, "learning_rate": 4.999483868408043e-06, "loss": 0.6932, "step": 187 }, { "epoch": 0.016490504802420945, "grad_norm": 4.819294533224638, "learning_rate": 4.999469631466544e-06, "loss": 0.6256, "step": 188 }, { "epoch": 0.016578220253497655, "grad_norm": 4.711171445741636, "learning_rate": 4.999455200852741e-06, "loss": 0.7445, "step": 189 }, { "epoch": 0.01666593570457436, "grad_norm": 4.371758877075776, "learning_rate": 4.999440576567755e-06, "loss": 0.6801, "step": 190 }, { "epoch": 0.016753651155651066, "grad_norm": 5.761171404408883, "learning_rate": 4.999425758612718e-06, "loss": 0.6701, "step": 191 }, { "epoch": 0.016841366606727776, "grad_norm": 4.340375314807721, "learning_rate": 4.999410746988778e-06, "loss": 0.5556, "step": 192 }, { "epoch": 0.01692908205780448, "grad_norm": 4.775058922031801, "learning_rate": 4.9993955416970986e-06, "loss": 0.6915, "step": 193 }, { "epoch": 0.01701679750888119, "grad_norm": 4.301940379009061, "learning_rate": 4.999380142738857e-06, "loss": 0.6982, "step": 194 }, { "epoch": 0.017104512959957897, "grad_norm": 4.746670538298819, "learning_rate": 4.9993645501152485e-06, "loss": 0.5392, "step": 195 }, { "epoch": 0.017192228411034603, "grad_norm": 5.312812102176541, "learning_rate": 4.999348763827479e-06, "loss": 0.6254, "step": 196 }, { "epoch": 0.017279943862111312, "grad_norm": 6.073252701324542, "learning_rate": 4.999332783876774e-06, "loss": 0.7221, "step": 197 }, { "epoch": 0.017367659313188018, "grad_norm": 6.783014797465277, "learning_rate": 4.999316610264369e-06, "loss": 0.5914, "step": 198 }, { "epoch": 0.017455374764264724, "grad_norm": 5.105373260000072, "learning_rate": 4.999300242991519e-06, "loss": 0.4895, "step": 199 }, { "epoch": 0.017543090215341433, "grad_norm": 5.3256898167081825, "learning_rate": 4.999283682059493e-06, "loss": 0.714, "step": 200 }, { "epoch": 0.01763080566641814, "grad_norm": 7.815945435660424, "learning_rate": 4.999266927469572e-06, "loss": 0.7691, "step": 201 }, { "epoch": 0.017718521117494848, "grad_norm": 4.350216346007481, "learning_rate": 4.999249979223056e-06, "loss": 0.7205, "step": 202 }, { "epoch": 0.017806236568571554, "grad_norm": 4.167534183562087, "learning_rate": 4.999232837321257e-06, "loss": 0.6716, "step": 203 }, { "epoch": 0.01789395201964826, "grad_norm": 6.564156035042191, "learning_rate": 4.999215501765504e-06, "loss": 0.6139, "step": 204 }, { "epoch": 0.01798166747072497, "grad_norm": 4.58988335300785, "learning_rate": 4.9991979725571395e-06, "loss": 0.6241, "step": 205 }, { "epoch": 0.018069382921801675, "grad_norm": 7.14774553510386, "learning_rate": 4.999180249697524e-06, "loss": 0.7338, "step": 206 }, { "epoch": 0.01815709837287838, "grad_norm": 4.3154768710391656, "learning_rate": 4.999162333188028e-06, "loss": 0.646, "step": 207 }, { "epoch": 0.01824481382395509, "grad_norm": 3.930924147546703, "learning_rate": 4.999144223030041e-06, "loss": 0.7162, "step": 208 }, { "epoch": 0.018332529275031796, "grad_norm": 3.75066761929553, "learning_rate": 4.999125919224966e-06, "loss": 0.6283, "step": 209 }, { "epoch": 0.018420244726108505, "grad_norm": 4.916459254987505, "learning_rate": 4.999107421774222e-06, "loss": 0.6716, "step": 210 }, { "epoch": 0.01850796017718521, "grad_norm": 4.570226928027306, "learning_rate": 4.999088730679241e-06, "loss": 0.6527, "step": 211 }, { "epoch": 0.018595675628261917, "grad_norm": 3.6658012035372605, "learning_rate": 4.999069845941472e-06, "loss": 0.5452, "step": 212 }, { "epoch": 0.018683391079338627, "grad_norm": 4.697816375671605, "learning_rate": 4.999050767562379e-06, "loss": 0.7316, "step": 213 }, { "epoch": 0.018771106530415332, "grad_norm": 5.639876519194002, "learning_rate": 4.99903149554344e-06, "loss": 0.5152, "step": 214 }, { "epoch": 0.018858821981492038, "grad_norm": 5.527702869650481, "learning_rate": 4.999012029886147e-06, "loss": 0.6119, "step": 215 }, { "epoch": 0.018946537432568748, "grad_norm": 6.019639388484205, "learning_rate": 4.998992370592008e-06, "loss": 0.7366, "step": 216 }, { "epoch": 0.019034252883645453, "grad_norm": 4.014799337285965, "learning_rate": 4.998972517662549e-06, "loss": 0.7088, "step": 217 }, { "epoch": 0.019121968334722163, "grad_norm": 7.876499612097003, "learning_rate": 4.998952471099307e-06, "loss": 0.5565, "step": 218 }, { "epoch": 0.01920968378579887, "grad_norm": 7.386792956892447, "learning_rate": 4.998932230903835e-06, "loss": 0.6387, "step": 219 }, { "epoch": 0.019297399236875575, "grad_norm": 5.346097163630257, "learning_rate": 4.998911797077701e-06, "loss": 0.6237, "step": 220 }, { "epoch": 0.019385114687952284, "grad_norm": 6.133310652425816, "learning_rate": 4.998891169622488e-06, "loss": 0.7428, "step": 221 }, { "epoch": 0.01947283013902899, "grad_norm": 4.224801633855712, "learning_rate": 4.998870348539797e-06, "loss": 0.7206, "step": 222 }, { "epoch": 0.019560545590105696, "grad_norm": 5.648869005800134, "learning_rate": 4.998849333831238e-06, "loss": 0.6249, "step": 223 }, { "epoch": 0.019648261041182405, "grad_norm": 4.634920959306503, "learning_rate": 4.998828125498441e-06, "loss": 0.6764, "step": 224 }, { "epoch": 0.01973597649225911, "grad_norm": 4.882651557085375, "learning_rate": 4.998806723543049e-06, "loss": 0.6682, "step": 225 }, { "epoch": 0.01982369194333582, "grad_norm": 4.5073631852916645, "learning_rate": 4.998785127966721e-06, "loss": 0.7658, "step": 226 }, { "epoch": 0.019911407394412526, "grad_norm": 6.444404326993186, "learning_rate": 4.99876333877113e-06, "loss": 0.7161, "step": 227 }, { "epoch": 0.019999122845489232, "grad_norm": 5.926254683053582, "learning_rate": 4.998741355957963e-06, "loss": 0.6083, "step": 228 }, { "epoch": 0.02008683829656594, "grad_norm": 4.715935033600424, "learning_rate": 4.998719179528925e-06, "loss": 0.5764, "step": 229 }, { "epoch": 0.020174553747642647, "grad_norm": 4.06642116262848, "learning_rate": 4.998696809485734e-06, "loss": 0.6436, "step": 230 }, { "epoch": 0.020262269198719353, "grad_norm": 4.060536926809771, "learning_rate": 4.998674245830123e-06, "loss": 0.6455, "step": 231 }, { "epoch": 0.020349984649796062, "grad_norm": 5.769596888340199, "learning_rate": 4.9986514885638405e-06, "loss": 0.6422, "step": 232 }, { "epoch": 0.020437700100872768, "grad_norm": 5.619149975421577, "learning_rate": 4.99862853768865e-06, "loss": 0.5151, "step": 233 }, { "epoch": 0.020525415551949477, "grad_norm": 5.738973149236573, "learning_rate": 4.998605393206329e-06, "loss": 0.5698, "step": 234 }, { "epoch": 0.020613131003026183, "grad_norm": 3.9117936997485443, "learning_rate": 4.998582055118672e-06, "loss": 0.6139, "step": 235 }, { "epoch": 0.02070084645410289, "grad_norm": 5.594946157519774, "learning_rate": 4.998558523427488e-06, "loss": 0.6305, "step": 236 }, { "epoch": 0.0207885619051796, "grad_norm": 3.7796595114227816, "learning_rate": 4.998534798134598e-06, "loss": 0.6064, "step": 237 }, { "epoch": 0.020876277356256304, "grad_norm": 5.530110712124758, "learning_rate": 4.998510879241842e-06, "loss": 0.7404, "step": 238 }, { "epoch": 0.02096399280733301, "grad_norm": 5.795681054870311, "learning_rate": 4.998486766751073e-06, "loss": 0.6637, "step": 239 }, { "epoch": 0.02105170825840972, "grad_norm": 5.250443330736557, "learning_rate": 4.99846246066416e-06, "loss": 0.7229, "step": 240 }, { "epoch": 0.021139423709486425, "grad_norm": 5.307033877732376, "learning_rate": 4.998437960982985e-06, "loss": 0.729, "step": 241 }, { "epoch": 0.021227139160563135, "grad_norm": 4.264326950314863, "learning_rate": 4.998413267709446e-06, "loss": 0.6363, "step": 242 }, { "epoch": 0.02131485461163984, "grad_norm": 4.56674428695937, "learning_rate": 4.99838838084546e-06, "loss": 0.573, "step": 243 }, { "epoch": 0.021402570062716546, "grad_norm": 5.367393577306364, "learning_rate": 4.998363300392951e-06, "loss": 0.6187, "step": 244 }, { "epoch": 0.021490285513793256, "grad_norm": 5.58627031411974, "learning_rate": 4.998338026353865e-06, "loss": 0.635, "step": 245 }, { "epoch": 0.02157800096486996, "grad_norm": 4.1536241104050005, "learning_rate": 4.9983125587301594e-06, "loss": 0.7296, "step": 246 }, { "epoch": 0.021665716415946668, "grad_norm": 5.369955138376355, "learning_rate": 4.998286897523808e-06, "loss": 0.5939, "step": 247 }, { "epoch": 0.021753431867023377, "grad_norm": 4.749169550030242, "learning_rate": 4.998261042736799e-06, "loss": 0.7125, "step": 248 }, { "epoch": 0.021841147318100083, "grad_norm": 3.847851803716185, "learning_rate": 4.998234994371135e-06, "loss": 0.6874, "step": 249 }, { "epoch": 0.021928862769176792, "grad_norm": 6.3610718821634755, "learning_rate": 4.998208752428836e-06, "loss": 0.6839, "step": 250 }, { "epoch": 0.022016578220253498, "grad_norm": 6.90892255007994, "learning_rate": 4.998182316911934e-06, "loss": 0.6706, "step": 251 }, { "epoch": 0.022104293671330204, "grad_norm": 4.842858396629252, "learning_rate": 4.998155687822478e-06, "loss": 0.7887, "step": 252 }, { "epoch": 0.022192009122406913, "grad_norm": 6.80960196083629, "learning_rate": 4.99812886516253e-06, "loss": 0.6891, "step": 253 }, { "epoch": 0.02227972457348362, "grad_norm": 6.897100992823047, "learning_rate": 4.998101848934171e-06, "loss": 0.7213, "step": 254 }, { "epoch": 0.022367440024560325, "grad_norm": 4.383904436150581, "learning_rate": 4.9980746391394916e-06, "loss": 0.5472, "step": 255 }, { "epoch": 0.022455155475637034, "grad_norm": 6.136102422729719, "learning_rate": 4.998047235780603e-06, "loss": 0.7462, "step": 256 }, { "epoch": 0.02254287092671374, "grad_norm": 5.873462354540876, "learning_rate": 4.9980196388596255e-06, "loss": 0.6893, "step": 257 }, { "epoch": 0.02263058637779045, "grad_norm": 5.36389164609212, "learning_rate": 4.9979918483787e-06, "loss": 0.725, "step": 258 }, { "epoch": 0.022718301828867155, "grad_norm": 6.634852411669424, "learning_rate": 4.997963864339978e-06, "loss": 0.7619, "step": 259 }, { "epoch": 0.02280601727994386, "grad_norm": 4.201015694891079, "learning_rate": 4.99793568674563e-06, "loss": 0.653, "step": 260 }, { "epoch": 0.02289373273102057, "grad_norm": 4.951129353141893, "learning_rate": 4.997907315597836e-06, "loss": 0.7543, "step": 261 }, { "epoch": 0.022981448182097276, "grad_norm": 4.331792323630216, "learning_rate": 4.997878750898798e-06, "loss": 0.6553, "step": 262 }, { "epoch": 0.023069163633173982, "grad_norm": 4.764837636647203, "learning_rate": 4.997849992650727e-06, "loss": 0.719, "step": 263 }, { "epoch": 0.02315687908425069, "grad_norm": 7.315146297212186, "learning_rate": 4.997821040855852e-06, "loss": 0.8217, "step": 264 }, { "epoch": 0.023244594535327397, "grad_norm": 4.5164891139288015, "learning_rate": 4.997791895516417e-06, "loss": 0.5553, "step": 265 }, { "epoch": 0.023332309986404107, "grad_norm": 4.651549875308793, "learning_rate": 4.99776255663468e-06, "loss": 0.6981, "step": 266 }, { "epoch": 0.023420025437480813, "grad_norm": 4.941120481014187, "learning_rate": 4.997733024212913e-06, "loss": 0.604, "step": 267 }, { "epoch": 0.02350774088855752, "grad_norm": 6.3616778757465315, "learning_rate": 4.997703298253406e-06, "loss": 0.7253, "step": 268 }, { "epoch": 0.023595456339634228, "grad_norm": 4.723855693485358, "learning_rate": 4.997673378758462e-06, "loss": 0.7335, "step": 269 }, { "epoch": 0.023683171790710934, "grad_norm": 4.336523073382538, "learning_rate": 4.997643265730399e-06, "loss": 0.5665, "step": 270 }, { "epoch": 0.02377088724178764, "grad_norm": 6.547875149524498, "learning_rate": 4.997612959171549e-06, "loss": 0.6542, "step": 271 }, { "epoch": 0.02385860269286435, "grad_norm": 5.285021138793967, "learning_rate": 4.997582459084264e-06, "loss": 0.7824, "step": 272 }, { "epoch": 0.023946318143941055, "grad_norm": 4.447718203152539, "learning_rate": 4.9975517654709025e-06, "loss": 0.6728, "step": 273 }, { "epoch": 0.024034033595017764, "grad_norm": 4.323105158596241, "learning_rate": 4.997520878333847e-06, "loss": 0.6516, "step": 274 }, { "epoch": 0.02412174904609447, "grad_norm": 4.091596093860627, "learning_rate": 4.997489797675489e-06, "loss": 0.5786, "step": 275 }, { "epoch": 0.024209464497171176, "grad_norm": 4.50262054947591, "learning_rate": 4.997458523498236e-06, "loss": 0.6632, "step": 276 }, { "epoch": 0.024297179948247885, "grad_norm": 5.394966563241667, "learning_rate": 4.997427055804513e-06, "loss": 0.7415, "step": 277 }, { "epoch": 0.02438489539932459, "grad_norm": 5.134838704391961, "learning_rate": 4.9973953945967565e-06, "loss": 0.6225, "step": 278 }, { "epoch": 0.024472610850401297, "grad_norm": 4.555937935551801, "learning_rate": 4.9973635398774226e-06, "loss": 0.7451, "step": 279 }, { "epoch": 0.024560326301478006, "grad_norm": 4.014041307501394, "learning_rate": 4.997331491648976e-06, "loss": 0.607, "step": 280 }, { "epoch": 0.024648041752554712, "grad_norm": 5.398424400960683, "learning_rate": 4.9972992499139025e-06, "loss": 0.665, "step": 281 }, { "epoch": 0.02473575720363142, "grad_norm": 6.959554022697295, "learning_rate": 4.9972668146746995e-06, "loss": 0.8175, "step": 282 }, { "epoch": 0.024823472654708127, "grad_norm": 5.048396931572014, "learning_rate": 4.997234185933879e-06, "loss": 0.6961, "step": 283 }, { "epoch": 0.024911188105784833, "grad_norm": 4.737474855724115, "learning_rate": 4.997201363693972e-06, "loss": 0.5337, "step": 284 }, { "epoch": 0.024998903556861542, "grad_norm": 7.374843310231967, "learning_rate": 4.997168347957521e-06, "loss": 0.6791, "step": 285 }, { "epoch": 0.025086619007938248, "grad_norm": 4.306967488515473, "learning_rate": 4.997135138727081e-06, "loss": 0.8791, "step": 286 }, { "epoch": 0.025174334459014954, "grad_norm": 3.7949900410813737, "learning_rate": 4.99710173600523e-06, "loss": 0.7743, "step": 287 }, { "epoch": 0.025262049910091663, "grad_norm": 4.842604758031469, "learning_rate": 4.997068139794554e-06, "loss": 0.6602, "step": 288 }, { "epoch": 0.02534976536116837, "grad_norm": 3.531764677671023, "learning_rate": 4.9970343500976545e-06, "loss": 0.6317, "step": 289 }, { "epoch": 0.02543748081224508, "grad_norm": 5.68234167540357, "learning_rate": 4.997000366917153e-06, "loss": 0.7404, "step": 290 }, { "epoch": 0.025525196263321785, "grad_norm": 4.623883782994243, "learning_rate": 4.9969661902556804e-06, "loss": 0.6093, "step": 291 }, { "epoch": 0.02561291171439849, "grad_norm": 5.9956405593570175, "learning_rate": 4.996931820115885e-06, "loss": 0.6773, "step": 292 }, { "epoch": 0.0257006271654752, "grad_norm": 5.06274620174889, "learning_rate": 4.996897256500433e-06, "loss": 0.7249, "step": 293 }, { "epoch": 0.025788342616551906, "grad_norm": 5.989915075597491, "learning_rate": 4.996862499411998e-06, "loss": 0.7526, "step": 294 }, { "epoch": 0.02587605806762861, "grad_norm": 4.58567195302804, "learning_rate": 4.996827548853276e-06, "loss": 0.6762, "step": 295 }, { "epoch": 0.02596377351870532, "grad_norm": 4.097368677404026, "learning_rate": 4.996792404826974e-06, "loss": 0.6238, "step": 296 }, { "epoch": 0.026051488969782027, "grad_norm": 4.021749832913485, "learning_rate": 4.996757067335816e-06, "loss": 0.7958, "step": 297 }, { "epoch": 0.026139204420858736, "grad_norm": 4.679522912267575, "learning_rate": 4.99672153638254e-06, "loss": 0.6583, "step": 298 }, { "epoch": 0.026226919871935442, "grad_norm": 4.256974035317045, "learning_rate": 4.996685811969898e-06, "loss": 0.6464, "step": 299 }, { "epoch": 0.026314635323012148, "grad_norm": 4.4862335847168096, "learning_rate": 4.996649894100659e-06, "loss": 0.6116, "step": 300 } ], "logging_steps": 1, "max_steps": 11400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7802380615680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }