{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.9815950920245395, "eval_steps": 500, "global_step": 1015, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0049079754601227, "grad_norm": 2.422234535217285, "learning_rate": 1.4000000000000001e-06, "loss": 0.5835, "step": 1 }, { "epoch": 0.0098159509202454, "grad_norm": 0.8900374174118042, "learning_rate": 2.8000000000000003e-06, "loss": 0.2599, "step": 2 }, { "epoch": 0.014723926380368098, "grad_norm": 0.9694510102272034, "learning_rate": 4.2e-06, "loss": 0.1512, "step": 3 }, { "epoch": 0.0196319018404908, "grad_norm": 2.383556604385376, "learning_rate": 5.600000000000001e-06, "loss": 0.5828, "step": 4 }, { "epoch": 0.024539877300613498, "grad_norm": 0.5773197412490845, "learning_rate": 7e-06, "loss": 0.156, "step": 5 }, { "epoch": 0.029447852760736196, "grad_norm": 0.8293938636779785, "learning_rate": 6.993069306930693e-06, "loss": 0.1723, "step": 6 }, { "epoch": 0.0343558282208589, "grad_norm": 0.02590053714811802, "learning_rate": 6.986138613861386e-06, "loss": 0.0011, "step": 7 }, { "epoch": 0.0392638036809816, "grad_norm": 0.627031683921814, "learning_rate": 6.979207920792079e-06, "loss": 0.2592, "step": 8 }, { "epoch": 0.044171779141104296, "grad_norm": 6.028290271759033, "learning_rate": 6.972277227722772e-06, "loss": 0.7911, "step": 9 }, { "epoch": 0.049079754601226995, "grad_norm": 1.6350908279418945, "learning_rate": 6.965346534653465e-06, "loss": 0.2866, "step": 10 }, { "epoch": 0.053987730061349694, "grad_norm": 1.3962496519088745, "learning_rate": 6.958415841584158e-06, "loss": 0.2447, "step": 11 }, { "epoch": 0.05889570552147239, "grad_norm": 0.385162889957428, "learning_rate": 6.951485148514851e-06, "loss": 0.1153, "step": 12 }, { "epoch": 0.0638036809815951, "grad_norm": 0.6873381733894348, "learning_rate": 6.9445544554455444e-06, "loss": 0.6868, "step": 13 }, { "epoch": 0.0687116564417178, "grad_norm": 0.3449646234512329, "learning_rate": 6.9376237623762375e-06, "loss": 0.1523, "step": 14 }, { "epoch": 0.0736196319018405, "grad_norm": 2.022113084793091, "learning_rate": 6.930693069306931e-06, "loss": 0.3527, "step": 15 }, { "epoch": 0.0785276073619632, "grad_norm": 0.661878764629364, "learning_rate": 6.923762376237624e-06, "loss": 0.2449, "step": 16 }, { "epoch": 0.0834355828220859, "grad_norm": 0.7884008884429932, "learning_rate": 6.916831683168317e-06, "loss": 0.1496, "step": 17 }, { "epoch": 0.08834355828220859, "grad_norm": 0.8798872828483582, "learning_rate": 6.90990099009901e-06, "loss": 0.2151, "step": 18 }, { "epoch": 0.09325153374233129, "grad_norm": 1.3640304803848267, "learning_rate": 6.902970297029703e-06, "loss": 0.356, "step": 19 }, { "epoch": 0.09815950920245399, "grad_norm": 1.1427268981933594, "learning_rate": 6.896039603960396e-06, "loss": 0.2645, "step": 20 }, { "epoch": 0.10306748466257669, "grad_norm": 0.9606547951698303, "learning_rate": 6.889108910891089e-06, "loss": 0.3541, "step": 21 }, { "epoch": 0.10797546012269939, "grad_norm": 0.23967993259429932, "learning_rate": 6.882178217821782e-06, "loss": 0.0407, "step": 22 }, { "epoch": 0.11288343558282209, "grad_norm": 5.597607612609863, "learning_rate": 6.875247524752475e-06, "loss": 1.0892, "step": 23 }, { "epoch": 0.11779141104294479, "grad_norm": 0.6763832569122314, "learning_rate": 6.868316831683168e-06, "loss": 0.4406, "step": 24 }, { "epoch": 0.12269938650306748, "grad_norm": 5.346296310424805, "learning_rate": 6.861386138613861e-06, "loss": 1.2512, "step": 25 }, { "epoch": 0.1276073619631902, "grad_norm": 0.9197913408279419, "learning_rate": 6.854455445544554e-06, "loss": 0.1855, "step": 26 }, { "epoch": 0.1325153374233129, "grad_norm": 0.6145322918891907, "learning_rate": 6.847524752475247e-06, "loss": 0.0562, "step": 27 }, { "epoch": 0.1374233128834356, "grad_norm": 1.5995681285858154, "learning_rate": 6.8405940594059405e-06, "loss": 0.405, "step": 28 }, { "epoch": 0.1423312883435583, "grad_norm": 0.6084582805633545, "learning_rate": 6.8336633663366335e-06, "loss": 0.0953, "step": 29 }, { "epoch": 0.147239263803681, "grad_norm": 0.09425808489322662, "learning_rate": 6.826732673267327e-06, "loss": 0.0128, "step": 30 }, { "epoch": 0.1521472392638037, "grad_norm": 2.628293991088867, "learning_rate": 6.81980198019802e-06, "loss": 0.4032, "step": 31 }, { "epoch": 0.1570552147239264, "grad_norm": 0.4179680347442627, "learning_rate": 6.812871287128713e-06, "loss": 0.0317, "step": 32 }, { "epoch": 0.1619631901840491, "grad_norm": 1.106016755104065, "learning_rate": 6.805940594059406e-06, "loss": 0.1495, "step": 33 }, { "epoch": 0.1668711656441718, "grad_norm": 0.3168316185474396, "learning_rate": 6.799009900990099e-06, "loss": 0.1907, "step": 34 }, { "epoch": 0.17177914110429449, "grad_norm": 0.4211124777793884, "learning_rate": 6.792079207920792e-06, "loss": 0.0519, "step": 35 }, { "epoch": 0.17668711656441718, "grad_norm": 1.7166955471038818, "learning_rate": 6.785148514851485e-06, "loss": 0.1693, "step": 36 }, { "epoch": 0.18159509202453988, "grad_norm": 1.6126145124435425, "learning_rate": 6.778217821782178e-06, "loss": 0.5255, "step": 37 }, { "epoch": 0.18650306748466258, "grad_norm": 0.6241395473480225, "learning_rate": 6.771287128712871e-06, "loss": 0.2787, "step": 38 }, { "epoch": 0.19141104294478528, "grad_norm": 0.8319867253303528, "learning_rate": 6.764356435643564e-06, "loss": 0.0874, "step": 39 }, { "epoch": 0.19631901840490798, "grad_norm": 2.6518406867980957, "learning_rate": 6.757425742574257e-06, "loss": 0.1322, "step": 40 }, { "epoch": 0.20122699386503068, "grad_norm": 0.830451488494873, "learning_rate": 6.75049504950495e-06, "loss": 0.0785, "step": 41 }, { "epoch": 0.20613496932515338, "grad_norm": 0.9671738147735596, "learning_rate": 6.7435643564356434e-06, "loss": 0.1281, "step": 42 }, { "epoch": 0.21104294478527608, "grad_norm": 0.2914385497570038, "learning_rate": 6.7366336633663365e-06, "loss": 0.0565, "step": 43 }, { "epoch": 0.21595092024539878, "grad_norm": 1.4892910718917847, "learning_rate": 6.7297029702970296e-06, "loss": 0.3326, "step": 44 }, { "epoch": 0.22085889570552147, "grad_norm": 0.3593141734600067, "learning_rate": 6.722772277227723e-06, "loss": 0.0499, "step": 45 }, { "epoch": 0.22576687116564417, "grad_norm": 0.8170046210289001, "learning_rate": 6.715841584158416e-06, "loss": 0.028, "step": 46 }, { "epoch": 0.23067484662576687, "grad_norm": 1.7005876302719116, "learning_rate": 6.708910891089109e-06, "loss": 0.1811, "step": 47 }, { "epoch": 0.23558282208588957, "grad_norm": 3.5889787673950195, "learning_rate": 6.701980198019802e-06, "loss": 0.5892, "step": 48 }, { "epoch": 0.24049079754601227, "grad_norm": 0.26327264308929443, "learning_rate": 6.695049504950495e-06, "loss": 0.0134, "step": 49 }, { "epoch": 0.24539877300613497, "grad_norm": 3.6891543865203857, "learning_rate": 6.688118811881188e-06, "loss": 0.2651, "step": 50 }, { "epoch": 0.25030674846625767, "grad_norm": 0.229234978556633, "learning_rate": 6.681188118811881e-06, "loss": 0.0307, "step": 51 }, { "epoch": 0.2552147239263804, "grad_norm": 0.4476153552532196, "learning_rate": 6.674257425742574e-06, "loss": 0.0974, "step": 52 }, { "epoch": 0.26012269938650306, "grad_norm": 0.8651221394538879, "learning_rate": 6.667326732673267e-06, "loss": 0.136, "step": 53 }, { "epoch": 0.2650306748466258, "grad_norm": 0.8453481197357178, "learning_rate": 6.66039603960396e-06, "loss": 0.0714, "step": 54 }, { "epoch": 0.26993865030674846, "grad_norm": 0.8162591457366943, "learning_rate": 6.653465346534653e-06, "loss": 0.0607, "step": 55 }, { "epoch": 0.2748466257668712, "grad_norm": 1.0115143060684204, "learning_rate": 6.646534653465346e-06, "loss": 0.0377, "step": 56 }, { "epoch": 0.27975460122699386, "grad_norm": 0.18834809958934784, "learning_rate": 6.6396039603960395e-06, "loss": 0.0062, "step": 57 }, { "epoch": 0.2846625766871166, "grad_norm": 0.06441894173622131, "learning_rate": 6.6326732673267325e-06, "loss": 0.0094, "step": 58 }, { "epoch": 0.28957055214723926, "grad_norm": 1.1535841226577759, "learning_rate": 6.625742574257426e-06, "loss": 0.0234, "step": 59 }, { "epoch": 0.294478527607362, "grad_norm": 0.1546783298254013, "learning_rate": 6.618811881188119e-06, "loss": 0.0192, "step": 60 }, { "epoch": 0.29938650306748466, "grad_norm": 6.198948383331299, "learning_rate": 6.611881188118812e-06, "loss": 0.0637, "step": 61 }, { "epoch": 0.3042944785276074, "grad_norm": 1.3115315437316895, "learning_rate": 6.604950495049505e-06, "loss": 0.074, "step": 62 }, { "epoch": 0.30920245398773005, "grad_norm": 0.5492228269577026, "learning_rate": 6.598019801980198e-06, "loss": 0.0291, "step": 63 }, { "epoch": 0.3141104294478528, "grad_norm": 0.46307000517845154, "learning_rate": 6.591089108910891e-06, "loss": 0.1191, "step": 64 }, { "epoch": 0.31901840490797545, "grad_norm": 0.16621133685112, "learning_rate": 6.584158415841584e-06, "loss": 0.0317, "step": 65 }, { "epoch": 0.3239263803680982, "grad_norm": 0.56168532371521, "learning_rate": 6.577227722772277e-06, "loss": 0.0797, "step": 66 }, { "epoch": 0.32883435582822085, "grad_norm": 0.21208958327770233, "learning_rate": 6.57029702970297e-06, "loss": 0.0039, "step": 67 }, { "epoch": 0.3337423312883436, "grad_norm": 0.5419512391090393, "learning_rate": 6.563366336633663e-06, "loss": 0.0269, "step": 68 }, { "epoch": 0.33865030674846625, "grad_norm": 0.278899222612381, "learning_rate": 6.556435643564357e-06, "loss": 0.0593, "step": 69 }, { "epoch": 0.34355828220858897, "grad_norm": 0.6312870979309082, "learning_rate": 6.54950495049505e-06, "loss": 0.0751, "step": 70 }, { "epoch": 0.34846625766871164, "grad_norm": 0.19221121072769165, "learning_rate": 6.542574257425743e-06, "loss": 0.0037, "step": 71 }, { "epoch": 0.35337423312883437, "grad_norm": 0.29226556420326233, "learning_rate": 6.5356435643564355e-06, "loss": 0.0417, "step": 72 }, { "epoch": 0.35828220858895704, "grad_norm": 0.32889777421951294, "learning_rate": 6.5287128712871286e-06, "loss": 0.0767, "step": 73 }, { "epoch": 0.36319018404907977, "grad_norm": 0.665745735168457, "learning_rate": 6.521782178217822e-06, "loss": 0.1935, "step": 74 }, { "epoch": 0.36809815950920244, "grad_norm": 0.10821861028671265, "learning_rate": 6.514851485148515e-06, "loss": 0.0087, "step": 75 }, { "epoch": 0.37300613496932516, "grad_norm": 1.0068309307098389, "learning_rate": 6.507920792079208e-06, "loss": 0.0232, "step": 76 }, { "epoch": 0.37791411042944784, "grad_norm": 0.03962863236665726, "learning_rate": 6.500990099009901e-06, "loss": 0.0051, "step": 77 }, { "epoch": 0.38282208588957056, "grad_norm": 9.40488052368164, "learning_rate": 6.494059405940594e-06, "loss": 0.2292, "step": 78 }, { "epoch": 0.38773006134969323, "grad_norm": 0.49305009841918945, "learning_rate": 6.487128712871287e-06, "loss": 0.0924, "step": 79 }, { "epoch": 0.39263803680981596, "grad_norm": 0.72562575340271, "learning_rate": 6.48019801980198e-06, "loss": 0.0549, "step": 80 }, { "epoch": 0.39754601226993863, "grad_norm": 0.015473410487174988, "learning_rate": 6.473267326732673e-06, "loss": 0.0017, "step": 81 }, { "epoch": 0.40245398773006136, "grad_norm": 0.13165496289730072, "learning_rate": 6.466336633663366e-06, "loss": 0.0068, "step": 82 }, { "epoch": 0.40736196319018403, "grad_norm": 0.42630520462989807, "learning_rate": 6.459405940594059e-06, "loss": 0.0927, "step": 83 }, { "epoch": 0.41226993865030676, "grad_norm": 0.059238456189632416, "learning_rate": 6.452475247524752e-06, "loss": 0.0036, "step": 84 }, { "epoch": 0.4171779141104294, "grad_norm": 0.8484250903129578, "learning_rate": 6.445544554455445e-06, "loss": 0.2075, "step": 85 }, { "epoch": 0.42208588957055215, "grad_norm": 0.3800269067287445, "learning_rate": 6.4386138613861384e-06, "loss": 0.1287, "step": 86 }, { "epoch": 0.4269938650306748, "grad_norm": 0.002063202438876033, "learning_rate": 6.4316831683168315e-06, "loss": 0.0008, "step": 87 }, { "epoch": 0.43190184049079755, "grad_norm": 0.05038560926914215, "learning_rate": 6.424752475247525e-06, "loss": 0.0017, "step": 88 }, { "epoch": 0.4368098159509202, "grad_norm": 0.0683569684624672, "learning_rate": 6.417821782178218e-06, "loss": 0.0035, "step": 89 }, { "epoch": 0.44171779141104295, "grad_norm": 0.028999239206314087, "learning_rate": 6.410891089108911e-06, "loss": 0.002, "step": 90 }, { "epoch": 0.4466257668711656, "grad_norm": 0.1129206195473671, "learning_rate": 6.403960396039604e-06, "loss": 0.0045, "step": 91 }, { "epoch": 0.45153374233128835, "grad_norm": 0.012081787921488285, "learning_rate": 6.397029702970297e-06, "loss": 0.0016, "step": 92 }, { "epoch": 0.456441717791411, "grad_norm": 0.29454007744789124, "learning_rate": 6.39009900990099e-06, "loss": 0.0047, "step": 93 }, { "epoch": 0.46134969325153374, "grad_norm": 0.342790812253952, "learning_rate": 6.383168316831683e-06, "loss": 0.0182, "step": 94 }, { "epoch": 0.4662576687116564, "grad_norm": 0.3073101341724396, "learning_rate": 6.376237623762376e-06, "loss": 0.1144, "step": 95 }, { "epoch": 0.47116564417177914, "grad_norm": 0.028703156858682632, "learning_rate": 6.369306930693069e-06, "loss": 0.0017, "step": 96 }, { "epoch": 0.47607361963190187, "grad_norm": 0.2344651073217392, "learning_rate": 6.362376237623762e-06, "loss": 0.1945, "step": 97 }, { "epoch": 0.48098159509202454, "grad_norm": 0.4256807565689087, "learning_rate": 6.355445544554455e-06, "loss": 0.0432, "step": 98 }, { "epoch": 0.48588957055214727, "grad_norm": 0.690371572971344, "learning_rate": 6.348514851485148e-06, "loss": 0.1223, "step": 99 }, { "epoch": 0.49079754601226994, "grad_norm": 0.006785046309232712, "learning_rate": 6.341584158415841e-06, "loss": 0.0009, "step": 100 }, { "epoch": 0.49570552147239266, "grad_norm": 0.3789716362953186, "learning_rate": 6.3346534653465345e-06, "loss": 0.0753, "step": 101 }, { "epoch": 0.5006134969325153, "grad_norm": 0.2657737731933594, "learning_rate": 6.3277227722772275e-06, "loss": 0.0068, "step": 102 }, { "epoch": 0.505521472392638, "grad_norm": 0.0025652130134403706, "learning_rate": 6.320792079207921e-06, "loss": 0.0005, "step": 103 }, { "epoch": 0.5104294478527608, "grad_norm": 0.026512114331126213, "learning_rate": 6.313861386138614e-06, "loss": 0.0012, "step": 104 }, { "epoch": 0.5153374233128835, "grad_norm": 0.020907960832118988, "learning_rate": 6.306930693069307e-06, "loss": 0.002, "step": 105 }, { "epoch": 0.5202453987730061, "grad_norm": 0.024951621890068054, "learning_rate": 6.3e-06, "loss": 0.0019, "step": 106 }, { "epoch": 0.5251533742331288, "grad_norm": 0.03137391433119774, "learning_rate": 6.293069306930693e-06, "loss": 0.0022, "step": 107 }, { "epoch": 0.5300613496932516, "grad_norm": 0.01158374547958374, "learning_rate": 6.286138613861386e-06, "loss": 0.002, "step": 108 }, { "epoch": 0.5349693251533743, "grad_norm": 0.20386900007724762, "learning_rate": 6.279207920792079e-06, "loss": 0.0438, "step": 109 }, { "epoch": 0.5398773006134969, "grad_norm": 0.21870765089988708, "learning_rate": 6.272277227722772e-06, "loss": 0.0147, "step": 110 }, { "epoch": 0.5447852760736196, "grad_norm": 0.01832536980509758, "learning_rate": 6.265346534653465e-06, "loss": 0.0013, "step": 111 }, { "epoch": 0.5496932515337424, "grad_norm": 1.1292037963867188, "learning_rate": 6.258415841584158e-06, "loss": 0.0233, "step": 112 }, { "epoch": 0.554601226993865, "grad_norm": 0.23187655210494995, "learning_rate": 6.251485148514851e-06, "loss": 0.0964, "step": 113 }, { "epoch": 0.5595092024539877, "grad_norm": 0.05976148694753647, "learning_rate": 6.244554455445544e-06, "loss": 0.0026, "step": 114 }, { "epoch": 0.5644171779141104, "grad_norm": 0.2549298107624054, "learning_rate": 6.2376237623762374e-06, "loss": 0.0204, "step": 115 }, { "epoch": 0.5693251533742332, "grad_norm": 0.5240129232406616, "learning_rate": 6.2306930693069305e-06, "loss": 0.0091, "step": 116 }, { "epoch": 0.5742331288343558, "grad_norm": 0.008397325873374939, "learning_rate": 6.2237623762376236e-06, "loss": 0.0031, "step": 117 }, { "epoch": 0.5791411042944785, "grad_norm": 0.17320525646209717, "learning_rate": 6.216831683168317e-06, "loss": 0.0696, "step": 118 }, { "epoch": 0.5840490797546012, "grad_norm": 0.8707240223884583, "learning_rate": 6.20990099009901e-06, "loss": 0.0563, "step": 119 }, { "epoch": 0.588957055214724, "grad_norm": 0.03828573226928711, "learning_rate": 6.202970297029703e-06, "loss": 0.0025, "step": 120 }, { "epoch": 0.5938650306748466, "grad_norm": 0.39433979988098145, "learning_rate": 6.196039603960396e-06, "loss": 0.0286, "step": 121 }, { "epoch": 0.5987730061349693, "grad_norm": 0.24125856161117554, "learning_rate": 6.189108910891089e-06, "loss": 0.0474, "step": 122 }, { "epoch": 0.603680981595092, "grad_norm": 0.3200415074825287, "learning_rate": 6.182178217821782e-06, "loss": 0.0138, "step": 123 }, { "epoch": 0.6085889570552148, "grad_norm": 0.016078324988484383, "learning_rate": 6.175247524752475e-06, "loss": 0.0015, "step": 124 }, { "epoch": 0.6134969325153374, "grad_norm": 0.023426106199622154, "learning_rate": 6.168316831683168e-06, "loss": 0.0025, "step": 125 }, { "epoch": 0.6184049079754601, "grad_norm": 0.255380779504776, "learning_rate": 6.161386138613861e-06, "loss": 0.0091, "step": 126 }, { "epoch": 0.6233128834355828, "grad_norm": 0.02590188756585121, "learning_rate": 6.154455445544554e-06, "loss": 0.0025, "step": 127 }, { "epoch": 0.6282208588957056, "grad_norm": 1.4449337720870972, "learning_rate": 6.147524752475247e-06, "loss": 0.1513, "step": 128 }, { "epoch": 0.6331288343558282, "grad_norm": 0.27828818559646606, "learning_rate": 6.14059405940594e-06, "loss": 0.0542, "step": 129 }, { "epoch": 0.6380368098159509, "grad_norm": 0.0807948037981987, "learning_rate": 6.1336633663366335e-06, "loss": 0.0033, "step": 130 }, { "epoch": 0.6429447852760736, "grad_norm": 0.5472558736801147, "learning_rate": 6.1267326732673265e-06, "loss": 0.1161, "step": 131 }, { "epoch": 0.6478527607361964, "grad_norm": 0.18089702725410461, "learning_rate": 6.11980198019802e-06, "loss": 0.0464, "step": 132 }, { "epoch": 0.652760736196319, "grad_norm": 0.025230491533875465, "learning_rate": 6.112871287128713e-06, "loss": 0.0016, "step": 133 }, { "epoch": 0.6576687116564417, "grad_norm": 0.2929389178752899, "learning_rate": 6.105940594059406e-06, "loss": 0.0404, "step": 134 }, { "epoch": 0.6625766871165644, "grad_norm": 0.2341107577085495, "learning_rate": 6.099009900990099e-06, "loss": 0.0401, "step": 135 }, { "epoch": 0.6674846625766871, "grad_norm": 0.11038243025541306, "learning_rate": 6.092079207920792e-06, "loss": 0.0045, "step": 136 }, { "epoch": 0.6723926380368098, "grad_norm": 0.19471372663974762, "learning_rate": 6.085148514851485e-06, "loss": 0.0296, "step": 137 }, { "epoch": 0.6773006134969325, "grad_norm": 0.3493005037307739, "learning_rate": 6.078217821782178e-06, "loss": 0.0435, "step": 138 }, { "epoch": 0.6822085889570552, "grad_norm": 0.2767382860183716, "learning_rate": 6.071287128712871e-06, "loss": 0.0392, "step": 139 }, { "epoch": 0.6871165644171779, "grad_norm": 0.2326585054397583, "learning_rate": 6.064356435643564e-06, "loss": 0.0372, "step": 140 }, { "epoch": 0.6920245398773006, "grad_norm": 0.023760871961712837, "learning_rate": 6.057425742574257e-06, "loss": 0.0017, "step": 141 }, { "epoch": 0.6969325153374233, "grad_norm": 0.02614918164908886, "learning_rate": 6.05049504950495e-06, "loss": 0.0019, "step": 142 }, { "epoch": 0.701840490797546, "grad_norm": 0.21519997715950012, "learning_rate": 6.043564356435643e-06, "loss": 0.0249, "step": 143 }, { "epoch": 0.7067484662576687, "grad_norm": 0.04296768456697464, "learning_rate": 6.0366336633663364e-06, "loss": 0.002, "step": 144 }, { "epoch": 0.7116564417177914, "grad_norm": 0.0280557032674551, "learning_rate": 6.0297029702970295e-06, "loss": 0.0012, "step": 145 }, { "epoch": 0.7165644171779141, "grad_norm": 0.016091475263237953, "learning_rate": 6.0227722772277226e-06, "loss": 0.001, "step": 146 }, { "epoch": 0.7214723926380369, "grad_norm": 0.4104843735694885, "learning_rate": 6.015841584158416e-06, "loss": 0.0495, "step": 147 }, { "epoch": 0.7263803680981595, "grad_norm": 0.28999537229537964, "learning_rate": 6.008910891089109e-06, "loss": 0.0377, "step": 148 }, { "epoch": 0.7312883435582822, "grad_norm": 0.005074529442936182, "learning_rate": 6.001980198019802e-06, "loss": 0.0006, "step": 149 }, { "epoch": 0.7361963190184049, "grad_norm": 0.031069966033101082, "learning_rate": 5.995049504950495e-06, "loss": 0.0013, "step": 150 }, { "epoch": 0.7411042944785277, "grad_norm": 0.0037307555321604013, "learning_rate": 5.988118811881188e-06, "loss": 0.0004, "step": 151 }, { "epoch": 0.7460122699386503, "grad_norm": 0.05543583631515503, "learning_rate": 5.981188118811881e-06, "loss": 0.0056, "step": 152 }, { "epoch": 0.750920245398773, "grad_norm": 0.15376004576683044, "learning_rate": 5.974257425742574e-06, "loss": 0.0072, "step": 153 }, { "epoch": 0.7558282208588957, "grad_norm": 0.17796596884727478, "learning_rate": 5.967326732673267e-06, "loss": 0.0329, "step": 154 }, { "epoch": 0.7607361963190185, "grad_norm": 0.15298157930374146, "learning_rate": 5.96039603960396e-06, "loss": 0.014, "step": 155 }, { "epoch": 0.7656441717791411, "grad_norm": 0.017613040283322334, "learning_rate": 5.953465346534653e-06, "loss": 0.0016, "step": 156 }, { "epoch": 0.7705521472392638, "grad_norm": 0.33242282271385193, "learning_rate": 5.946534653465346e-06, "loss": 0.0333, "step": 157 }, { "epoch": 0.7754601226993865, "grad_norm": 0.010239041410386562, "learning_rate": 5.939603960396039e-06, "loss": 0.0005, "step": 158 }, { "epoch": 0.7803680981595092, "grad_norm": 0.01592794805765152, "learning_rate": 5.9326732673267325e-06, "loss": 0.0025, "step": 159 }, { "epoch": 0.7852760736196319, "grad_norm": 0.3407064974308014, "learning_rate": 5.9257425742574255e-06, "loss": 0.0106, "step": 160 }, { "epoch": 0.7901840490797546, "grad_norm": 0.11607719957828522, "learning_rate": 5.918811881188119e-06, "loss": 0.0197, "step": 161 }, { "epoch": 0.7950920245398773, "grad_norm": 0.2373722642660141, "learning_rate": 5.911881188118812e-06, "loss": 0.009, "step": 162 }, { "epoch": 0.8, "grad_norm": 0.013574715703725815, "learning_rate": 5.904950495049505e-06, "loss": 0.0012, "step": 163 }, { "epoch": 0.8049079754601227, "grad_norm": 0.1712418496608734, "learning_rate": 5.898019801980198e-06, "loss": 0.0033, "step": 164 }, { "epoch": 0.8098159509202454, "grad_norm": 0.06470532715320587, "learning_rate": 5.891089108910891e-06, "loss": 0.0022, "step": 165 }, { "epoch": 0.8147239263803681, "grad_norm": 0.2027478665113449, "learning_rate": 5.884158415841584e-06, "loss": 0.0075, "step": 166 }, { "epoch": 0.8196319018404908, "grad_norm": 0.018224777653813362, "learning_rate": 5.877227722772277e-06, "loss": 0.0022, "step": 167 }, { "epoch": 0.8245398773006135, "grad_norm": 0.2861013114452362, "learning_rate": 5.87029702970297e-06, "loss": 0.0213, "step": 168 }, { "epoch": 0.8294478527607362, "grad_norm": 0.1993497610092163, "learning_rate": 5.863366336633663e-06, "loss": 0.0047, "step": 169 }, { "epoch": 0.8343558282208589, "grad_norm": 0.010481802746653557, "learning_rate": 5.856435643564356e-06, "loss": 0.001, "step": 170 }, { "epoch": 0.8392638036809816, "grad_norm": 0.018094172701239586, "learning_rate": 5.849504950495049e-06, "loss": 0.0018, "step": 171 }, { "epoch": 0.8441717791411043, "grad_norm": 0.06920409202575684, "learning_rate": 5.842574257425742e-06, "loss": 0.0042, "step": 172 }, { "epoch": 0.849079754601227, "grad_norm": 0.18376444280147552, "learning_rate": 5.835643564356435e-06, "loss": 0.0066, "step": 173 }, { "epoch": 0.8539877300613496, "grad_norm": 0.1790069192647934, "learning_rate": 5.8287128712871285e-06, "loss": 0.0147, "step": 174 }, { "epoch": 0.8588957055214724, "grad_norm": 0.018982654437422752, "learning_rate": 5.8217821782178216e-06, "loss": 0.003, "step": 175 }, { "epoch": 0.8638036809815951, "grad_norm": 0.13061672449111938, "learning_rate": 5.814851485148515e-06, "loss": 0.0016, "step": 176 }, { "epoch": 0.8687116564417178, "grad_norm": 0.0860314592719078, "learning_rate": 5.807920792079208e-06, "loss": 0.0047, "step": 177 }, { "epoch": 0.8736196319018404, "grad_norm": 0.05230861157178879, "learning_rate": 5.800990099009901e-06, "loss": 0.0076, "step": 178 }, { "epoch": 0.8785276073619632, "grad_norm": 0.3662849962711334, "learning_rate": 5.794059405940594e-06, "loss": 0.026, "step": 179 }, { "epoch": 0.8834355828220859, "grad_norm": 0.2983264923095703, "learning_rate": 5.787128712871287e-06, "loss": 0.0203, "step": 180 }, { "epoch": 0.8883435582822086, "grad_norm": 0.0335027277469635, "learning_rate": 5.78019801980198e-06, "loss": 0.0022, "step": 181 }, { "epoch": 0.8932515337423312, "grad_norm": 0.4801461100578308, "learning_rate": 5.773267326732673e-06, "loss": 0.0082, "step": 182 }, { "epoch": 0.898159509202454, "grad_norm": 0.014546169899404049, "learning_rate": 5.766336633663366e-06, "loss": 0.0031, "step": 183 }, { "epoch": 0.9030674846625767, "grad_norm": 0.05124737694859505, "learning_rate": 5.759405940594059e-06, "loss": 0.002, "step": 184 }, { "epoch": 0.9079754601226994, "grad_norm": 0.01893136277794838, "learning_rate": 5.752475247524752e-06, "loss": 0.0013, "step": 185 }, { "epoch": 0.912883435582822, "grad_norm": 0.15197233855724335, "learning_rate": 5.745544554455445e-06, "loss": 0.0026, "step": 186 }, { "epoch": 0.9177914110429448, "grad_norm": 0.8003377318382263, "learning_rate": 5.738613861386138e-06, "loss": 0.0146, "step": 187 }, { "epoch": 0.9226993865030675, "grad_norm": 0.023129871115088463, "learning_rate": 5.7316831683168314e-06, "loss": 0.0035, "step": 188 }, { "epoch": 0.9276073619631902, "grad_norm": 0.5904732346534729, "learning_rate": 5.7247524752475245e-06, "loss": 0.0223, "step": 189 }, { "epoch": 0.9325153374233128, "grad_norm": 0.02393723465502262, "learning_rate": 5.717821782178218e-06, "loss": 0.0017, "step": 190 }, { "epoch": 0.9374233128834356, "grad_norm": 0.2537831664085388, "learning_rate": 5.710891089108911e-06, "loss": 0.0113, "step": 191 }, { "epoch": 0.9423312883435583, "grad_norm": 0.005700583104044199, "learning_rate": 5.703960396039604e-06, "loss": 0.0007, "step": 192 }, { "epoch": 0.947239263803681, "grad_norm": 0.020626569166779518, "learning_rate": 5.697029702970297e-06, "loss": 0.002, "step": 193 }, { "epoch": 0.9521472392638037, "grad_norm": 0.009778481908142567, "learning_rate": 5.69009900990099e-06, "loss": 0.0011, "step": 194 }, { "epoch": 0.9570552147239264, "grad_norm": 0.020891468971967697, "learning_rate": 5.683168316831684e-06, "loss": 0.0017, "step": 195 }, { "epoch": 0.9619631901840491, "grad_norm": 0.01422948855906725, "learning_rate": 5.676237623762377e-06, "loss": 0.0017, "step": 196 }, { "epoch": 0.9668711656441717, "grad_norm": 0.3881951868534088, "learning_rate": 5.66930693069307e-06, "loss": 0.0239, "step": 197 }, { "epoch": 0.9717791411042945, "grad_norm": 0.19655589759349823, "learning_rate": 5.662376237623763e-06, "loss": 0.0166, "step": 198 }, { "epoch": 0.9766871165644172, "grad_norm": 0.22294123470783234, "learning_rate": 5.655445544554456e-06, "loss": 0.0106, "step": 199 }, { "epoch": 0.9815950920245399, "grad_norm": 0.13569732010364532, "learning_rate": 5.648514851485149e-06, "loss": 0.0054, "step": 200 }, { "epoch": 0.9865030674846625, "grad_norm": 0.1216830313205719, "learning_rate": 5.641584158415842e-06, "loss": 0.0075, "step": 201 }, { "epoch": 0.9914110429447853, "grad_norm": 0.14887885749340057, "learning_rate": 5.634653465346535e-06, "loss": 0.0059, "step": 202 }, { "epoch": 0.996319018404908, "grad_norm": 0.34408119320869446, "learning_rate": 5.627722772277228e-06, "loss": 0.0052, "step": 203 }, { "epoch": 1.0012269938650307, "grad_norm": 0.022362129762768745, "learning_rate": 5.620792079207921e-06, "loss": 0.0016, "step": 204 }, { "epoch": 1.0061349693251533, "grad_norm": 0.2365986853837967, "learning_rate": 5.6138613861386145e-06, "loss": 0.0205, "step": 205 }, { "epoch": 1.011042944785276, "grad_norm": 0.007400259375572205, "learning_rate": 5.6069306930693075e-06, "loss": 0.0007, "step": 206 }, { "epoch": 1.0159509202453987, "grad_norm": 0.016991781070828438, "learning_rate": 5.600000000000001e-06, "loss": 0.0019, "step": 207 }, { "epoch": 1.0208588957055216, "grad_norm": 0.035710081458091736, "learning_rate": 5.593069306930694e-06, "loss": 0.0016, "step": 208 }, { "epoch": 1.0257668711656442, "grad_norm": 0.16605080664157867, "learning_rate": 5.586138613861387e-06, "loss": 0.0127, "step": 209 }, { "epoch": 1.030674846625767, "grad_norm": 0.029172230511903763, "learning_rate": 5.57920792079208e-06, "loss": 0.0026, "step": 210 }, { "epoch": 1.0355828220858896, "grad_norm": 0.012522108852863312, "learning_rate": 5.572277227722773e-06, "loss": 0.0013, "step": 211 }, { "epoch": 1.0404907975460123, "grad_norm": 0.2723598778247833, "learning_rate": 5.565346534653466e-06, "loss": 0.0244, "step": 212 }, { "epoch": 1.045398773006135, "grad_norm": 0.10059408843517303, "learning_rate": 5.558415841584159e-06, "loss": 0.0052, "step": 213 }, { "epoch": 1.0503067484662576, "grad_norm": 0.040718916803598404, "learning_rate": 5.551485148514852e-06, "loss": 0.0027, "step": 214 }, { "epoch": 1.0552147239263803, "grad_norm": 0.044893424957990646, "learning_rate": 5.544554455445545e-06, "loss": 0.0025, "step": 215 }, { "epoch": 1.0601226993865032, "grad_norm": 0.0643078088760376, "learning_rate": 5.537623762376238e-06, "loss": 0.0026, "step": 216 }, { "epoch": 1.0650306748466258, "grad_norm": 0.23729190230369568, "learning_rate": 5.530693069306931e-06, "loss": 0.0073, "step": 217 }, { "epoch": 1.0699386503067485, "grad_norm": 0.03545796498656273, "learning_rate": 5.5237623762376235e-06, "loss": 0.0043, "step": 218 }, { "epoch": 1.0748466257668712, "grad_norm": 0.12645600736141205, "learning_rate": 5.5168316831683166e-06, "loss": 0.0168, "step": 219 }, { "epoch": 1.0797546012269938, "grad_norm": 0.22768345475196838, "learning_rate": 5.50990099009901e-06, "loss": 0.0066, "step": 220 }, { "epoch": 1.0846625766871165, "grad_norm": 0.008956272155046463, "learning_rate": 5.502970297029703e-06, "loss": 0.002, "step": 221 }, { "epoch": 1.0895705521472392, "grad_norm": 0.14142072200775146, "learning_rate": 5.496039603960396e-06, "loss": 0.005, "step": 222 }, { "epoch": 1.0944785276073619, "grad_norm": 0.3319880962371826, "learning_rate": 5.489108910891089e-06, "loss": 0.0116, "step": 223 }, { "epoch": 1.0993865030674848, "grad_norm": 0.03383160009980202, "learning_rate": 5.482178217821782e-06, "loss": 0.0021, "step": 224 }, { "epoch": 1.1042944785276074, "grad_norm": 0.03335406631231308, "learning_rate": 5.475247524752475e-06, "loss": 0.0022, "step": 225 }, { "epoch": 1.10920245398773, "grad_norm": 0.06353459507226944, "learning_rate": 5.468316831683168e-06, "loss": 0.0062, "step": 226 }, { "epoch": 1.1141104294478528, "grad_norm": 0.2315845489501953, "learning_rate": 5.461386138613861e-06, "loss": 0.0221, "step": 227 }, { "epoch": 1.1190184049079754, "grad_norm": 0.04464046284556389, "learning_rate": 5.454455445544554e-06, "loss": 0.0023, "step": 228 }, { "epoch": 1.123926380368098, "grad_norm": 0.49593213200569153, "learning_rate": 5.447524752475247e-06, "loss": 0.0484, "step": 229 }, { "epoch": 1.1288343558282208, "grad_norm": 0.050679679960012436, "learning_rate": 5.44059405940594e-06, "loss": 0.0021, "step": 230 }, { "epoch": 1.1337423312883437, "grad_norm": 0.08641895651817322, "learning_rate": 5.433663366336633e-06, "loss": 0.0022, "step": 231 }, { "epoch": 1.1386503067484663, "grad_norm": 0.046292319893836975, "learning_rate": 5.4267326732673265e-06, "loss": 0.0034, "step": 232 }, { "epoch": 1.143558282208589, "grad_norm": 0.11266610026359558, "learning_rate": 5.4198019801980195e-06, "loss": 0.0048, "step": 233 }, { "epoch": 1.1484662576687117, "grad_norm": 0.4683886170387268, "learning_rate": 5.412871287128713e-06, "loss": 0.0095, "step": 234 }, { "epoch": 1.1533742331288344, "grad_norm": 0.17976729571819305, "learning_rate": 5.405940594059406e-06, "loss": 0.0221, "step": 235 }, { "epoch": 1.158282208588957, "grad_norm": 0.08795922249555588, "learning_rate": 5.399009900990099e-06, "loss": 0.003, "step": 236 }, { "epoch": 1.1631901840490797, "grad_norm": 0.012228120118379593, "learning_rate": 5.392079207920792e-06, "loss": 0.0007, "step": 237 }, { "epoch": 1.1680981595092024, "grad_norm": 0.04390159249305725, "learning_rate": 5.385148514851485e-06, "loss": 0.0019, "step": 238 }, { "epoch": 1.173006134969325, "grad_norm": 0.03781568259000778, "learning_rate": 5.378217821782178e-06, "loss": 0.002, "step": 239 }, { "epoch": 1.177914110429448, "grad_norm": 0.18868562579154968, "learning_rate": 5.371287128712871e-06, "loss": 0.0078, "step": 240 }, { "epoch": 1.1828220858895706, "grad_norm": 0.12037086486816406, "learning_rate": 5.364356435643564e-06, "loss": 0.0047, "step": 241 }, { "epoch": 1.1877300613496933, "grad_norm": 0.21462522447109222, "learning_rate": 5.357425742574257e-06, "loss": 0.0075, "step": 242 }, { "epoch": 1.192638036809816, "grad_norm": 0.14189103245735168, "learning_rate": 5.35049504950495e-06, "loss": 0.0109, "step": 243 }, { "epoch": 1.1975460122699386, "grad_norm": 0.30977487564086914, "learning_rate": 5.343564356435643e-06, "loss": 0.0142, "step": 244 }, { "epoch": 1.2024539877300613, "grad_norm": 0.03001783974468708, "learning_rate": 5.336633663366336e-06, "loss": 0.0011, "step": 245 }, { "epoch": 1.207361963190184, "grad_norm": 0.35600486397743225, "learning_rate": 5.329702970297029e-06, "loss": 0.0343, "step": 246 }, { "epoch": 1.2122699386503069, "grad_norm": 0.023076960816979408, "learning_rate": 5.3227722772277225e-06, "loss": 0.0021, "step": 247 }, { "epoch": 1.2171779141104295, "grad_norm": 0.23361201584339142, "learning_rate": 5.3158415841584156e-06, "loss": 0.0068, "step": 248 }, { "epoch": 1.2220858895705522, "grad_norm": 0.21508128941059113, "learning_rate": 5.308910891089109e-06, "loss": 0.0247, "step": 249 }, { "epoch": 1.2269938650306749, "grad_norm": 0.1582375317811966, "learning_rate": 5.301980198019802e-06, "loss": 0.0037, "step": 250 }, { "epoch": 1.2319018404907975, "grad_norm": 0.037315189838409424, "learning_rate": 5.295049504950495e-06, "loss": 0.0014, "step": 251 }, { "epoch": 1.2368098159509202, "grad_norm": 0.07501725107431412, "learning_rate": 5.288118811881188e-06, "loss": 0.0037, "step": 252 }, { "epoch": 1.2417177914110429, "grad_norm": 0.0727037638425827, "learning_rate": 5.281188118811881e-06, "loss": 0.0028, "step": 253 }, { "epoch": 1.2466257668711656, "grad_norm": 0.04310867562890053, "learning_rate": 5.274257425742574e-06, "loss": 0.0039, "step": 254 }, { "epoch": 1.2515337423312882, "grad_norm": 0.042814526706933975, "learning_rate": 5.267326732673267e-06, "loss": 0.0025, "step": 255 }, { "epoch": 1.2564417177914111, "grad_norm": 0.11775332689285278, "learning_rate": 5.26039603960396e-06, "loss": 0.0127, "step": 256 }, { "epoch": 1.2613496932515338, "grad_norm": 0.040224045515060425, "learning_rate": 5.253465346534653e-06, "loss": 0.0017, "step": 257 }, { "epoch": 1.2662576687116565, "grad_norm": 0.011041068471968174, "learning_rate": 5.246534653465346e-06, "loss": 0.0011, "step": 258 }, { "epoch": 1.2711656441717791, "grad_norm": 0.019592339172959328, "learning_rate": 5.239603960396039e-06, "loss": 0.0013, "step": 259 }, { "epoch": 1.2760736196319018, "grad_norm": 0.0652371346950531, "learning_rate": 5.232673267326732e-06, "loss": 0.0026, "step": 260 }, { "epoch": 1.2809815950920245, "grad_norm": 0.5446717143058777, "learning_rate": 5.2257425742574254e-06, "loss": 0.0305, "step": 261 }, { "epoch": 1.2858895705521474, "grad_norm": 0.02951255440711975, "learning_rate": 5.2188118811881185e-06, "loss": 0.001, "step": 262 }, { "epoch": 1.29079754601227, "grad_norm": 0.1153414323925972, "learning_rate": 5.211881188118812e-06, "loss": 0.0088, "step": 263 }, { "epoch": 1.2957055214723927, "grad_norm": 0.17192097008228302, "learning_rate": 5.204950495049505e-06, "loss": 0.0043, "step": 264 }, { "epoch": 1.3006134969325154, "grad_norm": 0.04299500212073326, "learning_rate": 5.198019801980198e-06, "loss": 0.0042, "step": 265 }, { "epoch": 1.305521472392638, "grad_norm": 0.02218775264918804, "learning_rate": 5.191089108910891e-06, "loss": 0.001, "step": 266 }, { "epoch": 1.3104294478527607, "grad_norm": 0.7823259830474854, "learning_rate": 5.184158415841584e-06, "loss": 0.0556, "step": 267 }, { "epoch": 1.3153374233128834, "grad_norm": 0.09460531175136566, "learning_rate": 5.177227722772277e-06, "loss": 0.0091, "step": 268 }, { "epoch": 1.320245398773006, "grad_norm": 0.04762015491724014, "learning_rate": 5.17029702970297e-06, "loss": 0.0039, "step": 269 }, { "epoch": 1.3251533742331287, "grad_norm": 0.033739686012268066, "learning_rate": 5.163366336633663e-06, "loss": 0.0017, "step": 270 }, { "epoch": 1.3300613496932514, "grad_norm": 0.06530934572219849, "learning_rate": 5.156435643564356e-06, "loss": 0.0047, "step": 271 }, { "epoch": 1.3349693251533743, "grad_norm": 0.014586723409593105, "learning_rate": 5.149504950495049e-06, "loss": 0.001, "step": 272 }, { "epoch": 1.339877300613497, "grad_norm": 0.05435250699520111, "learning_rate": 5.142574257425742e-06, "loss": 0.0028, "step": 273 }, { "epoch": 1.3447852760736196, "grad_norm": 0.1281612366437912, "learning_rate": 5.135643564356435e-06, "loss": 0.0026, "step": 274 }, { "epoch": 1.3496932515337423, "grad_norm": 0.18499918282032013, "learning_rate": 5.128712871287128e-06, "loss": 0.0057, "step": 275 }, { "epoch": 1.354601226993865, "grad_norm": 0.07541365176439285, "learning_rate": 5.1217821782178215e-06, "loss": 0.003, "step": 276 }, { "epoch": 1.3595092024539877, "grad_norm": 0.061677657067775726, "learning_rate": 5.1148514851485145e-06, "loss": 0.002, "step": 277 }, { "epoch": 1.3644171779141105, "grad_norm": 0.16630059480667114, "learning_rate": 5.107920792079208e-06, "loss": 0.0138, "step": 278 }, { "epoch": 1.3693251533742332, "grad_norm": 0.13260015845298767, "learning_rate": 5.100990099009901e-06, "loss": 0.0015, "step": 279 }, { "epoch": 1.3742331288343559, "grad_norm": 0.04029810056090355, "learning_rate": 5.094059405940594e-06, "loss": 0.0022, "step": 280 }, { "epoch": 1.3791411042944786, "grad_norm": 0.07820600271224976, "learning_rate": 5.087128712871287e-06, "loss": 0.0012, "step": 281 }, { "epoch": 1.3840490797546012, "grad_norm": 0.11421211808919907, "learning_rate": 5.08019801980198e-06, "loss": 0.0036, "step": 282 }, { "epoch": 1.388957055214724, "grad_norm": 0.02426535077393055, "learning_rate": 5.073267326732673e-06, "loss": 0.0015, "step": 283 }, { "epoch": 1.3938650306748466, "grad_norm": 0.00859303679317236, "learning_rate": 5.066336633663366e-06, "loss": 0.001, "step": 284 }, { "epoch": 1.3987730061349692, "grad_norm": 0.03431880474090576, "learning_rate": 5.059405940594059e-06, "loss": 0.0021, "step": 285 }, { "epoch": 1.403680981595092, "grad_norm": 0.013164886273443699, "learning_rate": 5.052475247524752e-06, "loss": 0.0009, "step": 286 }, { "epoch": 1.4085889570552146, "grad_norm": 0.1018059104681015, "learning_rate": 5.045544554455445e-06, "loss": 0.003, "step": 287 }, { "epoch": 1.4134969325153375, "grad_norm": 0.3569484055042267, "learning_rate": 5.038613861386138e-06, "loss": 0.0193, "step": 288 }, { "epoch": 1.4184049079754601, "grad_norm": 0.03560711443424225, "learning_rate": 5.031683168316831e-06, "loss": 0.0041, "step": 289 }, { "epoch": 1.4233128834355828, "grad_norm": 0.049912016838788986, "learning_rate": 5.0247524752475244e-06, "loss": 0.0019, "step": 290 }, { "epoch": 1.4282208588957055, "grad_norm": 0.00541495019569993, "learning_rate": 5.0178217821782175e-06, "loss": 0.0008, "step": 291 }, { "epoch": 1.4331288343558282, "grad_norm": 0.188345804810524, "learning_rate": 5.0108910891089106e-06, "loss": 0.0138, "step": 292 }, { "epoch": 1.438036809815951, "grad_norm": 1.1841977834701538, "learning_rate": 5.003960396039604e-06, "loss": 0.0285, "step": 293 }, { "epoch": 1.4429447852760737, "grad_norm": 0.01752946898341179, "learning_rate": 4.997029702970297e-06, "loss": 0.003, "step": 294 }, { "epoch": 1.4478527607361964, "grad_norm": 0.009354379959404469, "learning_rate": 4.99009900990099e-06, "loss": 0.001, "step": 295 }, { "epoch": 1.452760736196319, "grad_norm": 0.5457541346549988, "learning_rate": 4.983168316831683e-06, "loss": 0.0028, "step": 296 }, { "epoch": 1.4576687116564417, "grad_norm": 0.012560038827359676, "learning_rate": 4.976237623762376e-06, "loss": 0.0014, "step": 297 }, { "epoch": 1.4625766871165644, "grad_norm": 0.08460962772369385, "learning_rate": 4.969306930693069e-06, "loss": 0.0054, "step": 298 }, { "epoch": 1.467484662576687, "grad_norm": 0.02863544225692749, "learning_rate": 4.962376237623762e-06, "loss": 0.0019, "step": 299 }, { "epoch": 1.4723926380368098, "grad_norm": 0.12013754993677139, "learning_rate": 4.955445544554455e-06, "loss": 0.0108, "step": 300 }, { "epoch": 1.4773006134969324, "grad_norm": 0.06849315017461777, "learning_rate": 4.948514851485148e-06, "loss": 0.0023, "step": 301 }, { "epoch": 1.482208588957055, "grad_norm": 0.07875117659568787, "learning_rate": 4.941584158415841e-06, "loss": 0.003, "step": 302 }, { "epoch": 1.487116564417178, "grad_norm": 0.03466253727674484, "learning_rate": 4.934653465346534e-06, "loss": 0.0023, "step": 303 }, { "epoch": 1.4920245398773007, "grad_norm": 0.02622813917696476, "learning_rate": 4.927722772277227e-06, "loss": 0.0019, "step": 304 }, { "epoch": 1.4969325153374233, "grad_norm": 0.08774946630001068, "learning_rate": 4.9207920792079205e-06, "loss": 0.0062, "step": 305 }, { "epoch": 1.501840490797546, "grad_norm": 0.038571231067180634, "learning_rate": 4.9138613861386135e-06, "loss": 0.002, "step": 306 }, { "epoch": 1.5067484662576687, "grad_norm": 0.058768339455127716, "learning_rate": 4.906930693069307e-06, "loss": 0.0024, "step": 307 }, { "epoch": 1.5116564417177916, "grad_norm": 0.01673268899321556, "learning_rate": 4.9e-06, "loss": 0.0016, "step": 308 }, { "epoch": 1.5165644171779142, "grad_norm": 0.1858752965927124, "learning_rate": 4.893069306930693e-06, "loss": 0.0042, "step": 309 }, { "epoch": 1.521472392638037, "grad_norm": 0.017566069960594177, "learning_rate": 4.886138613861386e-06, "loss": 0.0012, "step": 310 }, { "epoch": 1.5263803680981596, "grad_norm": 0.021138716489076614, "learning_rate": 4.879207920792079e-06, "loss": 0.0018, "step": 311 }, { "epoch": 1.5312883435582823, "grad_norm": 0.005446423310786486, "learning_rate": 4.872277227722772e-06, "loss": 0.0012, "step": 312 }, { "epoch": 1.536196319018405, "grad_norm": 0.034122321754693985, "learning_rate": 4.865346534653465e-06, "loss": 0.0029, "step": 313 }, { "epoch": 1.5411042944785276, "grad_norm": 0.015392904169857502, "learning_rate": 4.858415841584158e-06, "loss": 0.001, "step": 314 }, { "epoch": 1.5460122699386503, "grad_norm": 0.09795702993869781, "learning_rate": 4.851485148514851e-06, "loss": 0.006, "step": 315 }, { "epoch": 1.550920245398773, "grad_norm": 0.18449151515960693, "learning_rate": 4.844554455445544e-06, "loss": 0.0543, "step": 316 }, { "epoch": 1.5558282208588956, "grad_norm": 0.3047311007976532, "learning_rate": 4.837623762376237e-06, "loss": 0.0148, "step": 317 }, { "epoch": 1.5607361963190183, "grad_norm": 0.3502406179904938, "learning_rate": 4.83069306930693e-06, "loss": 0.0083, "step": 318 }, { "epoch": 1.565644171779141, "grad_norm": 0.08116799592971802, "learning_rate": 4.8237623762376234e-06, "loss": 0.011, "step": 319 }, { "epoch": 1.5705521472392638, "grad_norm": 6.128328323364258, "learning_rate": 4.8168316831683165e-06, "loss": 0.0234, "step": 320 }, { "epoch": 1.5754601226993865, "grad_norm": 0.31991851329803467, "learning_rate": 4.80990099009901e-06, "loss": 0.0161, "step": 321 }, { "epoch": 1.5803680981595092, "grad_norm": 0.08932381123304367, "learning_rate": 4.8029702970297035e-06, "loss": 0.0024, "step": 322 }, { "epoch": 1.5852760736196319, "grad_norm": 0.03641815483570099, "learning_rate": 4.7960396039603965e-06, "loss": 0.0013, "step": 323 }, { "epoch": 1.5901840490797547, "grad_norm": 0.12057910859584808, "learning_rate": 4.78910891089109e-06, "loss": 0.0045, "step": 324 }, { "epoch": 1.5950920245398774, "grad_norm": 0.022239159792661667, "learning_rate": 4.782178217821783e-06, "loss": 0.002, "step": 325 }, { "epoch": 1.6, "grad_norm": 0.02001642808318138, "learning_rate": 4.775247524752476e-06, "loss": 0.0011, "step": 326 }, { "epoch": 1.6049079754601228, "grad_norm": 0.0681408941745758, "learning_rate": 4.768316831683169e-06, "loss": 0.0034, "step": 327 }, { "epoch": 1.6098159509202454, "grad_norm": 0.07200402021408081, "learning_rate": 4.761386138613862e-06, "loss": 0.0023, "step": 328 }, { "epoch": 1.614723926380368, "grad_norm": 0.11577661335468292, "learning_rate": 4.754455445544555e-06, "loss": 0.0122, "step": 329 }, { "epoch": 1.6196319018404908, "grad_norm": 0.12311868369579315, "learning_rate": 4.747524752475248e-06, "loss": 0.0148, "step": 330 }, { "epoch": 1.6245398773006134, "grad_norm": 0.07884922623634338, "learning_rate": 4.740594059405941e-06, "loss": 0.0026, "step": 331 }, { "epoch": 1.6294478527607361, "grad_norm": 0.013971218839287758, "learning_rate": 4.733663366336634e-06, "loss": 0.0014, "step": 332 }, { "epoch": 1.6343558282208588, "grad_norm": 0.17173822224140167, "learning_rate": 4.726732673267327e-06, "loss": 0.0041, "step": 333 }, { "epoch": 1.6392638036809815, "grad_norm": 0.24089990556240082, "learning_rate": 4.71980198019802e-06, "loss": 0.0114, "step": 334 }, { "epoch": 1.6441717791411041, "grad_norm": 0.026243751868605614, "learning_rate": 4.712871287128713e-06, "loss": 0.0011, "step": 335 }, { "epoch": 1.649079754601227, "grad_norm": 0.08318141102790833, "learning_rate": 4.7059405940594064e-06, "loss": 0.0034, "step": 336 }, { "epoch": 1.6539877300613497, "grad_norm": 0.010400490835309029, "learning_rate": 4.6990099009900995e-06, "loss": 0.0022, "step": 337 }, { "epoch": 1.6588957055214724, "grad_norm": 0.044309504330158234, "learning_rate": 4.692079207920793e-06, "loss": 0.004, "step": 338 }, { "epoch": 1.6638036809815953, "grad_norm": 0.02144954912364483, "learning_rate": 4.685148514851486e-06, "loss": 0.0018, "step": 339 }, { "epoch": 1.668711656441718, "grad_norm": 0.0410357341170311, "learning_rate": 4.678217821782179e-06, "loss": 0.002, "step": 340 }, { "epoch": 1.6736196319018406, "grad_norm": 0.02522197738289833, "learning_rate": 4.671287128712872e-06, "loss": 0.0014, "step": 341 }, { "epoch": 1.6785276073619633, "grad_norm": 0.008343094028532505, "learning_rate": 4.664356435643565e-06, "loss": 0.0016, "step": 342 }, { "epoch": 1.683435582822086, "grad_norm": 0.19290384650230408, "learning_rate": 4.657425742574258e-06, "loss": 0.0063, "step": 343 }, { "epoch": 1.6883435582822086, "grad_norm": 0.038624536246061325, "learning_rate": 4.650495049504951e-06, "loss": 0.0016, "step": 344 }, { "epoch": 1.6932515337423313, "grad_norm": 0.21920740604400635, "learning_rate": 4.643564356435644e-06, "loss": 0.0105, "step": 345 }, { "epoch": 1.698159509202454, "grad_norm": 0.057425156235694885, "learning_rate": 4.636633663366337e-06, "loss": 0.0033, "step": 346 }, { "epoch": 1.7030674846625766, "grad_norm": 0.024981388822197914, "learning_rate": 4.62970297029703e-06, "loss": 0.0025, "step": 347 }, { "epoch": 1.7079754601226993, "grad_norm": 0.020687798038125038, "learning_rate": 4.622772277227723e-06, "loss": 0.0022, "step": 348 }, { "epoch": 1.712883435582822, "grad_norm": 0.028213948011398315, "learning_rate": 4.615841584158416e-06, "loss": 0.0014, "step": 349 }, { "epoch": 1.7177914110429446, "grad_norm": 0.05201176926493645, "learning_rate": 4.608910891089109e-06, "loss": 0.0068, "step": 350 }, { "epoch": 1.7226993865030675, "grad_norm": 0.057419948279857635, "learning_rate": 4.6019801980198025e-06, "loss": 0.0027, "step": 351 }, { "epoch": 1.7276073619631902, "grad_norm": 0.026078782975673676, "learning_rate": 4.5950495049504955e-06, "loss": 0.0017, "step": 352 }, { "epoch": 1.7325153374233129, "grad_norm": 0.09312699735164642, "learning_rate": 4.588118811881189e-06, "loss": 0.0048, "step": 353 }, { "epoch": 1.7374233128834355, "grad_norm": 0.027963656932115555, "learning_rate": 4.581188118811882e-06, "loss": 0.0018, "step": 354 }, { "epoch": 1.7423312883435584, "grad_norm": 1.0171724557876587, "learning_rate": 4.574257425742575e-06, "loss": 0.044, "step": 355 }, { "epoch": 1.747239263803681, "grad_norm": 0.0031565092504024506, "learning_rate": 4.567326732673268e-06, "loss": 0.0003, "step": 356 }, { "epoch": 1.7521472392638038, "grad_norm": 0.023803675547242165, "learning_rate": 4.560396039603961e-06, "loss": 0.003, "step": 357 }, { "epoch": 1.7570552147239265, "grad_norm": 0.11927448213100433, "learning_rate": 4.553465346534654e-06, "loss": 0.0046, "step": 358 }, { "epoch": 1.7619631901840491, "grad_norm": 0.20977821946144104, "learning_rate": 4.546534653465347e-06, "loss": 0.0287, "step": 359 }, { "epoch": 1.7668711656441718, "grad_norm": 0.024297788739204407, "learning_rate": 4.53960396039604e-06, "loss": 0.0016, "step": 360 }, { "epoch": 1.7717791411042945, "grad_norm": 0.0834914818406105, "learning_rate": 4.532673267326733e-06, "loss": 0.0032, "step": 361 }, { "epoch": 1.7766871165644171, "grad_norm": 0.07108546048402786, "learning_rate": 4.525742574257426e-06, "loss": 0.0021, "step": 362 }, { "epoch": 1.7815950920245398, "grad_norm": 0.037000950425863266, "learning_rate": 4.518811881188119e-06, "loss": 0.0019, "step": 363 }, { "epoch": 1.7865030674846625, "grad_norm": 0.050357453525066376, "learning_rate": 4.5118811881188115e-06, "loss": 0.0017, "step": 364 }, { "epoch": 1.7914110429447851, "grad_norm": 0.15029194951057434, "learning_rate": 4.504950495049505e-06, "loss": 0.0054, "step": 365 }, { "epoch": 1.7963190184049078, "grad_norm": 0.10820876061916351, "learning_rate": 4.498019801980198e-06, "loss": 0.0029, "step": 366 }, { "epoch": 1.8012269938650307, "grad_norm": 0.014579812064766884, "learning_rate": 4.491089108910891e-06, "loss": 0.0014, "step": 367 }, { "epoch": 1.8061349693251534, "grad_norm": 0.022982638329267502, "learning_rate": 4.484158415841584e-06, "loss": 0.0012, "step": 368 }, { "epoch": 1.811042944785276, "grad_norm": 0.12965136766433716, "learning_rate": 4.477227722772277e-06, "loss": 0.0059, "step": 369 }, { "epoch": 1.8159509202453987, "grad_norm": 0.2906913161277771, "learning_rate": 4.47029702970297e-06, "loss": 0.0061, "step": 370 }, { "epoch": 1.8208588957055216, "grad_norm": 0.04156769439578056, "learning_rate": 4.463366336633663e-06, "loss": 0.0031, "step": 371 }, { "epoch": 1.8257668711656443, "grad_norm": 0.07649008929729462, "learning_rate": 4.456435643564356e-06, "loss": 0.0027, "step": 372 }, { "epoch": 1.830674846625767, "grad_norm": 0.11019697785377502, "learning_rate": 4.449504950495049e-06, "loss": 0.0023, "step": 373 }, { "epoch": 1.8355828220858896, "grad_norm": 0.03163640573620796, "learning_rate": 4.442574257425742e-06, "loss": 0.0024, "step": 374 }, { "epoch": 1.8404907975460123, "grad_norm": 0.2059604823589325, "learning_rate": 4.435643564356435e-06, "loss": 0.0154, "step": 375 }, { "epoch": 1.845398773006135, "grad_norm": 0.00596796628087759, "learning_rate": 4.428712871287128e-06, "loss": 0.0003, "step": 376 }, { "epoch": 1.8503067484662576, "grad_norm": 0.1259843111038208, "learning_rate": 4.421782178217821e-06, "loss": 0.002, "step": 377 }, { "epoch": 1.8552147239263803, "grad_norm": 0.051318589597940445, "learning_rate": 4.4148514851485145e-06, "loss": 0.0054, "step": 378 }, { "epoch": 1.860122699386503, "grad_norm": 0.022389305755496025, "learning_rate": 4.4079207920792075e-06, "loss": 0.0011, "step": 379 }, { "epoch": 1.8650306748466257, "grad_norm": 0.03204929828643799, "learning_rate": 4.400990099009901e-06, "loss": 0.002, "step": 380 }, { "epoch": 1.8699386503067483, "grad_norm": 0.14024299383163452, "learning_rate": 4.394059405940594e-06, "loss": 0.0044, "step": 381 }, { "epoch": 1.874846625766871, "grad_norm": 0.1390925794839859, "learning_rate": 4.387128712871287e-06, "loss": 0.0125, "step": 382 }, { "epoch": 1.879754601226994, "grad_norm": 0.19391588866710663, "learning_rate": 4.38019801980198e-06, "loss": 0.0053, "step": 383 }, { "epoch": 1.8846625766871166, "grad_norm": 0.059033554047346115, "learning_rate": 4.373267326732673e-06, "loss": 0.0036, "step": 384 }, { "epoch": 1.8895705521472392, "grad_norm": 0.03923160582780838, "learning_rate": 4.366336633663366e-06, "loss": 0.0021, "step": 385 }, { "epoch": 1.8944785276073621, "grad_norm": 0.2464819848537445, "learning_rate": 4.359405940594059e-06, "loss": 0.0157, "step": 386 }, { "epoch": 1.8993865030674848, "grad_norm": 0.02465728111565113, "learning_rate": 4.352475247524752e-06, "loss": 0.002, "step": 387 }, { "epoch": 1.9042944785276075, "grad_norm": 0.017552992329001427, "learning_rate": 4.345544554455445e-06, "loss": 0.0013, "step": 388 }, { "epoch": 1.9092024539877301, "grad_norm": 0.07727736979722977, "learning_rate": 4.338613861386138e-06, "loss": 0.004, "step": 389 }, { "epoch": 1.9141104294478528, "grad_norm": 0.0594131238758564, "learning_rate": 4.331683168316831e-06, "loss": 0.0021, "step": 390 }, { "epoch": 1.9190184049079755, "grad_norm": 0.03132615610957146, "learning_rate": 4.324752475247524e-06, "loss": 0.0057, "step": 391 }, { "epoch": 1.9239263803680982, "grad_norm": 0.02295020781457424, "learning_rate": 4.3178217821782174e-06, "loss": 0.0014, "step": 392 }, { "epoch": 1.9288343558282208, "grad_norm": 0.04230022802948952, "learning_rate": 4.3108910891089105e-06, "loss": 0.0035, "step": 393 }, { "epoch": 1.9337423312883435, "grad_norm": 0.012272198684513569, "learning_rate": 4.3039603960396036e-06, "loss": 0.0027, "step": 394 }, { "epoch": 1.9386503067484662, "grad_norm": 0.017140038311481476, "learning_rate": 4.297029702970297e-06, "loss": 0.0015, "step": 395 }, { "epoch": 1.9435582822085888, "grad_norm": 0.011837287805974483, "learning_rate": 4.29009900990099e-06, "loss": 0.0017, "step": 396 }, { "epoch": 1.9484662576687115, "grad_norm": 0.03815172612667084, "learning_rate": 4.283168316831683e-06, "loss": 0.0036, "step": 397 }, { "epoch": 1.9533742331288344, "grad_norm": 0.029975654557347298, "learning_rate": 4.276237623762376e-06, "loss": 0.0017, "step": 398 }, { "epoch": 1.958282208588957, "grad_norm": 0.09029775857925415, "learning_rate": 4.269306930693069e-06, "loss": 0.0034, "step": 399 }, { "epoch": 1.9631901840490797, "grad_norm": 0.013478988781571388, "learning_rate": 4.262376237623762e-06, "loss": 0.0012, "step": 400 }, { "epoch": 1.9680981595092024, "grad_norm": 0.14020872116088867, "learning_rate": 4.255445544554455e-06, "loss": 0.0036, "step": 401 }, { "epoch": 1.9730061349693253, "grad_norm": 0.08427014946937561, "learning_rate": 4.248514851485148e-06, "loss": 0.0032, "step": 402 }, { "epoch": 1.977914110429448, "grad_norm": 0.02368674799799919, "learning_rate": 4.241584158415841e-06, "loss": 0.0018, "step": 403 }, { "epoch": 1.9828220858895707, "grad_norm": 0.17322583496570587, "learning_rate": 4.234653465346534e-06, "loss": 0.0075, "step": 404 }, { "epoch": 1.9877300613496933, "grad_norm": 0.2020941525697708, "learning_rate": 4.227722772277227e-06, "loss": 0.0056, "step": 405 }, { "epoch": 1.992638036809816, "grad_norm": 0.01503934245556593, "learning_rate": 4.22079207920792e-06, "loss": 0.0014, "step": 406 }, { "epoch": 1.9975460122699387, "grad_norm": 0.08337133377790451, "learning_rate": 4.2138613861386135e-06, "loss": 0.0022, "step": 407 }, { "epoch": 2.0024539877300613, "grad_norm": 0.03418401628732681, "learning_rate": 4.2069306930693065e-06, "loss": 0.0014, "step": 408 }, { "epoch": 2.007361963190184, "grad_norm": 0.15800650417804718, "learning_rate": 4.2e-06, "loss": 0.0152, "step": 409 }, { "epoch": 2.0122699386503067, "grad_norm": 0.07395298033952713, "learning_rate": 4.193069306930693e-06, "loss": 0.0027, "step": 410 }, { "epoch": 2.0171779141104293, "grad_norm": 0.013158326968550682, "learning_rate": 4.186138613861386e-06, "loss": 0.0015, "step": 411 }, { "epoch": 2.022085889570552, "grad_norm": 0.0122208371758461, "learning_rate": 4.179207920792079e-06, "loss": 0.0015, "step": 412 }, { "epoch": 2.0269938650306747, "grad_norm": 0.04391651973128319, "learning_rate": 4.172277227722772e-06, "loss": 0.0028, "step": 413 }, { "epoch": 2.0319018404907974, "grad_norm": 0.007677063811570406, "learning_rate": 4.165346534653465e-06, "loss": 0.0004, "step": 414 }, { "epoch": 2.03680981595092, "grad_norm": 0.01934950426220894, "learning_rate": 4.158415841584158e-06, "loss": 0.0013, "step": 415 }, { "epoch": 2.041717791411043, "grad_norm": 0.004791139159351587, "learning_rate": 4.151485148514851e-06, "loss": 0.0005, "step": 416 }, { "epoch": 2.046625766871166, "grad_norm": 0.03984725847840309, "learning_rate": 4.144554455445544e-06, "loss": 0.0015, "step": 417 }, { "epoch": 2.0515337423312885, "grad_norm": 0.6277703642845154, "learning_rate": 4.137623762376237e-06, "loss": 0.0269, "step": 418 }, { "epoch": 2.056441717791411, "grad_norm": 0.1700657159090042, "learning_rate": 4.13069306930693e-06, "loss": 0.0196, "step": 419 }, { "epoch": 2.061349693251534, "grad_norm": 0.045236505568027496, "learning_rate": 4.123762376237623e-06, "loss": 0.003, "step": 420 }, { "epoch": 2.0662576687116565, "grad_norm": 0.034617915749549866, "learning_rate": 4.116831683168316e-06, "loss": 0.0045, "step": 421 }, { "epoch": 2.071165644171779, "grad_norm": 0.022638553753495216, "learning_rate": 4.1099009900990095e-06, "loss": 0.0014, "step": 422 }, { "epoch": 2.076073619631902, "grad_norm": 0.011074609123170376, "learning_rate": 4.1029702970297026e-06, "loss": 0.0006, "step": 423 }, { "epoch": 2.0809815950920245, "grad_norm": 0.007294974289834499, "learning_rate": 4.096039603960396e-06, "loss": 0.0013, "step": 424 }, { "epoch": 2.085889570552147, "grad_norm": 0.09458526223897934, "learning_rate": 4.089108910891089e-06, "loss": 0.0018, "step": 425 }, { "epoch": 2.09079754601227, "grad_norm": 0.02894427254796028, "learning_rate": 4.082178217821782e-06, "loss": 0.0016, "step": 426 }, { "epoch": 2.0957055214723925, "grad_norm": 0.030396826565265656, "learning_rate": 4.075247524752475e-06, "loss": 0.0018, "step": 427 }, { "epoch": 2.100613496932515, "grad_norm": 0.11630258709192276, "learning_rate": 4.068316831683168e-06, "loss": 0.0081, "step": 428 }, { "epoch": 2.105521472392638, "grad_norm": 0.010553350672125816, "learning_rate": 4.061386138613861e-06, "loss": 0.0008, "step": 429 }, { "epoch": 2.1104294478527605, "grad_norm": 0.04130052775144577, "learning_rate": 4.054455445544554e-06, "loss": 0.0026, "step": 430 }, { "epoch": 2.1153374233128837, "grad_norm": 0.10501858592033386, "learning_rate": 4.047524752475247e-06, "loss": 0.0396, "step": 431 }, { "epoch": 2.1202453987730063, "grad_norm": 0.13223466277122498, "learning_rate": 4.04059405940594e-06, "loss": 0.0111, "step": 432 }, { "epoch": 2.125153374233129, "grad_norm": 0.015712805092334747, "learning_rate": 4.033663366336633e-06, "loss": 0.0012, "step": 433 }, { "epoch": 2.1300613496932517, "grad_norm": 0.023326637223362923, "learning_rate": 4.026732673267326e-06, "loss": 0.0017, "step": 434 }, { "epoch": 2.1349693251533743, "grad_norm": 0.03263499587774277, "learning_rate": 4.019801980198019e-06, "loss": 0.002, "step": 435 }, { "epoch": 2.139877300613497, "grad_norm": 0.018768969923257828, "learning_rate": 4.0128712871287124e-06, "loss": 0.0015, "step": 436 }, { "epoch": 2.1447852760736197, "grad_norm": 0.4300386905670166, "learning_rate": 4.0059405940594055e-06, "loss": 0.064, "step": 437 }, { "epoch": 2.1496932515337424, "grad_norm": 0.01180424727499485, "learning_rate": 3.999009900990099e-06, "loss": 0.0018, "step": 438 }, { "epoch": 2.154601226993865, "grad_norm": 0.007693049497902393, "learning_rate": 3.992079207920792e-06, "loss": 0.0007, "step": 439 }, { "epoch": 2.1595092024539877, "grad_norm": 0.04772064834833145, "learning_rate": 3.985148514851485e-06, "loss": 0.0031, "step": 440 }, { "epoch": 2.1644171779141104, "grad_norm": 0.017260396853089333, "learning_rate": 3.978217821782178e-06, "loss": 0.001, "step": 441 }, { "epoch": 2.169325153374233, "grad_norm": 0.08507797867059708, "learning_rate": 3.971287128712871e-06, "loss": 0.0063, "step": 442 }, { "epoch": 2.1742331288343557, "grad_norm": 0.02232922799885273, "learning_rate": 3.964356435643564e-06, "loss": 0.002, "step": 443 }, { "epoch": 2.1791411042944784, "grad_norm": 0.06088268384337425, "learning_rate": 3.957425742574257e-06, "loss": 0.0058, "step": 444 }, { "epoch": 2.184049079754601, "grad_norm": 0.15273047983646393, "learning_rate": 3.95049504950495e-06, "loss": 0.0035, "step": 445 }, { "epoch": 2.1889570552147237, "grad_norm": 0.014187682420015335, "learning_rate": 3.943564356435643e-06, "loss": 0.0014, "step": 446 }, { "epoch": 2.1938650306748464, "grad_norm": 0.1353190392255783, "learning_rate": 3.936633663366337e-06, "loss": 0.003, "step": 447 }, { "epoch": 2.1987730061349695, "grad_norm": 0.1862998902797699, "learning_rate": 3.92970297029703e-06, "loss": 0.0232, "step": 448 }, { "epoch": 2.203680981595092, "grad_norm": 0.06980832666158676, "learning_rate": 3.922772277227723e-06, "loss": 0.0018, "step": 449 }, { "epoch": 2.208588957055215, "grad_norm": 0.012548700906336308, "learning_rate": 3.915841584158416e-06, "loss": 0.0009, "step": 450 }, { "epoch": 2.2134969325153375, "grad_norm": 0.0944492369890213, "learning_rate": 3.908910891089109e-06, "loss": 0.0071, "step": 451 }, { "epoch": 2.21840490797546, "grad_norm": 0.07349948585033417, "learning_rate": 3.901980198019802e-06, "loss": 0.0073, "step": 452 }, { "epoch": 2.223312883435583, "grad_norm": 0.016509253531694412, "learning_rate": 3.8950495049504955e-06, "loss": 0.0012, "step": 453 }, { "epoch": 2.2282208588957055, "grad_norm": 0.0530993714928627, "learning_rate": 3.8881188118811885e-06, "loss": 0.0025, "step": 454 }, { "epoch": 2.233128834355828, "grad_norm": 0.014129874296486378, "learning_rate": 3.881188118811882e-06, "loss": 0.002, "step": 455 }, { "epoch": 2.238036809815951, "grad_norm": 0.16294129192829132, "learning_rate": 3.874257425742575e-06, "loss": 0.0049, "step": 456 }, { "epoch": 2.2429447852760735, "grad_norm": 0.03273903578519821, "learning_rate": 3.867326732673268e-06, "loss": 0.0019, "step": 457 }, { "epoch": 2.247852760736196, "grad_norm": 0.04015541449189186, "learning_rate": 3.860396039603961e-06, "loss": 0.0021, "step": 458 }, { "epoch": 2.252760736196319, "grad_norm": 0.02188378944993019, "learning_rate": 3.853465346534654e-06, "loss": 0.001, "step": 459 }, { "epoch": 2.2576687116564416, "grad_norm": 0.02690410614013672, "learning_rate": 3.846534653465347e-06, "loss": 0.0013, "step": 460 }, { "epoch": 2.2625766871165642, "grad_norm": 0.08112508058547974, "learning_rate": 3.83960396039604e-06, "loss": 0.0027, "step": 461 }, { "epoch": 2.2674846625766873, "grad_norm": 0.015935292467474937, "learning_rate": 3.832673267326733e-06, "loss": 0.0024, "step": 462 }, { "epoch": 2.27239263803681, "grad_norm": 0.26772162318229675, "learning_rate": 3.825742574257426e-06, "loss": 0.0118, "step": 463 }, { "epoch": 2.2773006134969327, "grad_norm": 0.035102855414152145, "learning_rate": 3.818811881188119e-06, "loss": 0.001, "step": 464 }, { "epoch": 2.2822085889570554, "grad_norm": 0.03706807270646095, "learning_rate": 3.8118811881188123e-06, "loss": 0.0019, "step": 465 }, { "epoch": 2.287116564417178, "grad_norm": 0.012720320373773575, "learning_rate": 3.8049504950495054e-06, "loss": 0.0012, "step": 466 }, { "epoch": 2.2920245398773007, "grad_norm": 0.07940246909856796, "learning_rate": 3.7980198019801984e-06, "loss": 0.0045, "step": 467 }, { "epoch": 2.2969325153374234, "grad_norm": 0.017415238544344902, "learning_rate": 3.7910891089108915e-06, "loss": 0.0012, "step": 468 }, { "epoch": 2.301840490797546, "grad_norm": 0.1183788999915123, "learning_rate": 3.7841584158415846e-06, "loss": 0.0076, "step": 469 }, { "epoch": 2.3067484662576687, "grad_norm": 0.01171356625854969, "learning_rate": 3.7772277227722776e-06, "loss": 0.001, "step": 470 }, { "epoch": 2.3116564417177914, "grad_norm": 0.010136552155017853, "learning_rate": 3.7702970297029703e-06, "loss": 0.0009, "step": 471 }, { "epoch": 2.316564417177914, "grad_norm": 0.028086962178349495, "learning_rate": 3.7633663366336633e-06, "loss": 0.0011, "step": 472 }, { "epoch": 2.3214723926380367, "grad_norm": 0.00958373211324215, "learning_rate": 3.7564356435643564e-06, "loss": 0.0014, "step": 473 }, { "epoch": 2.3263803680981594, "grad_norm": 0.034549593925476074, "learning_rate": 3.7495049504950495e-06, "loss": 0.0016, "step": 474 }, { "epoch": 2.331288343558282, "grad_norm": 0.02567419782280922, "learning_rate": 3.7425742574257425e-06, "loss": 0.0017, "step": 475 }, { "epoch": 2.3361963190184047, "grad_norm": 0.0217987522482872, "learning_rate": 3.7356435643564356e-06, "loss": 0.0012, "step": 476 }, { "epoch": 2.3411042944785274, "grad_norm": 0.00880768895149231, "learning_rate": 3.7287128712871287e-06, "loss": 0.0012, "step": 477 }, { "epoch": 2.34601226993865, "grad_norm": 0.0350213460624218, "learning_rate": 3.7217821782178218e-06, "loss": 0.0038, "step": 478 }, { "epoch": 2.3509202453987728, "grad_norm": 0.009084700606763363, "learning_rate": 3.714851485148515e-06, "loss": 0.0018, "step": 479 }, { "epoch": 2.355828220858896, "grad_norm": 0.056382663547992706, "learning_rate": 3.707920792079208e-06, "loss": 0.0018, "step": 480 }, { "epoch": 2.3607361963190185, "grad_norm": 0.03614302724599838, "learning_rate": 3.700990099009901e-06, "loss": 0.0025, "step": 481 }, { "epoch": 2.365644171779141, "grad_norm": 0.03676525875926018, "learning_rate": 3.694059405940594e-06, "loss": 0.001, "step": 482 }, { "epoch": 2.370552147239264, "grad_norm": 0.1925876885652542, "learning_rate": 3.687128712871287e-06, "loss": 0.0141, "step": 483 }, { "epoch": 2.3754601226993866, "grad_norm": 0.013960366137325764, "learning_rate": 3.68019801980198e-06, "loss": 0.0012, "step": 484 }, { "epoch": 2.3803680981595092, "grad_norm": 0.03343435749411583, "learning_rate": 3.6732673267326732e-06, "loss": 0.0014, "step": 485 }, { "epoch": 2.385276073619632, "grad_norm": 0.022218871861696243, "learning_rate": 3.6663366336633663e-06, "loss": 0.0026, "step": 486 }, { "epoch": 2.3901840490797546, "grad_norm": 0.015421630814671516, "learning_rate": 3.6594059405940594e-06, "loss": 0.001, "step": 487 }, { "epoch": 2.3950920245398772, "grad_norm": 0.011735438369214535, "learning_rate": 3.6524752475247524e-06, "loss": 0.0011, "step": 488 }, { "epoch": 2.4, "grad_norm": 0.06969886273145676, "learning_rate": 3.6455445544554455e-06, "loss": 0.0035, "step": 489 }, { "epoch": 2.4049079754601226, "grad_norm": 0.009630633518099785, "learning_rate": 3.6386138613861386e-06, "loss": 0.0012, "step": 490 }, { "epoch": 2.4098159509202453, "grad_norm": 0.18442873656749725, "learning_rate": 3.6316831683168316e-06, "loss": 0.0034, "step": 491 }, { "epoch": 2.414723926380368, "grad_norm": 0.0310515183955431, "learning_rate": 3.6247524752475247e-06, "loss": 0.002, "step": 492 }, { "epoch": 2.419631901840491, "grad_norm": 0.107456274330616, "learning_rate": 3.6178217821782178e-06, "loss": 0.0036, "step": 493 }, { "epoch": 2.4245398773006137, "grad_norm": 0.018645675852894783, "learning_rate": 3.610891089108911e-06, "loss": 0.0018, "step": 494 }, { "epoch": 2.4294478527607364, "grad_norm": 0.03003678098320961, "learning_rate": 3.603960396039604e-06, "loss": 0.0026, "step": 495 }, { "epoch": 2.434355828220859, "grad_norm": 0.00237216055393219, "learning_rate": 3.597029702970297e-06, "loss": 0.0014, "step": 496 }, { "epoch": 2.4392638036809817, "grad_norm": 0.11389046162366867, "learning_rate": 3.59009900990099e-06, "loss": 0.0036, "step": 497 }, { "epoch": 2.4441717791411044, "grad_norm": 0.16872182488441467, "learning_rate": 3.583168316831683e-06, "loss": 0.0127, "step": 498 }, { "epoch": 2.449079754601227, "grad_norm": 0.011766748502850533, "learning_rate": 3.576237623762376e-06, "loss": 0.0006, "step": 499 }, { "epoch": 2.4539877300613497, "grad_norm": 0.014331258833408356, "learning_rate": 3.5693069306930693e-06, "loss": 0.0014, "step": 500 }, { "epoch": 2.4588957055214724, "grad_norm": 0.009422739036381245, "learning_rate": 3.5623762376237623e-06, "loss": 0.0012, "step": 501 }, { "epoch": 2.463803680981595, "grad_norm": 0.04546496644616127, "learning_rate": 3.5554455445544554e-06, "loss": 0.0067, "step": 502 }, { "epoch": 2.4687116564417177, "grad_norm": 0.026389574632048607, "learning_rate": 3.5485148514851485e-06, "loss": 0.0016, "step": 503 }, { "epoch": 2.4736196319018404, "grad_norm": 0.025073140859603882, "learning_rate": 3.5415841584158415e-06, "loss": 0.0011, "step": 504 }, { "epoch": 2.478527607361963, "grad_norm": 0.032442186027765274, "learning_rate": 3.5346534653465346e-06, "loss": 0.0018, "step": 505 }, { "epoch": 2.4834355828220858, "grad_norm": 0.07158058136701584, "learning_rate": 3.5277227722772277e-06, "loss": 0.003, "step": 506 }, { "epoch": 2.4883435582822084, "grad_norm": 0.005822812672704458, "learning_rate": 3.5207920792079207e-06, "loss": 0.0009, "step": 507 }, { "epoch": 2.493251533742331, "grad_norm": 0.0328555554151535, "learning_rate": 3.513861386138614e-06, "loss": 0.0013, "step": 508 }, { "epoch": 2.4981595092024538, "grad_norm": 0.010760471224784851, "learning_rate": 3.506930693069307e-06, "loss": 0.001, "step": 509 }, { "epoch": 2.5030674846625764, "grad_norm": 0.02453591674566269, "learning_rate": 3.5e-06, "loss": 0.0015, "step": 510 }, { "epoch": 2.507975460122699, "grad_norm": 0.025182075798511505, "learning_rate": 3.493069306930693e-06, "loss": 0.0015, "step": 511 }, { "epoch": 2.5128834355828222, "grad_norm": 0.25042542815208435, "learning_rate": 3.486138613861386e-06, "loss": 0.009, "step": 512 }, { "epoch": 2.517791411042945, "grad_norm": 0.03413018584251404, "learning_rate": 3.479207920792079e-06, "loss": 0.0018, "step": 513 }, { "epoch": 2.5226993865030676, "grad_norm": 0.16198301315307617, "learning_rate": 3.4722772277227722e-06, "loss": 0.0134, "step": 514 }, { "epoch": 2.5276073619631902, "grad_norm": 0.17140725255012512, "learning_rate": 3.4653465346534653e-06, "loss": 0.0338, "step": 515 }, { "epoch": 2.532515337423313, "grad_norm": 0.06337208300828934, "learning_rate": 3.4584158415841584e-06, "loss": 0.0042, "step": 516 }, { "epoch": 2.5374233128834356, "grad_norm": 0.00469438498839736, "learning_rate": 3.4514851485148514e-06, "loss": 0.0005, "step": 517 }, { "epoch": 2.5423312883435583, "grad_norm": 0.053614478558301926, "learning_rate": 3.4445544554455445e-06, "loss": 0.0063, "step": 518 }, { "epoch": 2.547239263803681, "grad_norm": 0.019361039623618126, "learning_rate": 3.4376237623762376e-06, "loss": 0.0011, "step": 519 }, { "epoch": 2.5521472392638036, "grad_norm": 0.017689630389213562, "learning_rate": 3.4306930693069306e-06, "loss": 0.0011, "step": 520 }, { "epoch": 2.5570552147239263, "grad_norm": 0.03599102422595024, "learning_rate": 3.4237623762376237e-06, "loss": 0.0021, "step": 521 }, { "epoch": 2.561963190184049, "grad_norm": 0.6623808145523071, "learning_rate": 3.4168316831683168e-06, "loss": 0.0231, "step": 522 }, { "epoch": 2.5668711656441716, "grad_norm": 0.013597175478935242, "learning_rate": 3.40990099009901e-06, "loss": 0.0018, "step": 523 }, { "epoch": 2.5717791411042947, "grad_norm": 0.07440067827701569, "learning_rate": 3.402970297029703e-06, "loss": 0.0043, "step": 524 }, { "epoch": 2.5766871165644174, "grad_norm": 0.016382288187742233, "learning_rate": 3.396039603960396e-06, "loss": 0.0013, "step": 525 }, { "epoch": 2.58159509202454, "grad_norm": 0.04037494957447052, "learning_rate": 3.389108910891089e-06, "loss": 0.0034, "step": 526 }, { "epoch": 2.5865030674846627, "grad_norm": 0.08392734825611115, "learning_rate": 3.382178217821782e-06, "loss": 0.0022, "step": 527 }, { "epoch": 2.5914110429447854, "grad_norm": 0.041856564581394196, "learning_rate": 3.375247524752475e-06, "loss": 0.0019, "step": 528 }, { "epoch": 2.596319018404908, "grad_norm": 0.0511699840426445, "learning_rate": 3.3683168316831683e-06, "loss": 0.0016, "step": 529 }, { "epoch": 2.6012269938650308, "grad_norm": 0.012307001277804375, "learning_rate": 3.3613861386138613e-06, "loss": 0.0011, "step": 530 }, { "epoch": 2.6061349693251534, "grad_norm": 0.033212386071681976, "learning_rate": 3.3544554455445544e-06, "loss": 0.0018, "step": 531 }, { "epoch": 2.611042944785276, "grad_norm": 0.033674679696559906, "learning_rate": 3.3475247524752475e-06, "loss": 0.0009, "step": 532 }, { "epoch": 2.6159509202453988, "grad_norm": 0.022887440398335457, "learning_rate": 3.3405940594059405e-06, "loss": 0.0009, "step": 533 }, { "epoch": 2.6208588957055214, "grad_norm": 0.08867110311985016, "learning_rate": 3.3336633663366336e-06, "loss": 0.0033, "step": 534 }, { "epoch": 2.625766871165644, "grad_norm": 0.015654679387807846, "learning_rate": 3.3267326732673267e-06, "loss": 0.0011, "step": 535 }, { "epoch": 2.630674846625767, "grad_norm": 0.07898303866386414, "learning_rate": 3.3198019801980197e-06, "loss": 0.0029, "step": 536 }, { "epoch": 2.6355828220858895, "grad_norm": 0.1686147004365921, "learning_rate": 3.312871287128713e-06, "loss": 0.0115, "step": 537 }, { "epoch": 2.640490797546012, "grad_norm": 0.03158680722117424, "learning_rate": 3.305940594059406e-06, "loss": 0.0017, "step": 538 }, { "epoch": 2.645398773006135, "grad_norm": 0.11354830116033554, "learning_rate": 3.299009900990099e-06, "loss": 0.0091, "step": 539 }, { "epoch": 2.6503067484662575, "grad_norm": 0.08498039096593857, "learning_rate": 3.292079207920792e-06, "loss": 0.0038, "step": 540 }, { "epoch": 2.65521472392638, "grad_norm": 0.08356107771396637, "learning_rate": 3.285148514851485e-06, "loss": 0.0019, "step": 541 }, { "epoch": 2.660122699386503, "grad_norm": 0.24042771756649017, "learning_rate": 3.2782178217821786e-06, "loss": 0.0071, "step": 542 }, { "epoch": 2.665030674846626, "grad_norm": 0.034460801631212234, "learning_rate": 3.2712871287128716e-06, "loss": 0.0017, "step": 543 }, { "epoch": 2.6699386503067486, "grad_norm": 0.09137725085020065, "learning_rate": 3.2643564356435643e-06, "loss": 0.0022, "step": 544 }, { "epoch": 2.6748466257668713, "grad_norm": 0.04045404493808746, "learning_rate": 3.2574257425742573e-06, "loss": 0.0016, "step": 545 }, { "epoch": 2.679754601226994, "grad_norm": 0.24448804557323456, "learning_rate": 3.2504950495049504e-06, "loss": 0.0228, "step": 546 }, { "epoch": 2.6846625766871166, "grad_norm": 0.02107204869389534, "learning_rate": 3.2435643564356435e-06, "loss": 0.0022, "step": 547 }, { "epoch": 2.6895705521472393, "grad_norm": 0.02561573125422001, "learning_rate": 3.2366336633663366e-06, "loss": 0.0014, "step": 548 }, { "epoch": 2.694478527607362, "grad_norm": 0.02062409184873104, "learning_rate": 3.2297029702970296e-06, "loss": 0.0009, "step": 549 }, { "epoch": 2.6993865030674846, "grad_norm": 0.053445011377334595, "learning_rate": 3.2227722772277227e-06, "loss": 0.0025, "step": 550 }, { "epoch": 2.7042944785276073, "grad_norm": 0.0313737653195858, "learning_rate": 3.2158415841584158e-06, "loss": 0.0021, "step": 551 }, { "epoch": 2.70920245398773, "grad_norm": 0.04768161103129387, "learning_rate": 3.208910891089109e-06, "loss": 0.006, "step": 552 }, { "epoch": 2.7141104294478526, "grad_norm": 0.1395123153924942, "learning_rate": 3.201980198019802e-06, "loss": 0.0095, "step": 553 }, { "epoch": 2.7190184049079753, "grad_norm": 0.022861650213599205, "learning_rate": 3.195049504950495e-06, "loss": 0.0014, "step": 554 }, { "epoch": 2.7239263803680984, "grad_norm": 0.08337707072496414, "learning_rate": 3.188118811881188e-06, "loss": 0.0053, "step": 555 }, { "epoch": 2.728834355828221, "grad_norm": 0.1470242589712143, "learning_rate": 3.181188118811881e-06, "loss": 0.0089, "step": 556 }, { "epoch": 2.7337423312883438, "grad_norm": 0.1716291755437851, "learning_rate": 3.174257425742574e-06, "loss": 0.014, "step": 557 }, { "epoch": 2.7386503067484664, "grad_norm": 0.015148639678955078, "learning_rate": 3.1673267326732672e-06, "loss": 0.001, "step": 558 }, { "epoch": 2.743558282208589, "grad_norm": 0.019988784566521645, "learning_rate": 3.1603960396039603e-06, "loss": 0.0015, "step": 559 }, { "epoch": 2.7484662576687118, "grad_norm": 0.2989899814128876, "learning_rate": 3.1534653465346534e-06, "loss": 0.0092, "step": 560 }, { "epoch": 2.7533742331288344, "grad_norm": 0.1491839736700058, "learning_rate": 3.1465346534653464e-06, "loss": 0.0046, "step": 561 }, { "epoch": 2.758282208588957, "grad_norm": 0.02539440244436264, "learning_rate": 3.1396039603960395e-06, "loss": 0.0014, "step": 562 }, { "epoch": 2.76319018404908, "grad_norm": 0.31437137722969055, "learning_rate": 3.1326732673267326e-06, "loss": 0.0109, "step": 563 }, { "epoch": 2.7680981595092025, "grad_norm": 0.013294244185090065, "learning_rate": 3.1257425742574257e-06, "loss": 0.0016, "step": 564 }, { "epoch": 2.773006134969325, "grad_norm": 0.025412971153855324, "learning_rate": 3.1188118811881187e-06, "loss": 0.002, "step": 565 }, { "epoch": 2.777914110429448, "grad_norm": 0.05206981673836708, "learning_rate": 3.1118811881188118e-06, "loss": 0.0017, "step": 566 }, { "epoch": 2.7828220858895705, "grad_norm": 0.0227506086230278, "learning_rate": 3.104950495049505e-06, "loss": 0.001, "step": 567 }, { "epoch": 2.787730061349693, "grad_norm": 0.019159123301506042, "learning_rate": 3.098019801980198e-06, "loss": 0.0012, "step": 568 }, { "epoch": 2.792638036809816, "grad_norm": 0.035373397171497345, "learning_rate": 3.091089108910891e-06, "loss": 0.0011, "step": 569 }, { "epoch": 2.7975460122699385, "grad_norm": 0.06474697589874268, "learning_rate": 3.084158415841584e-06, "loss": 0.0017, "step": 570 }, { "epoch": 2.802453987730061, "grad_norm": 0.01330810971558094, "learning_rate": 3.077227722772277e-06, "loss": 0.0006, "step": 571 }, { "epoch": 2.807361963190184, "grad_norm": 0.018572993576526642, "learning_rate": 3.07029702970297e-06, "loss": 0.0012, "step": 572 }, { "epoch": 2.8122699386503065, "grad_norm": 0.04764172062277794, "learning_rate": 3.0633663366336633e-06, "loss": 0.0027, "step": 573 }, { "epoch": 2.817177914110429, "grad_norm": 0.05075710266828537, "learning_rate": 3.0564356435643563e-06, "loss": 0.0018, "step": 574 }, { "epoch": 2.8220858895705523, "grad_norm": 0.0751049742102623, "learning_rate": 3.0495049504950494e-06, "loss": 0.0023, "step": 575 }, { "epoch": 2.826993865030675, "grad_norm": 0.21285609900951385, "learning_rate": 3.0425742574257425e-06, "loss": 0.0089, "step": 576 }, { "epoch": 2.8319018404907976, "grad_norm": 0.048560746014118195, "learning_rate": 3.0356435643564355e-06, "loss": 0.0029, "step": 577 }, { "epoch": 2.8368098159509203, "grad_norm": 0.09979841113090515, "learning_rate": 3.0287128712871286e-06, "loss": 0.0035, "step": 578 }, { "epoch": 2.841717791411043, "grad_norm": 0.012869380414485931, "learning_rate": 3.0217821782178217e-06, "loss": 0.0011, "step": 579 }, { "epoch": 2.8466257668711656, "grad_norm": 0.04724825918674469, "learning_rate": 3.0148514851485147e-06, "loss": 0.0021, "step": 580 }, { "epoch": 2.8515337423312883, "grad_norm": 0.10769210010766983, "learning_rate": 3.007920792079208e-06, "loss": 0.0097, "step": 581 }, { "epoch": 2.856441717791411, "grad_norm": 0.00834321416914463, "learning_rate": 3.000990099009901e-06, "loss": 0.0008, "step": 582 }, { "epoch": 2.8613496932515337, "grad_norm": 0.04464031755924225, "learning_rate": 2.994059405940594e-06, "loss": 0.0026, "step": 583 }, { "epoch": 2.8662576687116563, "grad_norm": 0.01902065984904766, "learning_rate": 2.987128712871287e-06, "loss": 0.0014, "step": 584 }, { "epoch": 2.871165644171779, "grad_norm": 0.038237448781728745, "learning_rate": 2.98019801980198e-06, "loss": 0.0015, "step": 585 }, { "epoch": 2.876073619631902, "grad_norm": 0.013939250260591507, "learning_rate": 2.973267326732673e-06, "loss": 0.001, "step": 586 }, { "epoch": 2.880981595092025, "grad_norm": 0.03710507974028587, "learning_rate": 2.9663366336633662e-06, "loss": 0.0027, "step": 587 }, { "epoch": 2.8858895705521475, "grad_norm": 0.04779844731092453, "learning_rate": 2.9594059405940593e-06, "loss": 0.0012, "step": 588 }, { "epoch": 2.89079754601227, "grad_norm": 0.00711404625326395, "learning_rate": 2.9524752475247524e-06, "loss": 0.001, "step": 589 }, { "epoch": 2.895705521472393, "grad_norm": 0.017887057736516, "learning_rate": 2.9455445544554454e-06, "loss": 0.0015, "step": 590 }, { "epoch": 2.9006134969325155, "grad_norm": 0.01937202550470829, "learning_rate": 2.9386138613861385e-06, "loss": 0.0012, "step": 591 }, { "epoch": 2.905521472392638, "grad_norm": 0.005528996232897043, "learning_rate": 2.9316831683168316e-06, "loss": 0.0011, "step": 592 }, { "epoch": 2.910429447852761, "grad_norm": 0.025270938873291016, "learning_rate": 2.9247524752475246e-06, "loss": 0.0011, "step": 593 }, { "epoch": 2.9153374233128835, "grad_norm": 0.022803837433457375, "learning_rate": 2.9178217821782177e-06, "loss": 0.0012, "step": 594 }, { "epoch": 2.920245398773006, "grad_norm": 0.4284270405769348, "learning_rate": 2.9108910891089108e-06, "loss": 0.0206, "step": 595 }, { "epoch": 2.925153374233129, "grad_norm": 0.03455421328544617, "learning_rate": 2.903960396039604e-06, "loss": 0.0016, "step": 596 }, { "epoch": 2.9300613496932515, "grad_norm": 0.06015002727508545, "learning_rate": 2.897029702970297e-06, "loss": 0.0026, "step": 597 }, { "epoch": 2.934969325153374, "grad_norm": 0.07220069319009781, "learning_rate": 2.89009900990099e-06, "loss": 0.0035, "step": 598 }, { "epoch": 2.939877300613497, "grad_norm": 0.0011812745360657573, "learning_rate": 2.883168316831683e-06, "loss": 0.0003, "step": 599 }, { "epoch": 2.9447852760736195, "grad_norm": 0.030697235837578773, "learning_rate": 2.876237623762376e-06, "loss": 0.0022, "step": 600 }, { "epoch": 2.949693251533742, "grad_norm": 0.028459953144192696, "learning_rate": 2.869306930693069e-06, "loss": 0.0014, "step": 601 }, { "epoch": 2.954601226993865, "grad_norm": 0.03279620781540871, "learning_rate": 2.8623762376237623e-06, "loss": 0.0019, "step": 602 }, { "epoch": 2.9595092024539875, "grad_norm": 0.03629060834646225, "learning_rate": 2.8554455445544553e-06, "loss": 0.0023, "step": 603 }, { "epoch": 2.96441717791411, "grad_norm": 0.008510543964803219, "learning_rate": 2.8485148514851484e-06, "loss": 0.0006, "step": 604 }, { "epoch": 2.969325153374233, "grad_norm": 0.014275304973125458, "learning_rate": 2.841584158415842e-06, "loss": 0.001, "step": 605 }, { "epoch": 2.974233128834356, "grad_norm": 0.06183558329939842, "learning_rate": 2.834653465346535e-06, "loss": 0.0021, "step": 606 }, { "epoch": 2.9791411042944786, "grad_norm": 0.07287970185279846, "learning_rate": 2.827722772277228e-06, "loss": 0.0029, "step": 607 }, { "epoch": 2.9840490797546013, "grad_norm": 0.02359975501894951, "learning_rate": 2.820792079207921e-06, "loss": 0.0015, "step": 608 }, { "epoch": 2.988957055214724, "grad_norm": 0.02087857946753502, "learning_rate": 2.813861386138614e-06, "loss": 0.0027, "step": 609 }, { "epoch": 2.9938650306748467, "grad_norm": 0.036885183304548264, "learning_rate": 2.8069306930693072e-06, "loss": 0.0016, "step": 610 }, { "epoch": 2.9987730061349693, "grad_norm": 0.08759594708681107, "learning_rate": 2.8000000000000003e-06, "loss": 0.013, "step": 611 }, { "epoch": 3.003680981595092, "grad_norm": 0.0203255582600832, "learning_rate": 2.7930693069306934e-06, "loss": 0.0046, "step": 612 }, { "epoch": 3.0085889570552147, "grad_norm": 0.04549151286482811, "learning_rate": 2.7861386138613864e-06, "loss": 0.0032, "step": 613 }, { "epoch": 3.0134969325153373, "grad_norm": 0.02024606615304947, "learning_rate": 2.7792079207920795e-06, "loss": 0.0014, "step": 614 }, { "epoch": 3.01840490797546, "grad_norm": 0.03339584916830063, "learning_rate": 2.7722772277227726e-06, "loss": 0.0018, "step": 615 }, { "epoch": 3.0233128834355827, "grad_norm": 0.02375354804098606, "learning_rate": 2.7653465346534656e-06, "loss": 0.0028, "step": 616 }, { "epoch": 3.0282208588957054, "grad_norm": 0.021728744730353355, "learning_rate": 2.7584158415841583e-06, "loss": 0.0031, "step": 617 }, { "epoch": 3.033128834355828, "grad_norm": 0.032941028475761414, "learning_rate": 2.7514851485148514e-06, "loss": 0.0018, "step": 618 }, { "epoch": 3.038036809815951, "grad_norm": 0.020893137902021408, "learning_rate": 2.7445544554455444e-06, "loss": 0.0013, "step": 619 }, { "epoch": 3.042944785276074, "grad_norm": 0.012155863456428051, "learning_rate": 2.7376237623762375e-06, "loss": 0.0013, "step": 620 }, { "epoch": 3.0478527607361965, "grad_norm": 0.0235965047031641, "learning_rate": 2.7306930693069306e-06, "loss": 0.0019, "step": 621 }, { "epoch": 3.052760736196319, "grad_norm": 0.03598389774560928, "learning_rate": 2.7237623762376236e-06, "loss": 0.002, "step": 622 }, { "epoch": 3.057668711656442, "grad_norm": 0.033302754163742065, "learning_rate": 2.7168316831683167e-06, "loss": 0.0011, "step": 623 }, { "epoch": 3.0625766871165645, "grad_norm": 0.011287711560726166, "learning_rate": 2.7099009900990098e-06, "loss": 0.0015, "step": 624 }, { "epoch": 3.067484662576687, "grad_norm": 0.04231281206011772, "learning_rate": 2.702970297029703e-06, "loss": 0.0032, "step": 625 }, { "epoch": 3.07239263803681, "grad_norm": 0.020790139213204384, "learning_rate": 2.696039603960396e-06, "loss": 0.001, "step": 626 }, { "epoch": 3.0773006134969325, "grad_norm": 0.020422058179974556, "learning_rate": 2.689108910891089e-06, "loss": 0.0017, "step": 627 }, { "epoch": 3.082208588957055, "grad_norm": 0.02271398715674877, "learning_rate": 2.682178217821782e-06, "loss": 0.0013, "step": 628 }, { "epoch": 3.087116564417178, "grad_norm": 0.013147884979844093, "learning_rate": 2.675247524752475e-06, "loss": 0.0009, "step": 629 }, { "epoch": 3.0920245398773005, "grad_norm": 0.023107754066586494, "learning_rate": 2.668316831683168e-06, "loss": 0.0021, "step": 630 }, { "epoch": 3.096932515337423, "grad_norm": 0.027083350345492363, "learning_rate": 2.6613861386138612e-06, "loss": 0.0016, "step": 631 }, { "epoch": 3.101840490797546, "grad_norm": 0.01890239305794239, "learning_rate": 2.6544554455445543e-06, "loss": 0.0021, "step": 632 }, { "epoch": 3.1067484662576685, "grad_norm": 0.0020133615471422672, "learning_rate": 2.6475247524752474e-06, "loss": 0.0004, "step": 633 }, { "epoch": 3.111656441717791, "grad_norm": 0.022084610536694527, "learning_rate": 2.6405940594059405e-06, "loss": 0.0015, "step": 634 }, { "epoch": 3.116564417177914, "grad_norm": 0.008074000477790833, "learning_rate": 2.6336633663366335e-06, "loss": 0.0011, "step": 635 }, { "epoch": 3.121472392638037, "grad_norm": 0.03898672014474869, "learning_rate": 2.6267326732673266e-06, "loss": 0.002, "step": 636 }, { "epoch": 3.1263803680981597, "grad_norm": 0.00946191418915987, "learning_rate": 2.6198019801980197e-06, "loss": 0.001, "step": 637 }, { "epoch": 3.1312883435582823, "grad_norm": 0.1219155415892601, "learning_rate": 2.6128712871287127e-06, "loss": 0.0079, "step": 638 }, { "epoch": 3.136196319018405, "grad_norm": 0.025132469832897186, "learning_rate": 2.605940594059406e-06, "loss": 0.0015, "step": 639 }, { "epoch": 3.1411042944785277, "grad_norm": 0.01082681119441986, "learning_rate": 2.599009900990099e-06, "loss": 0.0008, "step": 640 }, { "epoch": 3.1460122699386504, "grad_norm": 0.14690011739730835, "learning_rate": 2.592079207920792e-06, "loss": 0.0117, "step": 641 }, { "epoch": 3.150920245398773, "grad_norm": 0.010698092170059681, "learning_rate": 2.585148514851485e-06, "loss": 0.0013, "step": 642 }, { "epoch": 3.1558282208588957, "grad_norm": 0.2589493691921234, "learning_rate": 2.578217821782178e-06, "loss": 0.0089, "step": 643 }, { "epoch": 3.1607361963190184, "grad_norm": 0.01592063717544079, "learning_rate": 2.571287128712871e-06, "loss": 0.0011, "step": 644 }, { "epoch": 3.165644171779141, "grad_norm": 0.02394460327923298, "learning_rate": 2.564356435643564e-06, "loss": 0.0014, "step": 645 }, { "epoch": 3.1705521472392637, "grad_norm": 0.02034229226410389, "learning_rate": 2.5574257425742573e-06, "loss": 0.0023, "step": 646 }, { "epoch": 3.1754601226993864, "grad_norm": 0.032998789101839066, "learning_rate": 2.5504950495049503e-06, "loss": 0.0021, "step": 647 }, { "epoch": 3.180368098159509, "grad_norm": 0.013732580468058586, "learning_rate": 2.5435643564356434e-06, "loss": 0.0005, "step": 648 }, { "epoch": 3.1852760736196317, "grad_norm": 0.046656981110572815, "learning_rate": 2.5366336633663365e-06, "loss": 0.0031, "step": 649 }, { "epoch": 3.190184049079755, "grad_norm": 0.02960779331624508, "learning_rate": 2.5297029702970295e-06, "loss": 0.0013, "step": 650 }, { "epoch": 3.1950920245398775, "grad_norm": 0.051251210272312164, "learning_rate": 2.5227722772277226e-06, "loss": 0.0025, "step": 651 }, { "epoch": 3.2, "grad_norm": 0.03321881592273712, "learning_rate": 2.5158415841584157e-06, "loss": 0.0047, "step": 652 }, { "epoch": 3.204907975460123, "grad_norm": 0.005035923328250647, "learning_rate": 2.5089108910891088e-06, "loss": 0.0007, "step": 653 }, { "epoch": 3.2098159509202455, "grad_norm": 0.07169622927904129, "learning_rate": 2.501980198019802e-06, "loss": 0.0026, "step": 654 }, { "epoch": 3.214723926380368, "grad_norm": 0.017996247857809067, "learning_rate": 2.495049504950495e-06, "loss": 0.0012, "step": 655 }, { "epoch": 3.219631901840491, "grad_norm": 0.025565218180418015, "learning_rate": 2.488118811881188e-06, "loss": 0.0013, "step": 656 }, { "epoch": 3.2245398773006135, "grad_norm": 0.014627007767558098, "learning_rate": 2.481188118811881e-06, "loss": 0.0015, "step": 657 }, { "epoch": 3.229447852760736, "grad_norm": 0.011715116910636425, "learning_rate": 2.474257425742574e-06, "loss": 0.001, "step": 658 }, { "epoch": 3.234355828220859, "grad_norm": 0.01887853443622589, "learning_rate": 2.467326732673267e-06, "loss": 0.0013, "step": 659 }, { "epoch": 3.2392638036809815, "grad_norm": 0.028289880603551865, "learning_rate": 2.4603960396039602e-06, "loss": 0.0024, "step": 660 }, { "epoch": 3.244171779141104, "grad_norm": 0.15456917881965637, "learning_rate": 2.4534653465346533e-06, "loss": 0.021, "step": 661 }, { "epoch": 3.249079754601227, "grad_norm": 0.012570716440677643, "learning_rate": 2.4465346534653464e-06, "loss": 0.0012, "step": 662 }, { "epoch": 3.2539877300613496, "grad_norm": 0.006434513721615076, "learning_rate": 2.4396039603960394e-06, "loss": 0.0006, "step": 663 }, { "epoch": 3.2588957055214722, "grad_norm": 0.003889314830303192, "learning_rate": 2.4326732673267325e-06, "loss": 0.0007, "step": 664 }, { "epoch": 3.263803680981595, "grad_norm": 0.16952529549598694, "learning_rate": 2.4257425742574256e-06, "loss": 0.008, "step": 665 }, { "epoch": 3.2687116564417176, "grad_norm": 0.02936733514070511, "learning_rate": 2.4188118811881186e-06, "loss": 0.0016, "step": 666 }, { "epoch": 3.2736196319018402, "grad_norm": 0.10733254253864288, "learning_rate": 2.4118811881188117e-06, "loss": 0.0029, "step": 667 }, { "epoch": 3.2785276073619634, "grad_norm": 0.02705569751560688, "learning_rate": 2.404950495049505e-06, "loss": 0.0017, "step": 668 }, { "epoch": 3.283435582822086, "grad_norm": 0.024813305586576462, "learning_rate": 2.3980198019801983e-06, "loss": 0.0011, "step": 669 }, { "epoch": 3.2883435582822087, "grad_norm": 0.11823663115501404, "learning_rate": 2.3910891089108913e-06, "loss": 0.0034, "step": 670 }, { "epoch": 3.2932515337423314, "grad_norm": 0.15911535918712616, "learning_rate": 2.3841584158415844e-06, "loss": 0.0038, "step": 671 }, { "epoch": 3.298159509202454, "grad_norm": 0.03651705011725426, "learning_rate": 2.3772277227722775e-06, "loss": 0.0013, "step": 672 }, { "epoch": 3.3030674846625767, "grad_norm": 0.20837312936782837, "learning_rate": 2.3702970297029705e-06, "loss": 0.0076, "step": 673 }, { "epoch": 3.3079754601226994, "grad_norm": 0.013823213055729866, "learning_rate": 2.3633663366336636e-06, "loss": 0.0013, "step": 674 }, { "epoch": 3.312883435582822, "grad_norm": 0.0133537407964468, "learning_rate": 2.3564356435643567e-06, "loss": 0.0018, "step": 675 }, { "epoch": 3.3177914110429447, "grad_norm": 0.01537949126213789, "learning_rate": 2.3495049504950498e-06, "loss": 0.0013, "step": 676 }, { "epoch": 3.3226993865030674, "grad_norm": 0.03161802887916565, "learning_rate": 2.342574257425743e-06, "loss": 0.0013, "step": 677 }, { "epoch": 3.32760736196319, "grad_norm": 0.01877240464091301, "learning_rate": 2.335643564356436e-06, "loss": 0.001, "step": 678 }, { "epoch": 3.3325153374233127, "grad_norm": 0.20145629346370697, "learning_rate": 2.328712871287129e-06, "loss": 0.0099, "step": 679 }, { "epoch": 3.3374233128834354, "grad_norm": 0.17651820182800293, "learning_rate": 2.321782178217822e-06, "loss": 0.0298, "step": 680 }, { "epoch": 3.3423312883435585, "grad_norm": 0.031260013580322266, "learning_rate": 2.314851485148515e-06, "loss": 0.0017, "step": 681 }, { "epoch": 3.347239263803681, "grad_norm": 0.049838535487651825, "learning_rate": 2.307920792079208e-06, "loss": 0.0016, "step": 682 }, { "epoch": 3.352147239263804, "grad_norm": 0.01825704425573349, "learning_rate": 2.3009900990099012e-06, "loss": 0.0009, "step": 683 }, { "epoch": 3.3570552147239265, "grad_norm": 0.2820407450199127, "learning_rate": 2.2940594059405943e-06, "loss": 0.0095, "step": 684 }, { "epoch": 3.361963190184049, "grad_norm": 0.013412845320999622, "learning_rate": 2.2871287128712874e-06, "loss": 0.0016, "step": 685 }, { "epoch": 3.366871165644172, "grad_norm": 0.043872177600860596, "learning_rate": 2.2801980198019804e-06, "loss": 0.0023, "step": 686 }, { "epoch": 3.3717791411042946, "grad_norm": 0.024329353123903275, "learning_rate": 2.2732673267326735e-06, "loss": 0.001, "step": 687 }, { "epoch": 3.3766871165644172, "grad_norm": 0.016059909015893936, "learning_rate": 2.2663366336633666e-06, "loss": 0.0012, "step": 688 }, { "epoch": 3.38159509202454, "grad_norm": 0.014483323320746422, "learning_rate": 2.2594059405940596e-06, "loss": 0.0011, "step": 689 }, { "epoch": 3.3865030674846626, "grad_norm": 0.05332216992974281, "learning_rate": 2.2524752475247523e-06, "loss": 0.0023, "step": 690 }, { "epoch": 3.3914110429447852, "grad_norm": 0.005560703109949827, "learning_rate": 2.2455445544554454e-06, "loss": 0.0006, "step": 691 }, { "epoch": 3.396319018404908, "grad_norm": 0.029570411890745163, "learning_rate": 2.2386138613861384e-06, "loss": 0.0026, "step": 692 }, { "epoch": 3.4012269938650306, "grad_norm": 0.02160765416920185, "learning_rate": 2.2316831683168315e-06, "loss": 0.0014, "step": 693 }, { "epoch": 3.4061349693251532, "grad_norm": 0.012106803245842457, "learning_rate": 2.2247524752475246e-06, "loss": 0.0008, "step": 694 }, { "epoch": 3.411042944785276, "grad_norm": 0.027164770290255547, "learning_rate": 2.2178217821782176e-06, "loss": 0.001, "step": 695 }, { "epoch": 3.4159509202453986, "grad_norm": 0.03465467691421509, "learning_rate": 2.2108910891089107e-06, "loss": 0.0008, "step": 696 }, { "epoch": 3.4208588957055213, "grad_norm": 0.01086588017642498, "learning_rate": 2.2039603960396038e-06, "loss": 0.0016, "step": 697 }, { "epoch": 3.425766871165644, "grad_norm": 0.2833847105503082, "learning_rate": 2.197029702970297e-06, "loss": 0.0092, "step": 698 }, { "epoch": 3.430674846625767, "grad_norm": 0.01620599813759327, "learning_rate": 2.19009900990099e-06, "loss": 0.0005, "step": 699 }, { "epoch": 3.4355828220858897, "grad_norm": 0.0964425802230835, "learning_rate": 2.183168316831683e-06, "loss": 0.0059, "step": 700 }, { "epoch": 3.4404907975460124, "grad_norm": 0.051153287291526794, "learning_rate": 2.176237623762376e-06, "loss": 0.003, "step": 701 }, { "epoch": 3.445398773006135, "grad_norm": 0.12504975497722626, "learning_rate": 2.169306930693069e-06, "loss": 0.0023, "step": 702 }, { "epoch": 3.4503067484662577, "grad_norm": 0.18094071745872498, "learning_rate": 2.162376237623762e-06, "loss": 0.0175, "step": 703 }, { "epoch": 3.4552147239263804, "grad_norm": 0.011514030396938324, "learning_rate": 2.1554455445544553e-06, "loss": 0.001, "step": 704 }, { "epoch": 3.460122699386503, "grad_norm": 0.015152939595282078, "learning_rate": 2.1485148514851483e-06, "loss": 0.0011, "step": 705 }, { "epoch": 3.4650306748466257, "grad_norm": 0.05039620399475098, "learning_rate": 2.1415841584158414e-06, "loss": 0.0049, "step": 706 }, { "epoch": 3.4699386503067484, "grad_norm": 0.044066257774829865, "learning_rate": 2.1346534653465345e-06, "loss": 0.0011, "step": 707 }, { "epoch": 3.474846625766871, "grad_norm": 0.06301417946815491, "learning_rate": 2.1277227722772275e-06, "loss": 0.002, "step": 708 }, { "epoch": 3.4797546012269938, "grad_norm": 0.05275435373187065, "learning_rate": 2.1207920792079206e-06, "loss": 0.0026, "step": 709 }, { "epoch": 3.4846625766871164, "grad_norm": 0.05170956999063492, "learning_rate": 2.1138613861386137e-06, "loss": 0.0033, "step": 710 }, { "epoch": 3.489570552147239, "grad_norm": 0.12438485026359558, "learning_rate": 2.1069306930693067e-06, "loss": 0.0028, "step": 711 }, { "epoch": 3.4944785276073618, "grad_norm": 0.07120586186647415, "learning_rate": 2.1e-06, "loss": 0.0021, "step": 712 }, { "epoch": 3.499386503067485, "grad_norm": 0.027411244809627533, "learning_rate": 2.093069306930693e-06, "loss": 0.0011, "step": 713 }, { "epoch": 3.5042944785276076, "grad_norm": 0.005563246086239815, "learning_rate": 2.086138613861386e-06, "loss": 0.0007, "step": 714 }, { "epoch": 3.5092024539877302, "grad_norm": 0.04439758136868477, "learning_rate": 2.079207920792079e-06, "loss": 0.0017, "step": 715 }, { "epoch": 3.514110429447853, "grad_norm": 0.04545675963163376, "learning_rate": 2.072277227722772e-06, "loss": 0.0045, "step": 716 }, { "epoch": 3.5190184049079756, "grad_norm": 0.019012991338968277, "learning_rate": 2.065346534653465e-06, "loss": 0.001, "step": 717 }, { "epoch": 3.5239263803680982, "grad_norm": 0.5096023678779602, "learning_rate": 2.058415841584158e-06, "loss": 0.0205, "step": 718 }, { "epoch": 3.528834355828221, "grad_norm": 0.03077244944870472, "learning_rate": 2.0514851485148513e-06, "loss": 0.0018, "step": 719 }, { "epoch": 3.5337423312883436, "grad_norm": 0.052647169679403305, "learning_rate": 2.0445544554455443e-06, "loss": 0.0021, "step": 720 }, { "epoch": 3.5386503067484663, "grad_norm": 0.014248156920075417, "learning_rate": 2.0376237623762374e-06, "loss": 0.0007, "step": 721 }, { "epoch": 3.543558282208589, "grad_norm": 0.11832743138074875, "learning_rate": 2.0306930693069305e-06, "loss": 0.0088, "step": 722 }, { "epoch": 3.5484662576687116, "grad_norm": 0.025962911546230316, "learning_rate": 2.0237623762376236e-06, "loss": 0.0009, "step": 723 }, { "epoch": 3.5533742331288343, "grad_norm": 0.06493301689624786, "learning_rate": 2.0168316831683166e-06, "loss": 0.0024, "step": 724 }, { "epoch": 3.558282208588957, "grad_norm": 0.023671971634030342, "learning_rate": 2.0099009900990097e-06, "loss": 0.0019, "step": 725 }, { "epoch": 3.5631901840490796, "grad_norm": 0.02273421734571457, "learning_rate": 2.0029702970297028e-06, "loss": 0.0008, "step": 726 }, { "epoch": 3.5680981595092023, "grad_norm": 0.18828389048576355, "learning_rate": 1.996039603960396e-06, "loss": 0.0182, "step": 727 }, { "epoch": 3.573006134969325, "grad_norm": 0.01973796635866165, "learning_rate": 1.989108910891089e-06, "loss": 0.0023, "step": 728 }, { "epoch": 3.5779141104294476, "grad_norm": 0.03295096009969711, "learning_rate": 1.982178217821782e-06, "loss": 0.0017, "step": 729 }, { "epoch": 3.5828220858895703, "grad_norm": 0.012010748498141766, "learning_rate": 1.975247524752475e-06, "loss": 0.0009, "step": 730 }, { "epoch": 3.5877300613496934, "grad_norm": 0.008494194597005844, "learning_rate": 1.9683168316831685e-06, "loss": 0.0009, "step": 731 }, { "epoch": 3.592638036809816, "grad_norm": 0.016253001987934113, "learning_rate": 1.9613861386138616e-06, "loss": 0.0015, "step": 732 }, { "epoch": 3.5975460122699388, "grad_norm": 0.007456593681126833, "learning_rate": 1.9544554455445547e-06, "loss": 0.0007, "step": 733 }, { "epoch": 3.6024539877300614, "grad_norm": 0.00861444789916277, "learning_rate": 1.9475247524752477e-06, "loss": 0.0007, "step": 734 }, { "epoch": 3.607361963190184, "grad_norm": 0.0065794652327895164, "learning_rate": 1.940594059405941e-06, "loss": 0.001, "step": 735 }, { "epoch": 3.6122699386503068, "grad_norm": 0.015389169566333294, "learning_rate": 1.933663366336634e-06, "loss": 0.0015, "step": 736 }, { "epoch": 3.6171779141104294, "grad_norm": 0.025337016209959984, "learning_rate": 1.926732673267327e-06, "loss": 0.0018, "step": 737 }, { "epoch": 3.622085889570552, "grad_norm": 0.00653579318895936, "learning_rate": 1.91980198019802e-06, "loss": 0.0008, "step": 738 }, { "epoch": 3.626993865030675, "grad_norm": 0.10997878760099411, "learning_rate": 1.912871287128713e-06, "loss": 0.0068, "step": 739 }, { "epoch": 3.6319018404907975, "grad_norm": 0.05580228194594383, "learning_rate": 1.9059405940594061e-06, "loss": 0.0056, "step": 740 }, { "epoch": 3.63680981595092, "grad_norm": 0.06799723207950592, "learning_rate": 1.8990099009900992e-06, "loss": 0.0019, "step": 741 }, { "epoch": 3.641717791411043, "grad_norm": 0.20822834968566895, "learning_rate": 1.8920792079207923e-06, "loss": 0.0039, "step": 742 }, { "epoch": 3.646625766871166, "grad_norm": 0.03920517861843109, "learning_rate": 1.8851485148514851e-06, "loss": 0.0017, "step": 743 }, { "epoch": 3.6515337423312886, "grad_norm": 0.06821847707033157, "learning_rate": 1.8782178217821782e-06, "loss": 0.0027, "step": 744 }, { "epoch": 3.6564417177914113, "grad_norm": 0.09687570482492447, "learning_rate": 1.8712871287128713e-06, "loss": 0.002, "step": 745 }, { "epoch": 3.661349693251534, "grad_norm": 0.05403744429349899, "learning_rate": 1.8643564356435643e-06, "loss": 0.0019, "step": 746 }, { "epoch": 3.6662576687116566, "grad_norm": 0.019597845152020454, "learning_rate": 1.8574257425742574e-06, "loss": 0.0009, "step": 747 }, { "epoch": 3.6711656441717793, "grad_norm": 0.04923088103532791, "learning_rate": 1.8504950495049505e-06, "loss": 0.0013, "step": 748 }, { "epoch": 3.676073619631902, "grad_norm": 0.0967707633972168, "learning_rate": 1.8435643564356435e-06, "loss": 0.0039, "step": 749 }, { "epoch": 3.6809815950920246, "grad_norm": 0.037127815186977386, "learning_rate": 1.8366336633663366e-06, "loss": 0.0012, "step": 750 }, { "epoch": 3.6858895705521473, "grad_norm": 0.013236461207270622, "learning_rate": 1.8297029702970297e-06, "loss": 0.0045, "step": 751 }, { "epoch": 3.69079754601227, "grad_norm": 0.031925372779369354, "learning_rate": 1.8227722772277228e-06, "loss": 0.002, "step": 752 }, { "epoch": 3.6957055214723926, "grad_norm": 0.023648735135793686, "learning_rate": 1.8158415841584158e-06, "loss": 0.0012, "step": 753 }, { "epoch": 3.7006134969325153, "grad_norm": 0.01484636776149273, "learning_rate": 1.8089108910891089e-06, "loss": 0.0011, "step": 754 }, { "epoch": 3.705521472392638, "grad_norm": 0.02288316749036312, "learning_rate": 1.801980198019802e-06, "loss": 0.0018, "step": 755 }, { "epoch": 3.7104294478527606, "grad_norm": 0.005614751018583775, "learning_rate": 1.795049504950495e-06, "loss": 0.001, "step": 756 }, { "epoch": 3.7153374233128833, "grad_norm": 0.03587134927511215, "learning_rate": 1.788118811881188e-06, "loss": 0.0013, "step": 757 }, { "epoch": 3.720245398773006, "grad_norm": 0.048482466489076614, "learning_rate": 1.7811881188118812e-06, "loss": 0.0014, "step": 758 }, { "epoch": 3.7251533742331286, "grad_norm": 0.06541978567838669, "learning_rate": 1.7742574257425742e-06, "loss": 0.0023, "step": 759 }, { "epoch": 3.7300613496932513, "grad_norm": 0.040501050651073456, "learning_rate": 1.7673267326732673e-06, "loss": 0.0014, "step": 760 }, { "epoch": 3.734969325153374, "grad_norm": 0.006551014259457588, "learning_rate": 1.7603960396039604e-06, "loss": 0.0009, "step": 761 }, { "epoch": 3.7398773006134967, "grad_norm": 0.11849401146173477, "learning_rate": 1.7534653465346534e-06, "loss": 0.0099, "step": 762 }, { "epoch": 3.7447852760736198, "grad_norm": 0.004786093719303608, "learning_rate": 1.7465346534653465e-06, "loss": 0.0008, "step": 763 }, { "epoch": 3.7496932515337424, "grad_norm": 0.02577151544392109, "learning_rate": 1.7396039603960396e-06, "loss": 0.0016, "step": 764 }, { "epoch": 3.754601226993865, "grad_norm": 0.014097603037953377, "learning_rate": 1.7326732673267326e-06, "loss": 0.0008, "step": 765 }, { "epoch": 3.759509202453988, "grad_norm": 0.05258313938975334, "learning_rate": 1.7257425742574257e-06, "loss": 0.0009, "step": 766 }, { "epoch": 3.7644171779141105, "grad_norm": 0.09022804349660873, "learning_rate": 1.7188118811881188e-06, "loss": 0.005, "step": 767 }, { "epoch": 3.769325153374233, "grad_norm": 0.008886247873306274, "learning_rate": 1.7118811881188119e-06, "loss": 0.0008, "step": 768 }, { "epoch": 3.774233128834356, "grad_norm": 0.036997053772211075, "learning_rate": 1.704950495049505e-06, "loss": 0.0011, "step": 769 }, { "epoch": 3.7791411042944785, "grad_norm": 0.05569405481219292, "learning_rate": 1.698019801980198e-06, "loss": 0.0018, "step": 770 }, { "epoch": 3.784049079754601, "grad_norm": 0.0031505110673606396, "learning_rate": 1.691089108910891e-06, "loss": 0.0004, "step": 771 }, { "epoch": 3.788957055214724, "grad_norm": 0.014605509117245674, "learning_rate": 1.6841584158415841e-06, "loss": 0.0011, "step": 772 }, { "epoch": 3.7938650306748465, "grad_norm": 0.09325973689556122, "learning_rate": 1.6772277227722772e-06, "loss": 0.0052, "step": 773 }, { "epoch": 3.7987730061349696, "grad_norm": 0.059272442013025284, "learning_rate": 1.6702970297029703e-06, "loss": 0.0022, "step": 774 }, { "epoch": 3.8036809815950923, "grad_norm": 0.01452575996518135, "learning_rate": 1.6633663366336633e-06, "loss": 0.0021, "step": 775 }, { "epoch": 3.808588957055215, "grad_norm": 0.17578046023845673, "learning_rate": 1.6564356435643564e-06, "loss": 0.0093, "step": 776 }, { "epoch": 3.8134969325153376, "grad_norm": 0.007930277846753597, "learning_rate": 1.6495049504950495e-06, "loss": 0.0006, "step": 777 }, { "epoch": 3.8184049079754603, "grad_norm": 0.07230112701654434, "learning_rate": 1.6425742574257425e-06, "loss": 0.0025, "step": 778 }, { "epoch": 3.823312883435583, "grad_norm": 0.03507319092750549, "learning_rate": 1.6356435643564358e-06, "loss": 0.0017, "step": 779 }, { "epoch": 3.8282208588957056, "grad_norm": 0.06336654722690582, "learning_rate": 1.6287128712871287e-06, "loss": 0.0025, "step": 780 }, { "epoch": 3.8331288343558283, "grad_norm": 0.14077608287334442, "learning_rate": 1.6217821782178217e-06, "loss": 0.0086, "step": 781 }, { "epoch": 3.838036809815951, "grad_norm": 0.015772581100463867, "learning_rate": 1.6148514851485148e-06, "loss": 0.001, "step": 782 }, { "epoch": 3.8429447852760736, "grad_norm": 0.01927962154150009, "learning_rate": 1.6079207920792079e-06, "loss": 0.0015, "step": 783 }, { "epoch": 3.8478527607361963, "grad_norm": 0.011015449650585651, "learning_rate": 1.600990099009901e-06, "loss": 0.0008, "step": 784 }, { "epoch": 3.852760736196319, "grad_norm": 0.40098482370376587, "learning_rate": 1.594059405940594e-06, "loss": 0.0314, "step": 785 }, { "epoch": 3.8576687116564417, "grad_norm": 0.02672453783452511, "learning_rate": 1.587128712871287e-06, "loss": 0.0025, "step": 786 }, { "epoch": 3.8625766871165643, "grad_norm": 0.022412395104765892, "learning_rate": 1.5801980198019802e-06, "loss": 0.0013, "step": 787 }, { "epoch": 3.867484662576687, "grad_norm": 0.023978037759661674, "learning_rate": 1.5732673267326732e-06, "loss": 0.0017, "step": 788 }, { "epoch": 3.8723926380368097, "grad_norm": 0.017764659598469734, "learning_rate": 1.5663366336633663e-06, "loss": 0.002, "step": 789 }, { "epoch": 3.8773006134969323, "grad_norm": 0.012586713768541813, "learning_rate": 1.5594059405940594e-06, "loss": 0.0008, "step": 790 }, { "epoch": 3.882208588957055, "grad_norm": 0.056462038308382034, "learning_rate": 1.5524752475247524e-06, "loss": 0.0036, "step": 791 }, { "epoch": 3.8871165644171777, "grad_norm": 0.05329478159546852, "learning_rate": 1.5455445544554455e-06, "loss": 0.0041, "step": 792 }, { "epoch": 3.8920245398773003, "grad_norm": 0.0013215028448030353, "learning_rate": 1.5386138613861386e-06, "loss": 0.0004, "step": 793 }, { "epoch": 3.8969325153374235, "grad_norm": 0.05318621173501015, "learning_rate": 1.5316831683168316e-06, "loss": 0.0011, "step": 794 }, { "epoch": 3.901840490797546, "grad_norm": 0.3169184625148773, "learning_rate": 1.5247524752475247e-06, "loss": 0.0221, "step": 795 }, { "epoch": 3.906748466257669, "grad_norm": 0.04726627469062805, "learning_rate": 1.5178217821782178e-06, "loss": 0.0015, "step": 796 }, { "epoch": 3.9116564417177915, "grad_norm": 0.13995185494422913, "learning_rate": 1.5108910891089108e-06, "loss": 0.0092, "step": 797 }, { "epoch": 3.916564417177914, "grad_norm": 0.01544391643255949, "learning_rate": 1.503960396039604e-06, "loss": 0.0012, "step": 798 }, { "epoch": 3.921472392638037, "grad_norm": 0.1588226556777954, "learning_rate": 1.497029702970297e-06, "loss": 0.0087, "step": 799 }, { "epoch": 3.9263803680981595, "grad_norm": 0.011546803638339043, "learning_rate": 1.49009900990099e-06, "loss": 0.0015, "step": 800 }, { "epoch": 3.931288343558282, "grad_norm": 0.04798766225576401, "learning_rate": 1.4831683168316831e-06, "loss": 0.0013, "step": 801 }, { "epoch": 3.936196319018405, "grad_norm": 0.01064328383654356, "learning_rate": 1.4762376237623762e-06, "loss": 0.0011, "step": 802 }, { "epoch": 3.9411042944785275, "grad_norm": 0.1379479169845581, "learning_rate": 1.4693069306930693e-06, "loss": 0.0142, "step": 803 }, { "epoch": 3.94601226993865, "grad_norm": 0.054966770112514496, "learning_rate": 1.4623762376237623e-06, "loss": 0.0014, "step": 804 }, { "epoch": 3.950920245398773, "grad_norm": 0.035458799451589584, "learning_rate": 1.4554455445544554e-06, "loss": 0.002, "step": 805 }, { "epoch": 3.955828220858896, "grad_norm": 0.011258352547883987, "learning_rate": 1.4485148514851485e-06, "loss": 0.0005, "step": 806 }, { "epoch": 3.9607361963190186, "grad_norm": 0.022768640890717506, "learning_rate": 1.4415841584158415e-06, "loss": 0.0009, "step": 807 }, { "epoch": 3.9656441717791413, "grad_norm": 0.0772656723856926, "learning_rate": 1.4346534653465346e-06, "loss": 0.0014, "step": 808 }, { "epoch": 3.970552147239264, "grad_norm": 0.06587695330381393, "learning_rate": 1.4277227722772277e-06, "loss": 0.0034, "step": 809 }, { "epoch": 3.9754601226993866, "grad_norm": 0.01118537038564682, "learning_rate": 1.420792079207921e-06, "loss": 0.0009, "step": 810 }, { "epoch": 3.9803680981595093, "grad_norm": 0.06560896337032318, "learning_rate": 1.413861386138614e-06, "loss": 0.0019, "step": 811 }, { "epoch": 3.985276073619632, "grad_norm": 0.014048455283045769, "learning_rate": 1.406930693069307e-06, "loss": 0.0018, "step": 812 }, { "epoch": 3.9901840490797547, "grad_norm": 0.01656423695385456, "learning_rate": 1.4000000000000001e-06, "loss": 0.001, "step": 813 }, { "epoch": 3.9950920245398773, "grad_norm": 0.0036234341096132994, "learning_rate": 1.3930693069306932e-06, "loss": 0.0007, "step": 814 }, { "epoch": 4.0, "grad_norm": 0.012785837985575199, "learning_rate": 1.3861386138613863e-06, "loss": 0.0009, "step": 815 }, { "epoch": 4.004907975460123, "grad_norm": 0.025322729721665382, "learning_rate": 1.3792079207920791e-06, "loss": 0.002, "step": 816 }, { "epoch": 4.009815950920245, "grad_norm": 0.2052641361951828, "learning_rate": 1.3722772277227722e-06, "loss": 0.0056, "step": 817 }, { "epoch": 4.014723926380368, "grad_norm": 0.057693980634212494, "learning_rate": 1.3653465346534653e-06, "loss": 0.0021, "step": 818 }, { "epoch": 4.019631901840491, "grad_norm": 0.009920844808220863, "learning_rate": 1.3584158415841583e-06, "loss": 0.0009, "step": 819 }, { "epoch": 4.024539877300613, "grad_norm": 0.18843849003314972, "learning_rate": 1.3514851485148514e-06, "loss": 0.0148, "step": 820 }, { "epoch": 4.029447852760736, "grad_norm": 0.003511168295517564, "learning_rate": 1.3445544554455445e-06, "loss": 0.0003, "step": 821 }, { "epoch": 4.034355828220859, "grad_norm": 0.02023676596581936, "learning_rate": 1.3376237623762376e-06, "loss": 0.001, "step": 822 }, { "epoch": 4.039263803680981, "grad_norm": 0.010772217065095901, "learning_rate": 1.3306930693069306e-06, "loss": 0.0013, "step": 823 }, { "epoch": 4.044171779141104, "grad_norm": 0.023414717987179756, "learning_rate": 1.3237623762376237e-06, "loss": 0.0023, "step": 824 }, { "epoch": 4.049079754601227, "grad_norm": 0.019114743918180466, "learning_rate": 1.3168316831683168e-06, "loss": 0.0007, "step": 825 }, { "epoch": 4.053987730061349, "grad_norm": 0.012856281362473965, "learning_rate": 1.3099009900990098e-06, "loss": 0.0015, "step": 826 }, { "epoch": 4.058895705521472, "grad_norm": 0.00855772290378809, "learning_rate": 1.302970297029703e-06, "loss": 0.0015, "step": 827 }, { "epoch": 4.063803680981595, "grad_norm": 0.021148694679141045, "learning_rate": 1.296039603960396e-06, "loss": 0.0012, "step": 828 }, { "epoch": 4.068711656441717, "grad_norm": 0.07430653274059296, "learning_rate": 1.289108910891089e-06, "loss": 0.0049, "step": 829 }, { "epoch": 4.07361963190184, "grad_norm": 0.10033933073282242, "learning_rate": 1.282178217821782e-06, "loss": 0.0026, "step": 830 }, { "epoch": 4.078527607361964, "grad_norm": 0.006094958167523146, "learning_rate": 1.2752475247524752e-06, "loss": 0.0006, "step": 831 }, { "epoch": 4.083435582822086, "grad_norm": 0.06336677074432373, "learning_rate": 1.2683168316831682e-06, "loss": 0.0038, "step": 832 }, { "epoch": 4.088343558282209, "grad_norm": 0.005283738486468792, "learning_rate": 1.2613861386138613e-06, "loss": 0.0015, "step": 833 }, { "epoch": 4.093251533742332, "grad_norm": 0.006675936747342348, "learning_rate": 1.2544554455445544e-06, "loss": 0.0006, "step": 834 }, { "epoch": 4.098159509202454, "grad_norm": 0.008660698309540749, "learning_rate": 1.2475247524752474e-06, "loss": 0.0018, "step": 835 }, { "epoch": 4.103067484662577, "grad_norm": 0.02305518463253975, "learning_rate": 1.2405940594059405e-06, "loss": 0.0007, "step": 836 }, { "epoch": 4.1079754601227, "grad_norm": 0.024816259741783142, "learning_rate": 1.2336633663366336e-06, "loss": 0.0007, "step": 837 }, { "epoch": 4.112883435582822, "grad_norm": 0.026276560500264168, "learning_rate": 1.2267326732673267e-06, "loss": 0.0016, "step": 838 }, { "epoch": 4.117791411042945, "grad_norm": 0.029642153531312943, "learning_rate": 1.2198019801980197e-06, "loss": 0.0038, "step": 839 }, { "epoch": 4.122699386503068, "grad_norm": 0.05285963416099548, "learning_rate": 1.2128712871287128e-06, "loss": 0.0019, "step": 840 }, { "epoch": 4.12760736196319, "grad_norm": 0.014620939269661903, "learning_rate": 1.2059405940594059e-06, "loss": 0.0012, "step": 841 }, { "epoch": 4.132515337423313, "grad_norm": 0.012532511726021767, "learning_rate": 1.1990099009900991e-06, "loss": 0.0007, "step": 842 }, { "epoch": 4.137423312883436, "grad_norm": 0.04022945091128349, "learning_rate": 1.1920792079207922e-06, "loss": 0.0032, "step": 843 }, { "epoch": 4.142331288343558, "grad_norm": 0.021105729043483734, "learning_rate": 1.1851485148514853e-06, "loss": 0.0015, "step": 844 }, { "epoch": 4.147239263803681, "grad_norm": 0.07277761399745941, "learning_rate": 1.1782178217821783e-06, "loss": 0.0038, "step": 845 }, { "epoch": 4.152147239263804, "grad_norm": 0.015429302118718624, "learning_rate": 1.1712871287128714e-06, "loss": 0.0014, "step": 846 }, { "epoch": 4.157055214723926, "grad_norm": 0.02602989971637726, "learning_rate": 1.1643564356435645e-06, "loss": 0.0013, "step": 847 }, { "epoch": 4.161963190184049, "grad_norm": 0.018687183037400246, "learning_rate": 1.1574257425742575e-06, "loss": 0.0016, "step": 848 }, { "epoch": 4.166871165644172, "grad_norm": 0.019744986668229103, "learning_rate": 1.1504950495049506e-06, "loss": 0.0013, "step": 849 }, { "epoch": 4.171779141104294, "grad_norm": 0.029573217034339905, "learning_rate": 1.1435643564356437e-06, "loss": 0.0023, "step": 850 }, { "epoch": 4.176687116564417, "grad_norm": 0.020479142665863037, "learning_rate": 1.1366336633663368e-06, "loss": 0.0011, "step": 851 }, { "epoch": 4.18159509202454, "grad_norm": 0.11432457715272903, "learning_rate": 1.1297029702970298e-06, "loss": 0.0041, "step": 852 }, { "epoch": 4.186503067484662, "grad_norm": 0.012511249631643295, "learning_rate": 1.1227722772277227e-06, "loss": 0.0006, "step": 853 }, { "epoch": 4.191411042944785, "grad_norm": 0.01595146209001541, "learning_rate": 1.1158415841584157e-06, "loss": 0.0009, "step": 854 }, { "epoch": 4.196319018404908, "grad_norm": 0.007092094514518976, "learning_rate": 1.1089108910891088e-06, "loss": 0.0007, "step": 855 }, { "epoch": 4.20122699386503, "grad_norm": 0.03979247063398361, "learning_rate": 1.1019801980198019e-06, "loss": 0.0018, "step": 856 }, { "epoch": 4.206134969325153, "grad_norm": 0.008829467929899693, "learning_rate": 1.095049504950495e-06, "loss": 0.0007, "step": 857 }, { "epoch": 4.211042944785276, "grad_norm": 0.09763351082801819, "learning_rate": 1.088118811881188e-06, "loss": 0.0048, "step": 858 }, { "epoch": 4.215950920245398, "grad_norm": 0.08337781578302383, "learning_rate": 1.081188118811881e-06, "loss": 0.0014, "step": 859 }, { "epoch": 4.220858895705521, "grad_norm": 0.029353009536862373, "learning_rate": 1.0742574257425742e-06, "loss": 0.001, "step": 860 }, { "epoch": 4.225766871165644, "grad_norm": 0.121429443359375, "learning_rate": 1.0673267326732672e-06, "loss": 0.0148, "step": 861 }, { "epoch": 4.230674846625767, "grad_norm": 0.01580023020505905, "learning_rate": 1.0603960396039603e-06, "loss": 0.001, "step": 862 }, { "epoch": 4.23558282208589, "grad_norm": 0.013746123760938644, "learning_rate": 1.0534653465346534e-06, "loss": 0.0012, "step": 863 }, { "epoch": 4.240490797546013, "grad_norm": 0.011870300397276878, "learning_rate": 1.0465346534653464e-06, "loss": 0.0007, "step": 864 }, { "epoch": 4.245398773006135, "grad_norm": 0.05088931694626808, "learning_rate": 1.0396039603960395e-06, "loss": 0.0017, "step": 865 }, { "epoch": 4.250306748466258, "grad_norm": 0.38851794600486755, "learning_rate": 1.0326732673267326e-06, "loss": 0.014, "step": 866 }, { "epoch": 4.255214723926381, "grad_norm": 0.01347925141453743, "learning_rate": 1.0257425742574256e-06, "loss": 0.0012, "step": 867 }, { "epoch": 4.260122699386503, "grad_norm": 0.010402753949165344, "learning_rate": 1.0188118811881187e-06, "loss": 0.0007, "step": 868 }, { "epoch": 4.265030674846626, "grad_norm": 0.03338263928890228, "learning_rate": 1.0118811881188118e-06, "loss": 0.0017, "step": 869 }, { "epoch": 4.269938650306749, "grad_norm": 0.007806051056832075, "learning_rate": 1.0049504950495048e-06, "loss": 0.0006, "step": 870 }, { "epoch": 4.274846625766871, "grad_norm": 0.0339755155146122, "learning_rate": 9.98019801980198e-07, "loss": 0.0027, "step": 871 }, { "epoch": 4.279754601226994, "grad_norm": 0.02654801867902279, "learning_rate": 9.91089108910891e-07, "loss": 0.0012, "step": 872 }, { "epoch": 4.284662576687117, "grad_norm": 0.018412116914987564, "learning_rate": 9.841584158415843e-07, "loss": 0.0022, "step": 873 }, { "epoch": 4.289570552147239, "grad_norm": 0.0640820562839508, "learning_rate": 9.772277227722773e-07, "loss": 0.0021, "step": 874 }, { "epoch": 4.294478527607362, "grad_norm": 0.03333529084920883, "learning_rate": 9.702970297029704e-07, "loss": 0.0023, "step": 875 }, { "epoch": 4.299386503067485, "grad_norm": 0.022033028304576874, "learning_rate": 9.633663366336635e-07, "loss": 0.0007, "step": 876 }, { "epoch": 4.304294478527607, "grad_norm": 0.010194915346801281, "learning_rate": 9.564356435643565e-07, "loss": 0.0008, "step": 877 }, { "epoch": 4.30920245398773, "grad_norm": 0.015077208168804646, "learning_rate": 9.495049504950496e-07, "loss": 0.0012, "step": 878 }, { "epoch": 4.314110429447853, "grad_norm": 0.029076164588332176, "learning_rate": 9.425742574257426e-07, "loss": 0.0015, "step": 879 }, { "epoch": 4.319018404907975, "grad_norm": 0.0363786481320858, "learning_rate": 9.356435643564356e-07, "loss": 0.001, "step": 880 }, { "epoch": 4.323926380368098, "grad_norm": 0.21520403027534485, "learning_rate": 9.287128712871287e-07, "loss": 0.0094, "step": 881 }, { "epoch": 4.328834355828221, "grad_norm": 0.003572958754375577, "learning_rate": 9.217821782178218e-07, "loss": 0.0005, "step": 882 }, { "epoch": 4.333742331288343, "grad_norm": 0.01643703132867813, "learning_rate": 9.148514851485148e-07, "loss": 0.0009, "step": 883 }, { "epoch": 4.338650306748466, "grad_norm": 0.13475348055362701, "learning_rate": 9.079207920792079e-07, "loss": 0.0077, "step": 884 }, { "epoch": 4.343558282208589, "grad_norm": 0.10863371193408966, "learning_rate": 9.00990099009901e-07, "loss": 0.0294, "step": 885 }, { "epoch": 4.348466257668711, "grad_norm": 0.02765970304608345, "learning_rate": 8.94059405940594e-07, "loss": 0.0017, "step": 886 }, { "epoch": 4.353374233128834, "grad_norm": 0.011608476750552654, "learning_rate": 8.871287128712871e-07, "loss": 0.001, "step": 887 }, { "epoch": 4.358282208588957, "grad_norm": 0.005024611949920654, "learning_rate": 8.801980198019802e-07, "loss": 0.0006, "step": 888 }, { "epoch": 4.363190184049079, "grad_norm": 0.007748506963253021, "learning_rate": 8.732673267326733e-07, "loss": 0.0006, "step": 889 }, { "epoch": 4.368098159509202, "grad_norm": 0.23455409705638885, "learning_rate": 8.663366336633663e-07, "loss": 0.0057, "step": 890 }, { "epoch": 4.373006134969325, "grad_norm": 0.03380454331636429, "learning_rate": 8.594059405940594e-07, "loss": 0.0011, "step": 891 }, { "epoch": 4.3779141104294474, "grad_norm": 0.03481479734182358, "learning_rate": 8.524752475247525e-07, "loss": 0.0016, "step": 892 }, { "epoch": 4.38282208588957, "grad_norm": 0.022679351270198822, "learning_rate": 8.455445544554455e-07, "loss": 0.0022, "step": 893 }, { "epoch": 4.387730061349693, "grad_norm": 0.093803271651268, "learning_rate": 8.386138613861386e-07, "loss": 0.0022, "step": 894 }, { "epoch": 4.392638036809816, "grad_norm": 0.05329536274075508, "learning_rate": 8.316831683168317e-07, "loss": 0.0018, "step": 895 }, { "epoch": 4.397546012269939, "grad_norm": 0.05470538139343262, "learning_rate": 8.247524752475247e-07, "loss": 0.0014, "step": 896 }, { "epoch": 4.402453987730062, "grad_norm": 0.02288208343088627, "learning_rate": 8.178217821782179e-07, "loss": 0.0014, "step": 897 }, { "epoch": 4.407361963190184, "grad_norm": 0.04405367746949196, "learning_rate": 8.108910891089109e-07, "loss": 0.001, "step": 898 }, { "epoch": 4.412269938650307, "grad_norm": 0.030512019991874695, "learning_rate": 8.039603960396039e-07, "loss": 0.0013, "step": 899 }, { "epoch": 4.41717791411043, "grad_norm": 0.12844492495059967, "learning_rate": 7.97029702970297e-07, "loss": 0.0043, "step": 900 }, { "epoch": 4.422085889570552, "grad_norm": 0.02055547758936882, "learning_rate": 7.900990099009901e-07, "loss": 0.001, "step": 901 }, { "epoch": 4.426993865030675, "grad_norm": 0.04747156798839569, "learning_rate": 7.831683168316831e-07, "loss": 0.0061, "step": 902 }, { "epoch": 4.431901840490798, "grad_norm": 0.010140195488929749, "learning_rate": 7.762376237623762e-07, "loss": 0.001, "step": 903 }, { "epoch": 4.43680981595092, "grad_norm": 0.009971629828214645, "learning_rate": 7.693069306930693e-07, "loss": 0.0011, "step": 904 }, { "epoch": 4.441717791411043, "grad_norm": 0.006146845407783985, "learning_rate": 7.623762376237624e-07, "loss": 0.0004, "step": 905 }, { "epoch": 4.446625766871166, "grad_norm": 0.027412964031100273, "learning_rate": 7.554455445544554e-07, "loss": 0.0024, "step": 906 }, { "epoch": 4.451533742331288, "grad_norm": 0.021934248507022858, "learning_rate": 7.485148514851485e-07, "loss": 0.0012, "step": 907 }, { "epoch": 4.456441717791411, "grad_norm": 0.3181805908679962, "learning_rate": 7.415841584158416e-07, "loss": 0.0261, "step": 908 }, { "epoch": 4.461349693251534, "grad_norm": 0.010769632644951344, "learning_rate": 7.346534653465346e-07, "loss": 0.0008, "step": 909 }, { "epoch": 4.466257668711656, "grad_norm": 0.15605410933494568, "learning_rate": 7.277227722772277e-07, "loss": 0.006, "step": 910 }, { "epoch": 4.471165644171779, "grad_norm": 0.1743585467338562, "learning_rate": 7.207920792079208e-07, "loss": 0.0075, "step": 911 }, { "epoch": 4.476073619631902, "grad_norm": 0.011531657539308071, "learning_rate": 7.138613861386138e-07, "loss": 0.0008, "step": 912 }, { "epoch": 4.480981595092024, "grad_norm": 0.15488475561141968, "learning_rate": 7.06930693069307e-07, "loss": 0.0054, "step": 913 }, { "epoch": 4.485889570552147, "grad_norm": 0.12285412847995758, "learning_rate": 7.000000000000001e-07, "loss": 0.0067, "step": 914 }, { "epoch": 4.49079754601227, "grad_norm": 0.02667032927274704, "learning_rate": 6.930693069306931e-07, "loss": 0.001, "step": 915 }, { "epoch": 4.495705521472392, "grad_norm": 0.027680931612849236, "learning_rate": 6.861386138613861e-07, "loss": 0.0014, "step": 916 }, { "epoch": 4.500613496932515, "grad_norm": 0.01782669499516487, "learning_rate": 6.792079207920792e-07, "loss": 0.0019, "step": 917 }, { "epoch": 4.505521472392638, "grad_norm": 0.061316560953855515, "learning_rate": 6.722772277227722e-07, "loss": 0.0026, "step": 918 }, { "epoch": 4.5104294478527605, "grad_norm": 0.052529476583004, "learning_rate": 6.653465346534653e-07, "loss": 0.0023, "step": 919 }, { "epoch": 4.515337423312883, "grad_norm": 0.0037185668479651213, "learning_rate": 6.584158415841584e-07, "loss": 0.001, "step": 920 }, { "epoch": 4.520245398773006, "grad_norm": 0.06986022740602493, "learning_rate": 6.514851485148514e-07, "loss": 0.0022, "step": 921 }, { "epoch": 4.5251533742331285, "grad_norm": 0.006407030858099461, "learning_rate": 6.445544554455445e-07, "loss": 0.0005, "step": 922 }, { "epoch": 4.530061349693252, "grad_norm": 0.005136528518050909, "learning_rate": 6.376237623762376e-07, "loss": 0.0007, "step": 923 }, { "epoch": 4.534969325153375, "grad_norm": 0.063414067029953, "learning_rate": 6.306930693069307e-07, "loss": 0.0031, "step": 924 }, { "epoch": 4.539877300613497, "grad_norm": 0.029516983777284622, "learning_rate": 6.237623762376237e-07, "loss": 0.0015, "step": 925 }, { "epoch": 4.54478527607362, "grad_norm": 0.07440595328807831, "learning_rate": 6.168316831683168e-07, "loss": 0.0016, "step": 926 }, { "epoch": 4.549693251533743, "grad_norm": 0.044622018933296204, "learning_rate": 6.099009900990099e-07, "loss": 0.0023, "step": 927 }, { "epoch": 4.554601226993865, "grad_norm": 0.04841621220111847, "learning_rate": 6.029702970297029e-07, "loss": 0.0049, "step": 928 }, { "epoch": 4.559509202453988, "grad_norm": 0.026555247604846954, "learning_rate": 5.960396039603961e-07, "loss": 0.001, "step": 929 }, { "epoch": 4.564417177914111, "grad_norm": 0.1271572858095169, "learning_rate": 5.891089108910892e-07, "loss": 0.0026, "step": 930 }, { "epoch": 4.569325153374233, "grad_norm": 0.015365286730229855, "learning_rate": 5.821782178217822e-07, "loss": 0.0017, "step": 931 }, { "epoch": 4.574233128834356, "grad_norm": 0.024656543508172035, "learning_rate": 5.752475247524753e-07, "loss": 0.0013, "step": 932 }, { "epoch": 4.579141104294479, "grad_norm": 0.05672885477542877, "learning_rate": 5.683168316831684e-07, "loss": 0.0029, "step": 933 }, { "epoch": 4.584049079754601, "grad_norm": 0.023147093132138252, "learning_rate": 5.613861386138613e-07, "loss": 0.0026, "step": 934 }, { "epoch": 4.588957055214724, "grad_norm": 0.13222621381282806, "learning_rate": 5.544554455445544e-07, "loss": 0.0144, "step": 935 }, { "epoch": 4.593865030674847, "grad_norm": 0.022834930568933487, "learning_rate": 5.475247524752475e-07, "loss": 0.0012, "step": 936 }, { "epoch": 4.598773006134969, "grad_norm": 0.014577291905879974, "learning_rate": 5.405940594059405e-07, "loss": 0.0009, "step": 937 }, { "epoch": 4.603680981595092, "grad_norm": 0.026476260274648666, "learning_rate": 5.336633663366336e-07, "loss": 0.0015, "step": 938 }, { "epoch": 4.608588957055215, "grad_norm": 0.0048033553175628185, "learning_rate": 5.267326732673267e-07, "loss": 0.0001, "step": 939 }, { "epoch": 4.613496932515337, "grad_norm": 0.05348242446780205, "learning_rate": 5.198019801980198e-07, "loss": 0.0017, "step": 940 }, { "epoch": 4.61840490797546, "grad_norm": 0.027225324884057045, "learning_rate": 5.128712871287128e-07, "loss": 0.0012, "step": 941 }, { "epoch": 4.623312883435583, "grad_norm": 0.10507699847221375, "learning_rate": 5.059405940594059e-07, "loss": 0.0071, "step": 942 }, { "epoch": 4.6282208588957054, "grad_norm": 0.12814861536026, "learning_rate": 4.99009900990099e-07, "loss": 0.0073, "step": 943 }, { "epoch": 4.633128834355828, "grad_norm": 0.03401525318622589, "learning_rate": 4.920792079207921e-07, "loss": 0.0014, "step": 944 }, { "epoch": 4.638036809815951, "grad_norm": 0.021818110719323158, "learning_rate": 4.851485148514852e-07, "loss": 0.001, "step": 945 }, { "epoch": 4.6429447852760735, "grad_norm": 0.13617785274982452, "learning_rate": 4.782178217821783e-07, "loss": 0.0047, "step": 946 }, { "epoch": 4.647852760736196, "grad_norm": 0.15220874547958374, "learning_rate": 4.712871287128713e-07, "loss": 0.0053, "step": 947 }, { "epoch": 4.652760736196319, "grad_norm": 0.30194413661956787, "learning_rate": 4.6435643564356435e-07, "loss": 0.0218, "step": 948 }, { "epoch": 4.6576687116564415, "grad_norm": 0.1869322508573532, "learning_rate": 4.574257425742574e-07, "loss": 0.0622, "step": 949 }, { "epoch": 4.662576687116564, "grad_norm": 0.024376358836889267, "learning_rate": 4.504950495049505e-07, "loss": 0.001, "step": 950 }, { "epoch": 4.667484662576687, "grad_norm": 0.04647885262966156, "learning_rate": 4.4356435643564356e-07, "loss": 0.0019, "step": 951 }, { "epoch": 4.6723926380368095, "grad_norm": 0.09357151389122009, "learning_rate": 4.3663366336633663e-07, "loss": 0.0046, "step": 952 }, { "epoch": 4.677300613496932, "grad_norm": 0.12137161940336227, "learning_rate": 4.297029702970297e-07, "loss": 0.0024, "step": 953 }, { "epoch": 4.682208588957055, "grad_norm": 0.017510604113340378, "learning_rate": 4.2277227722772276e-07, "loss": 0.0007, "step": 954 }, { "epoch": 4.6871165644171775, "grad_norm": 0.011096769012510777, "learning_rate": 4.1584158415841583e-07, "loss": 0.0009, "step": 955 }, { "epoch": 4.6920245398773, "grad_norm": 0.075267493724823, "learning_rate": 4.0891089108910895e-07, "loss": 0.0019, "step": 956 }, { "epoch": 4.696932515337423, "grad_norm": 0.0064629483968019485, "learning_rate": 4.0198019801980197e-07, "loss": 0.0017, "step": 957 }, { "epoch": 4.7018404907975455, "grad_norm": 0.018965771421790123, "learning_rate": 3.9504950495049504e-07, "loss": 0.0021, "step": 958 }, { "epoch": 4.706748466257669, "grad_norm": 0.06010276451706886, "learning_rate": 3.881188118811881e-07, "loss": 0.0014, "step": 959 }, { "epoch": 4.711656441717792, "grad_norm": 0.03155827522277832, "learning_rate": 3.811881188118812e-07, "loss": 0.0017, "step": 960 }, { "epoch": 4.716564417177914, "grad_norm": 0.1321091651916504, "learning_rate": 3.7425742574257424e-07, "loss": 0.0066, "step": 961 }, { "epoch": 4.721472392638037, "grad_norm": 0.018097804859280586, "learning_rate": 3.673267326732673e-07, "loss": 0.0011, "step": 962 }, { "epoch": 4.72638036809816, "grad_norm": 0.05625467747449875, "learning_rate": 3.603960396039604e-07, "loss": 0.0021, "step": 963 }, { "epoch": 4.731288343558282, "grad_norm": 0.010586952790617943, "learning_rate": 3.534653465346535e-07, "loss": 0.0004, "step": 964 }, { "epoch": 4.736196319018405, "grad_norm": 0.0075930338352918625, "learning_rate": 3.4653465346534657e-07, "loss": 0.001, "step": 965 }, { "epoch": 4.741104294478528, "grad_norm": 0.011664043180644512, "learning_rate": 3.396039603960396e-07, "loss": 0.0014, "step": 966 }, { "epoch": 4.74601226993865, "grad_norm": 0.04409307986497879, "learning_rate": 3.3267326732673266e-07, "loss": 0.0017, "step": 967 }, { "epoch": 4.750920245398773, "grad_norm": 0.01488639134913683, "learning_rate": 3.257425742574257e-07, "loss": 0.0012, "step": 968 }, { "epoch": 4.755828220858896, "grad_norm": 0.02663021720945835, "learning_rate": 3.188118811881188e-07, "loss": 0.0014, "step": 969 }, { "epoch": 4.7607361963190185, "grad_norm": 0.019124912098050117, "learning_rate": 3.1188118811881186e-07, "loss": 0.0013, "step": 970 }, { "epoch": 4.765644171779141, "grad_norm": 0.023948566988110542, "learning_rate": 3.0495049504950493e-07, "loss": 0.0027, "step": 971 }, { "epoch": 4.770552147239264, "grad_norm": 0.04264827072620392, "learning_rate": 2.9801980198019805e-07, "loss": 0.0015, "step": 972 }, { "epoch": 4.7754601226993865, "grad_norm": 0.11865667253732681, "learning_rate": 2.910891089108911e-07, "loss": 0.0037, "step": 973 }, { "epoch": 4.780368098159509, "grad_norm": 0.0415462963283062, "learning_rate": 2.841584158415842e-07, "loss": 0.0027, "step": 974 }, { "epoch": 4.785276073619632, "grad_norm": 0.16526491940021515, "learning_rate": 2.772277227722772e-07, "loss": 0.0149, "step": 975 }, { "epoch": 4.7901840490797545, "grad_norm": 0.017350684851408005, "learning_rate": 2.7029702970297027e-07, "loss": 0.0008, "step": 976 }, { "epoch": 4.795092024539877, "grad_norm": 0.020096784457564354, "learning_rate": 2.6336633663366334e-07, "loss": 0.0018, "step": 977 }, { "epoch": 4.8, "grad_norm": 0.019672604277729988, "learning_rate": 2.564356435643564e-07, "loss": 0.0009, "step": 978 }, { "epoch": 4.8049079754601225, "grad_norm": 0.050094954669475555, "learning_rate": 2.495049504950495e-07, "loss": 0.0015, "step": 979 }, { "epoch": 4.809815950920245, "grad_norm": 0.07061317563056946, "learning_rate": 2.425742574257426e-07, "loss": 0.0039, "step": 980 }, { "epoch": 4.814723926380368, "grad_norm": 0.023730693385004997, "learning_rate": 2.3564356435643564e-07, "loss": 0.0012, "step": 981 }, { "epoch": 4.8196319018404905, "grad_norm": 0.02949446439743042, "learning_rate": 2.287128712871287e-07, "loss": 0.0013, "step": 982 }, { "epoch": 4.824539877300613, "grad_norm": 0.04460925608873367, "learning_rate": 2.2178217821782178e-07, "loss": 0.002, "step": 983 }, { "epoch": 4.829447852760736, "grad_norm": 0.032420773059129715, "learning_rate": 2.1485148514851485e-07, "loss": 0.0011, "step": 984 }, { "epoch": 4.8343558282208585, "grad_norm": 0.16345536708831787, "learning_rate": 2.0792079207920792e-07, "loss": 0.0052, "step": 985 }, { "epoch": 4.839263803680982, "grad_norm": 0.016612550243735313, "learning_rate": 2.0099009900990098e-07, "loss": 0.0014, "step": 986 }, { "epoch": 4.844171779141105, "grad_norm": 0.022284861654043198, "learning_rate": 1.9405940594059405e-07, "loss": 0.0015, "step": 987 }, { "epoch": 4.849079754601227, "grad_norm": 0.018358217552304268, "learning_rate": 1.8712871287128712e-07, "loss": 0.001, "step": 988 }, { "epoch": 4.85398773006135, "grad_norm": 0.012662719935178757, "learning_rate": 1.801980198019802e-07, "loss": 0.0009, "step": 989 }, { "epoch": 4.858895705521473, "grad_norm": 0.020140303298830986, "learning_rate": 1.7326732673267329e-07, "loss": 0.0014, "step": 990 }, { "epoch": 4.863803680981595, "grad_norm": 0.006523944437503815, "learning_rate": 1.6633663366336633e-07, "loss": 0.0004, "step": 991 }, { "epoch": 4.868711656441718, "grad_norm": 0.02763935551047325, "learning_rate": 1.594059405940594e-07, "loss": 0.0014, "step": 992 }, { "epoch": 4.873619631901841, "grad_norm": 0.05238136649131775, "learning_rate": 1.5247524752475246e-07, "loss": 0.0015, "step": 993 }, { "epoch": 4.8785276073619634, "grad_norm": 0.026852233335375786, "learning_rate": 1.4554455445544556e-07, "loss": 0.0011, "step": 994 }, { "epoch": 4.883435582822086, "grad_norm": 0.01658753491938114, "learning_rate": 1.386138613861386e-07, "loss": 0.0011, "step": 995 }, { "epoch": 4.888343558282209, "grad_norm": 0.025397639721632004, "learning_rate": 1.3168316831683167e-07, "loss": 0.0018, "step": 996 }, { "epoch": 4.8932515337423315, "grad_norm": 0.05166466534137726, "learning_rate": 1.2475247524752474e-07, "loss": 0.0014, "step": 997 }, { "epoch": 4.898159509202454, "grad_norm": 0.06475819647312164, "learning_rate": 1.1782178217821782e-07, "loss": 0.0017, "step": 998 }, { "epoch": 4.903067484662577, "grad_norm": 0.03307437151670456, "learning_rate": 1.1089108910891089e-07, "loss": 0.0013, "step": 999 }, { "epoch": 4.9079754601226995, "grad_norm": 0.014889244921505451, "learning_rate": 1.0396039603960396e-07, "loss": 0.0012, "step": 1000 }, { "epoch": 4.912883435582822, "grad_norm": 0.02157243713736534, "learning_rate": 9.702970297029703e-08, "loss": 0.0013, "step": 1001 }, { "epoch": 4.917791411042945, "grad_norm": 0.011875314638018608, "learning_rate": 9.00990099009901e-08, "loss": 0.0009, "step": 1002 }, { "epoch": 4.9226993865030675, "grad_norm": 0.01703159138560295, "learning_rate": 8.316831683168316e-08, "loss": 0.0015, "step": 1003 }, { "epoch": 4.92760736196319, "grad_norm": 0.042403411120176315, "learning_rate": 7.623762376237623e-08, "loss": 0.0024, "step": 1004 }, { "epoch": 4.932515337423313, "grad_norm": 0.007990752346813679, "learning_rate": 6.93069306930693e-08, "loss": 0.0011, "step": 1005 }, { "epoch": 4.9374233128834355, "grad_norm": 0.024830589070916176, "learning_rate": 6.237623762376237e-08, "loss": 0.0013, "step": 1006 }, { "epoch": 4.942331288343558, "grad_norm": 0.023844977840781212, "learning_rate": 5.5445544554455445e-08, "loss": 0.0011, "step": 1007 }, { "epoch": 4.947239263803681, "grad_norm": 0.0047842771746218204, "learning_rate": 4.8514851485148513e-08, "loss": 0.0002, "step": 1008 }, { "epoch": 4.9521472392638035, "grad_norm": 0.20831891894340515, "learning_rate": 4.158415841584158e-08, "loss": 0.0057, "step": 1009 }, { "epoch": 4.957055214723926, "grad_norm": 0.026182973757386208, "learning_rate": 3.465346534653465e-08, "loss": 0.0017, "step": 1010 }, { "epoch": 4.961963190184049, "grad_norm": 0.03703535720705986, "learning_rate": 2.7722772277227722e-08, "loss": 0.002, "step": 1011 }, { "epoch": 4.9668711656441715, "grad_norm": 0.09641406685113907, "learning_rate": 2.079207920792079e-08, "loss": 0.0031, "step": 1012 }, { "epoch": 4.971779141104294, "grad_norm": 0.007641744799911976, "learning_rate": 1.3861386138613861e-08, "loss": 0.0006, "step": 1013 }, { "epoch": 4.976687116564417, "grad_norm": 0.15017706155776978, "learning_rate": 6.930693069306931e-09, "loss": 0.0045, "step": 1014 }, { "epoch": 4.9815950920245395, "grad_norm": 0.014618399553000927, "learning_rate": 0.0, "loss": 0.001, "step": 1015 } ], "logging_steps": 1, "max_steps": 1015, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2124067535414374e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }