{ "best_metric": 1.3909703493118286, "best_model_checkpoint": "outputs/checkpoint-2955", "epoch": 0.2500211523817582, "eval_steps": 2955, "global_step": 2955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.460952703274389e-05, "grad_norm": 2.34375, "learning_rate": 4e-05, "loss": 1.347, "step": 1 }, { "epoch": 0.00016921905406548778, "grad_norm": 3.140625, "learning_rate": 8e-05, "loss": 1.4858, "step": 2 }, { "epoch": 0.00025382858109823165, "grad_norm": 2.6875, "learning_rate": 0.00012, "loss": 1.1842, "step": 3 }, { "epoch": 0.00033843810813097557, "grad_norm": 2.953125, "learning_rate": 0.00016, "loss": 1.7778, "step": 4 }, { "epoch": 0.00042304763516371943, "grad_norm": 9.8125, "learning_rate": 0.0002, "loss": 1.0771, "step": 5 }, { "epoch": 0.0005076571621964633, "grad_norm": 2.15625, "learning_rate": 0.00024, "loss": 1.7402, "step": 6 }, { "epoch": 0.0005922666892292072, "grad_norm": 2.5, "learning_rate": 0.00028000000000000003, "loss": 1.6068, "step": 7 }, { "epoch": 0.0006768762162619511, "grad_norm": 2.0625, "learning_rate": 0.00032, "loss": 1.3865, "step": 8 }, { "epoch": 0.000761485743294695, "grad_norm": 1.8203125, "learning_rate": 0.00035999999999999997, "loss": 1.5416, "step": 9 }, { "epoch": 0.0008460952703274389, "grad_norm": 2.328125, "learning_rate": 0.0004, "loss": 1.2955, "step": 10 }, { "epoch": 0.0009307047973601827, "grad_norm": 4.28125, "learning_rate": 0.00044, "loss": 2.2166, "step": 11 }, { "epoch": 0.0010153143243929266, "grad_norm": 1.671875, "learning_rate": 0.00048, "loss": 1.052, "step": 12 }, { "epoch": 0.0010999238514256705, "grad_norm": 2.328125, "learning_rate": 0.0005200000000000001, "loss": 1.5539, "step": 13 }, { "epoch": 0.0011845333784584143, "grad_norm": 1.59375, "learning_rate": 0.0005600000000000001, "loss": 1.4587, "step": 14 }, { "epoch": 0.0012691429054911584, "grad_norm": 1.453125, "learning_rate": 0.0006, "loss": 1.3232, "step": 15 }, { "epoch": 0.0013537524325239023, "grad_norm": 1.734375, "learning_rate": 0.00064, "loss": 1.5287, "step": 16 }, { "epoch": 0.0014383619595566461, "grad_norm": 1.9765625, "learning_rate": 0.00068, "loss": 1.4651, "step": 17 }, { "epoch": 0.00152297148658939, "grad_norm": 1.484375, "learning_rate": 0.0007199999999999999, "loss": 1.1937, "step": 18 }, { "epoch": 0.0016075810136221339, "grad_norm": 1.671875, "learning_rate": 0.00076, "loss": 1.3257, "step": 19 }, { "epoch": 0.0016921905406548777, "grad_norm": 1.875, "learning_rate": 0.0008, "loss": 1.4073, "step": 20 }, { "epoch": 0.0017768000676876216, "grad_norm": 1.8203125, "learning_rate": 0.00084, "loss": 1.6229, "step": 21 }, { "epoch": 0.0018614095947203655, "grad_norm": 2.0, "learning_rate": 0.00088, "loss": 1.3724, "step": 22 }, { "epoch": 0.0019460191217531093, "grad_norm": 1.734375, "learning_rate": 0.00092, "loss": 1.49, "step": 23 }, { "epoch": 0.002030628648785853, "grad_norm": 1.65625, "learning_rate": 0.00096, "loss": 1.0921, "step": 24 }, { "epoch": 0.0021152381758185973, "grad_norm": 2.6875, "learning_rate": 0.001, "loss": 1.7941, "step": 25 }, { "epoch": 0.002199847702851341, "grad_norm": 1.609375, "learning_rate": 0.000999999982261475, "loss": 1.4013, "step": 26 }, { "epoch": 0.002284457229884085, "grad_norm": 2.265625, "learning_rate": 0.0009999999290459013, "loss": 2.0351, "step": 27 }, { "epoch": 0.0023690667569168287, "grad_norm": 1.5703125, "learning_rate": 0.0009999998403532825, "loss": 1.5376, "step": 28 }, { "epoch": 0.0024536762839495727, "grad_norm": 1.796875, "learning_rate": 0.000999999716183625, "loss": 1.2436, "step": 29 }, { "epoch": 0.002538285810982317, "grad_norm": 1.5703125, "learning_rate": 0.0009999995565369377, "loss": 1.3176, "step": 30 }, { "epoch": 0.0026228953380150605, "grad_norm": 2.109375, "learning_rate": 0.000999999361413232, "loss": 1.8695, "step": 31 }, { "epoch": 0.0027075048650478046, "grad_norm": 1.875, "learning_rate": 0.0009999991308125213, "loss": 1.3461, "step": 32 }, { "epoch": 0.002792114392080548, "grad_norm": 1.875, "learning_rate": 0.0009999988647348224, "loss": 1.1417, "step": 33 }, { "epoch": 0.0028767239191132923, "grad_norm": 2.15625, "learning_rate": 0.000999998563180154, "loss": 1.2394, "step": 34 }, { "epoch": 0.002961333446146036, "grad_norm": 2.578125, "learning_rate": 0.0009999982261485374, "loss": 1.2879, "step": 35 }, { "epoch": 0.00304594297317878, "grad_norm": 2.265625, "learning_rate": 0.0009999978536399969, "loss": 1.3137, "step": 36 }, { "epoch": 0.0031305525002115237, "grad_norm": 2.640625, "learning_rate": 0.0009999974456545585, "loss": 1.4288, "step": 37 }, { "epoch": 0.0032151620272442677, "grad_norm": 4.0625, "learning_rate": 0.0009999970021922515, "loss": 1.4801, "step": 38 }, { "epoch": 0.0032997715542770114, "grad_norm": 4.09375, "learning_rate": 0.000999996523253107, "loss": 1.4042, "step": 39 }, { "epoch": 0.0033843810813097555, "grad_norm": 2.9375, "learning_rate": 0.0009999960088371593, "loss": 1.868, "step": 40 }, { "epoch": 0.0034689906083424996, "grad_norm": 3.265625, "learning_rate": 0.0009999954589444446, "loss": 1.2621, "step": 41 }, { "epoch": 0.003553600135375243, "grad_norm": 3.265625, "learning_rate": 0.0009999948735750022, "loss": 1.079, "step": 42 }, { "epoch": 0.0036382096624079873, "grad_norm": 2.09375, "learning_rate": 0.0009999942527288736, "loss": 1.3093, "step": 43 }, { "epoch": 0.003722819189440731, "grad_norm": 2.578125, "learning_rate": 0.0009999935964061027, "loss": 1.4501, "step": 44 }, { "epoch": 0.003807428716473475, "grad_norm": 2.65625, "learning_rate": 0.000999992904606736, "loss": 1.5224, "step": 45 }, { "epoch": 0.0038920382435062187, "grad_norm": 1.6875, "learning_rate": 0.0009999921773308229, "loss": 1.2122, "step": 46 }, { "epoch": 0.003976647770538962, "grad_norm": 2.25, "learning_rate": 0.000999991414578415, "loss": 1.5258, "step": 47 }, { "epoch": 0.004061257297571706, "grad_norm": 2.328125, "learning_rate": 0.000999990616349566, "loss": 1.218, "step": 48 }, { "epoch": 0.0041458668246044505, "grad_norm": 2.625, "learning_rate": 0.0009999897826443328, "loss": 1.5019, "step": 49 }, { "epoch": 0.0042304763516371946, "grad_norm": 1.765625, "learning_rate": 0.0009999889134627747, "loss": 1.1043, "step": 50 }, { "epoch": 0.004315085878669939, "grad_norm": 1.7890625, "learning_rate": 0.000999988008804953, "loss": 1.6189, "step": 51 }, { "epoch": 0.004399695405702682, "grad_norm": 1.9609375, "learning_rate": 0.0009999870686709321, "loss": 1.5236, "step": 52 }, { "epoch": 0.004484304932735426, "grad_norm": 2.296875, "learning_rate": 0.000999986093060779, "loss": 1.8329, "step": 53 }, { "epoch": 0.00456891445976817, "grad_norm": 1.7890625, "learning_rate": 0.0009999850819745624, "loss": 1.5464, "step": 54 }, { "epoch": 0.004653523986800914, "grad_norm": 1.703125, "learning_rate": 0.0009999840354123545, "loss": 1.3176, "step": 55 }, { "epoch": 0.004738133513833657, "grad_norm": 1.8203125, "learning_rate": 0.0009999829533742291, "loss": 1.2979, "step": 56 }, { "epoch": 0.004822743040866401, "grad_norm": 1.9453125, "learning_rate": 0.0009999818358602632, "loss": 1.2222, "step": 57 }, { "epoch": 0.0049073525678991455, "grad_norm": 1.8125, "learning_rate": 0.0009999806828705363, "loss": 1.1931, "step": 58 }, { "epoch": 0.00499196209493189, "grad_norm": 1.84375, "learning_rate": 0.00099997949440513, "loss": 1.2699, "step": 59 }, { "epoch": 0.005076571621964634, "grad_norm": 2.21875, "learning_rate": 0.0009999782704641284, "loss": 1.7302, "step": 60 }, { "epoch": 0.005161181148997377, "grad_norm": 1.9765625, "learning_rate": 0.0009999770110476187, "loss": 1.3714, "step": 61 }, { "epoch": 0.005245790676030121, "grad_norm": 1.6875, "learning_rate": 0.0009999757161556903, "loss": 1.2651, "step": 62 }, { "epoch": 0.005330400203062865, "grad_norm": 3.921875, "learning_rate": 0.0009999743857884347, "loss": 1.1328, "step": 63 }, { "epoch": 0.005415009730095609, "grad_norm": 1.9765625, "learning_rate": 0.0009999730199459465, "loss": 1.5003, "step": 64 }, { "epoch": 0.005499619257128352, "grad_norm": 1.8125, "learning_rate": 0.0009999716186283227, "loss": 1.4297, "step": 65 }, { "epoch": 0.005584228784161096, "grad_norm": 1.5, "learning_rate": 0.0009999701818356628, "loss": 1.3476, "step": 66 }, { "epoch": 0.0056688383111938405, "grad_norm": 1.859375, "learning_rate": 0.0009999687095680685, "loss": 1.3231, "step": 67 }, { "epoch": 0.005753447838226585, "grad_norm": 1.90625, "learning_rate": 0.0009999672018256443, "loss": 1.3971, "step": 68 }, { "epoch": 0.005838057365259328, "grad_norm": 1.3984375, "learning_rate": 0.0009999656586084974, "loss": 1.0977, "step": 69 }, { "epoch": 0.005922666892292072, "grad_norm": 2.375, "learning_rate": 0.000999964079916737, "loss": 1.6166, "step": 70 }, { "epoch": 0.006007276419324816, "grad_norm": 1.6953125, "learning_rate": 0.0009999624657504754, "loss": 1.2498, "step": 71 }, { "epoch": 0.00609188594635756, "grad_norm": 3.3125, "learning_rate": 0.000999960816109827, "loss": 1.2013, "step": 72 }, { "epoch": 0.006176495473390304, "grad_norm": 2.03125, "learning_rate": 0.0009999591309949086, "loss": 1.6118, "step": 73 }, { "epoch": 0.006261105000423047, "grad_norm": 2.078125, "learning_rate": 0.00099995741040584, "loss": 1.5442, "step": 74 }, { "epoch": 0.006345714527455791, "grad_norm": 1.59375, "learning_rate": 0.0009999556543427435, "loss": 1.088, "step": 75 }, { "epoch": 0.0064303240544885355, "grad_norm": 1.7421875, "learning_rate": 0.0009999538628057433, "loss": 1.1979, "step": 76 }, { "epoch": 0.00651493358152128, "grad_norm": 1.8046875, "learning_rate": 0.000999952035794967, "loss": 1.4976, "step": 77 }, { "epoch": 0.006599543108554023, "grad_norm": 1.890625, "learning_rate": 0.0009999501733105438, "loss": 1.2319, "step": 78 }, { "epoch": 0.006684152635586767, "grad_norm": 1.9375, "learning_rate": 0.0009999482753526058, "loss": 1.5253, "step": 79 }, { "epoch": 0.006768762162619511, "grad_norm": 2.046875, "learning_rate": 0.0009999463419212882, "loss": 1.5161, "step": 80 }, { "epoch": 0.006853371689652255, "grad_norm": 1.78125, "learning_rate": 0.0009999443730167275, "loss": 1.336, "step": 81 }, { "epoch": 0.006937981216684999, "grad_norm": 1.8203125, "learning_rate": 0.0009999423686390641, "loss": 1.253, "step": 82 }, { "epoch": 0.007022590743717742, "grad_norm": 2.34375, "learning_rate": 0.0009999403287884396, "loss": 1.3102, "step": 83 }, { "epoch": 0.007107200270750486, "grad_norm": 2.25, "learning_rate": 0.000999938253464999, "loss": 1.1867, "step": 84 }, { "epoch": 0.0071918097977832305, "grad_norm": 2.34375, "learning_rate": 0.0009999361426688896, "loss": 1.2959, "step": 85 }, { "epoch": 0.007276419324815975, "grad_norm": 2.453125, "learning_rate": 0.000999933996400261, "loss": 1.4914, "step": 86 }, { "epoch": 0.007361028851848718, "grad_norm": 1.9453125, "learning_rate": 0.000999931814659266, "loss": 1.2173, "step": 87 }, { "epoch": 0.007445638378881462, "grad_norm": 2.21875, "learning_rate": 0.0009999295974460586, "loss": 1.5319, "step": 88 }, { "epoch": 0.007530247905914206, "grad_norm": 1.6015625, "learning_rate": 0.0009999273447607968, "loss": 1.2836, "step": 89 }, { "epoch": 0.00761485743294695, "grad_norm": 1.59375, "learning_rate": 0.00099992505660364, "loss": 1.1978, "step": 90 }, { "epoch": 0.007699466959979694, "grad_norm": 3.171875, "learning_rate": 0.0009999227329747509, "loss": 1.1352, "step": 91 }, { "epoch": 0.007784076487012437, "grad_norm": 1.8125, "learning_rate": 0.000999920373874294, "loss": 1.6017, "step": 92 }, { "epoch": 0.007868686014045181, "grad_norm": 1.90625, "learning_rate": 0.000999917979302437, "loss": 1.2574, "step": 93 }, { "epoch": 0.007953295541077925, "grad_norm": 2.1875, "learning_rate": 0.0009999155492593497, "loss": 1.7009, "step": 94 }, { "epoch": 0.00803790506811067, "grad_norm": 1.78125, "learning_rate": 0.0009999130837452044, "loss": 1.1946, "step": 95 }, { "epoch": 0.008122514595143413, "grad_norm": 2.8125, "learning_rate": 0.0009999105827601762, "loss": 1.2913, "step": 96 }, { "epoch": 0.008207124122176158, "grad_norm": 2.6875, "learning_rate": 0.0009999080463044426, "loss": 1.2741, "step": 97 }, { "epoch": 0.008291733649208901, "grad_norm": 3.734375, "learning_rate": 0.0009999054743781832, "loss": 1.5001, "step": 98 }, { "epoch": 0.008376343176241644, "grad_norm": 2.140625, "learning_rate": 0.0009999028669815811, "loss": 1.1845, "step": 99 }, { "epoch": 0.008460952703274389, "grad_norm": 1.9765625, "learning_rate": 0.0009999002241148206, "loss": 1.2613, "step": 100 }, { "epoch": 0.008545562230307132, "grad_norm": 2.625, "learning_rate": 0.0009998975457780898, "loss": 1.4781, "step": 101 }, { "epoch": 0.008630171757339877, "grad_norm": 1.84375, "learning_rate": 0.0009998948319715785, "loss": 1.3914, "step": 102 }, { "epoch": 0.00871478128437262, "grad_norm": 1.71875, "learning_rate": 0.0009998920826954794, "loss": 1.0609, "step": 103 }, { "epoch": 0.008799390811405364, "grad_norm": 3.0, "learning_rate": 0.0009998892979499875, "loss": 1.1943, "step": 104 }, { "epoch": 0.008884000338438109, "grad_norm": 1.890625, "learning_rate": 0.0009998864777353, "loss": 1.2727, "step": 105 }, { "epoch": 0.008968609865470852, "grad_norm": 1.53125, "learning_rate": 0.0009998836220516177, "loss": 1.0181, "step": 106 }, { "epoch": 0.009053219392503595, "grad_norm": 2.21875, "learning_rate": 0.000999880730899143, "loss": 1.2839, "step": 107 }, { "epoch": 0.00913782891953634, "grad_norm": 1.859375, "learning_rate": 0.0009998778042780805, "loss": 1.3362, "step": 108 }, { "epoch": 0.009222438446569083, "grad_norm": 1.546875, "learning_rate": 0.0009998748421886383, "loss": 1.1448, "step": 109 }, { "epoch": 0.009307047973601828, "grad_norm": 1.5703125, "learning_rate": 0.0009998718446310268, "loss": 1.1337, "step": 110 }, { "epoch": 0.009391657500634571, "grad_norm": 1.8125, "learning_rate": 0.0009998688116054583, "loss": 1.2494, "step": 111 }, { "epoch": 0.009476267027667315, "grad_norm": 2.546875, "learning_rate": 0.0009998657431121482, "loss": 1.1997, "step": 112 }, { "epoch": 0.00956087655470006, "grad_norm": 2.640625, "learning_rate": 0.0009998626391513141, "loss": 1.5433, "step": 113 }, { "epoch": 0.009645486081732803, "grad_norm": 2.203125, "learning_rate": 0.0009998594997231763, "loss": 1.2961, "step": 114 }, { "epoch": 0.009730095608765548, "grad_norm": 2.015625, "learning_rate": 0.0009998563248279574, "loss": 1.1966, "step": 115 }, { "epoch": 0.009814705135798291, "grad_norm": 1.65625, "learning_rate": 0.0009998531144658832, "loss": 1.0839, "step": 116 }, { "epoch": 0.009899314662831034, "grad_norm": 2.171875, "learning_rate": 0.0009998498686371808, "loss": 1.2408, "step": 117 }, { "epoch": 0.00998392418986378, "grad_norm": 2.078125, "learning_rate": 0.0009998465873420808, "loss": 1.355, "step": 118 }, { "epoch": 0.010068533716896522, "grad_norm": 1.6953125, "learning_rate": 0.0009998432705808163, "loss": 1.3892, "step": 119 }, { "epoch": 0.010153143243929267, "grad_norm": 2.03125, "learning_rate": 0.000999839918353622, "loss": 1.5594, "step": 120 }, { "epoch": 0.01023775277096201, "grad_norm": 1.7890625, "learning_rate": 0.0009998365306607365, "loss": 1.1544, "step": 121 }, { "epoch": 0.010322362297994754, "grad_norm": 1.765625, "learning_rate": 0.0009998331075023997, "loss": 1.0929, "step": 122 }, { "epoch": 0.010406971825027499, "grad_norm": 1.8125, "learning_rate": 0.0009998296488788545, "loss": 1.2034, "step": 123 }, { "epoch": 0.010491581352060242, "grad_norm": 5.625, "learning_rate": 0.0009998261547903464, "loss": 1.2612, "step": 124 }, { "epoch": 0.010576190879092985, "grad_norm": 1.8828125, "learning_rate": 0.0009998226252371234, "loss": 1.4275, "step": 125 }, { "epoch": 0.01066080040612573, "grad_norm": 1.8515625, "learning_rate": 0.0009998190602194357, "loss": 1.3639, "step": 126 }, { "epoch": 0.010745409933158473, "grad_norm": 5.09375, "learning_rate": 0.0009998154597375367, "loss": 1.5568, "step": 127 }, { "epoch": 0.010830019460191218, "grad_norm": 1.8984375, "learning_rate": 0.0009998118237916813, "loss": 1.2039, "step": 128 }, { "epoch": 0.010914628987223961, "grad_norm": 1.984375, "learning_rate": 0.0009998081523821278, "loss": 1.2754, "step": 129 }, { "epoch": 0.010999238514256705, "grad_norm": 1.890625, "learning_rate": 0.0009998044455091367, "loss": 1.4505, "step": 130 }, { "epoch": 0.01108384804128945, "grad_norm": 2.171875, "learning_rate": 0.000999800703172971, "loss": 1.6079, "step": 131 }, { "epoch": 0.011168457568322193, "grad_norm": 1.5, "learning_rate": 0.000999796925373896, "loss": 1.1853, "step": 132 }, { "epoch": 0.011253067095354938, "grad_norm": 1.7890625, "learning_rate": 0.0009997931121121801, "loss": 1.5865, "step": 133 }, { "epoch": 0.011337676622387681, "grad_norm": 2.015625, "learning_rate": 0.000999789263388094, "loss": 1.2847, "step": 134 }, { "epoch": 0.011422286149420424, "grad_norm": 1.875, "learning_rate": 0.00099978537920191, "loss": 1.6055, "step": 135 }, { "epoch": 0.01150689567645317, "grad_norm": 2.015625, "learning_rate": 0.0009997814595539046, "loss": 1.0537, "step": 136 }, { "epoch": 0.011591505203485912, "grad_norm": 2.140625, "learning_rate": 0.0009997775044443553, "loss": 1.2045, "step": 137 }, { "epoch": 0.011676114730518656, "grad_norm": 2.125, "learning_rate": 0.0009997735138735431, "loss": 1.3257, "step": 138 }, { "epoch": 0.0117607242575514, "grad_norm": 2.375, "learning_rate": 0.000999769487841751, "loss": 1.8842, "step": 139 }, { "epoch": 0.011845333784584144, "grad_norm": 2.234375, "learning_rate": 0.0009997654263492646, "loss": 1.6685, "step": 140 }, { "epoch": 0.011929943311616889, "grad_norm": 2.09375, "learning_rate": 0.000999761329396372, "loss": 1.2609, "step": 141 }, { "epoch": 0.012014552838649632, "grad_norm": 21.625, "learning_rate": 0.0009997571969833643, "loss": 1.3058, "step": 142 }, { "epoch": 0.012099162365682375, "grad_norm": 97.5, "learning_rate": 0.0009997530291105344, "loss": 1.5144, "step": 143 }, { "epoch": 0.01218377189271512, "grad_norm": 2.765625, "learning_rate": 0.000999748825778178, "loss": 1.6006, "step": 144 }, { "epoch": 0.012268381419747863, "grad_norm": 3.671875, "learning_rate": 0.0009997445869865936, "loss": 1.1881, "step": 145 }, { "epoch": 0.012352990946780608, "grad_norm": 2.765625, "learning_rate": 0.0009997403127360817, "loss": 1.4521, "step": 146 }, { "epoch": 0.012437600473813351, "grad_norm": 3.09375, "learning_rate": 0.0009997360030269454, "loss": 1.6141, "step": 147 }, { "epoch": 0.012522210000846095, "grad_norm": 2.1875, "learning_rate": 0.0009997316578594909, "loss": 1.6004, "step": 148 }, { "epoch": 0.01260681952787884, "grad_norm": 7.0, "learning_rate": 0.0009997272772340265, "loss": 1.439, "step": 149 }, { "epoch": 0.012691429054911583, "grad_norm": 4.4375, "learning_rate": 0.0009997228611508628, "loss": 1.1298, "step": 150 }, { "epoch": 0.012776038581944328, "grad_norm": 1.8359375, "learning_rate": 0.000999718409610313, "loss": 1.3315, "step": 151 }, { "epoch": 0.012860648108977071, "grad_norm": 172.0, "learning_rate": 0.0009997139226126934, "loss": 1.7768, "step": 152 }, { "epoch": 0.012945257636009814, "grad_norm": 1.75, "learning_rate": 0.0009997094001583222, "loss": 1.0968, "step": 153 }, { "epoch": 0.01302986716304256, "grad_norm": 2.515625, "learning_rate": 0.0009997048422475201, "loss": 1.2392, "step": 154 }, { "epoch": 0.013114476690075302, "grad_norm": 2.25, "learning_rate": 0.0009997002488806108, "loss": 1.3551, "step": 155 }, { "epoch": 0.013199086217108046, "grad_norm": 2.234375, "learning_rate": 0.0009996956200579198, "loss": 1.4327, "step": 156 }, { "epoch": 0.01328369574414079, "grad_norm": 3.65625, "learning_rate": 0.0009996909557797762, "loss": 2.0459, "step": 157 }, { "epoch": 0.013368305271173534, "grad_norm": 6.03125, "learning_rate": 0.00099968625604651, "loss": 1.4953, "step": 158 }, { "epoch": 0.013452914798206279, "grad_norm": 4.34375, "learning_rate": 0.0009996815208584556, "loss": 1.1725, "step": 159 }, { "epoch": 0.013537524325239022, "grad_norm": 1.921875, "learning_rate": 0.0009996767502159485, "loss": 1.0878, "step": 160 }, { "epoch": 0.013622133852271765, "grad_norm": 2.53125, "learning_rate": 0.0009996719441193271, "loss": 1.3483, "step": 161 }, { "epoch": 0.01370674337930451, "grad_norm": 2.5, "learning_rate": 0.0009996671025689328, "loss": 1.7677, "step": 162 }, { "epoch": 0.013791352906337253, "grad_norm": 2.46875, "learning_rate": 0.0009996622255651088, "loss": 1.4143, "step": 163 }, { "epoch": 0.013875962433369998, "grad_norm": 1.75, "learning_rate": 0.0009996573131082014, "loss": 1.2013, "step": 164 }, { "epoch": 0.013960571960402741, "grad_norm": 2.65625, "learning_rate": 0.0009996523651985589, "loss": 1.6487, "step": 165 }, { "epoch": 0.014045181487435485, "grad_norm": 1.671875, "learning_rate": 0.0009996473818365327, "loss": 1.2965, "step": 166 }, { "epoch": 0.01412979101446823, "grad_norm": 1.84375, "learning_rate": 0.0009996423630224758, "loss": 1.2192, "step": 167 }, { "epoch": 0.014214400541500973, "grad_norm": 2.578125, "learning_rate": 0.000999637308756745, "loss": 1.2958, "step": 168 }, { "epoch": 0.014299010068533716, "grad_norm": 2.96875, "learning_rate": 0.0009996322190396986, "loss": 1.8261, "step": 169 }, { "epoch": 0.014383619595566461, "grad_norm": 2.0625, "learning_rate": 0.0009996270938716976, "loss": 1.4484, "step": 170 }, { "epoch": 0.014468229122599204, "grad_norm": 1.59375, "learning_rate": 0.0009996219332531059, "loss": 1.1201, "step": 171 }, { "epoch": 0.01455283864963195, "grad_norm": 1.7890625, "learning_rate": 0.0009996167371842896, "loss": 1.4118, "step": 172 }, { "epoch": 0.014637448176664692, "grad_norm": 1.8515625, "learning_rate": 0.000999611505665617, "loss": 1.4996, "step": 173 }, { "epoch": 0.014722057703697436, "grad_norm": 2.484375, "learning_rate": 0.0009996062386974601, "loss": 1.471, "step": 174 }, { "epoch": 0.01480666723073018, "grad_norm": 2.234375, "learning_rate": 0.0009996009362801918, "loss": 1.4913, "step": 175 }, { "epoch": 0.014891276757762924, "grad_norm": 2.0625, "learning_rate": 0.0009995955984141891, "loss": 1.2942, "step": 176 }, { "epoch": 0.014975886284795669, "grad_norm": 1.875, "learning_rate": 0.0009995902250998301, "loss": 1.3884, "step": 177 }, { "epoch": 0.015060495811828412, "grad_norm": 2.40625, "learning_rate": 0.0009995848163374963, "loss": 1.3034, "step": 178 }, { "epoch": 0.015145105338861155, "grad_norm": 1.8203125, "learning_rate": 0.0009995793721275714, "loss": 1.4782, "step": 179 }, { "epoch": 0.0152297148658939, "grad_norm": 2.015625, "learning_rate": 0.0009995738924704419, "loss": 1.552, "step": 180 }, { "epoch": 0.015314324392926643, "grad_norm": 2.21875, "learning_rate": 0.0009995683773664963, "loss": 1.6338, "step": 181 }, { "epoch": 0.015398933919959388, "grad_norm": 2.09375, "learning_rate": 0.0009995628268161263, "loss": 1.6083, "step": 182 }, { "epoch": 0.015483543446992131, "grad_norm": 1.6171875, "learning_rate": 0.0009995572408197254, "loss": 1.3587, "step": 183 }, { "epoch": 0.015568152974024875, "grad_norm": 2.03125, "learning_rate": 0.00099955161937769, "loss": 1.2394, "step": 184 }, { "epoch": 0.01565276250105762, "grad_norm": 2.359375, "learning_rate": 0.000999545962490419, "loss": 1.6407, "step": 185 }, { "epoch": 0.015737372028090363, "grad_norm": 2.40625, "learning_rate": 0.0009995402701583138, "loss": 2.0192, "step": 186 }, { "epoch": 0.015821981555123106, "grad_norm": 2.125, "learning_rate": 0.0009995345423817784, "loss": 1.3759, "step": 187 }, { "epoch": 0.01590659108215585, "grad_norm": 2.4375, "learning_rate": 0.000999528779161219, "loss": 1.5867, "step": 188 }, { "epoch": 0.015991200609188596, "grad_norm": 1.6015625, "learning_rate": 0.0009995229804970448, "loss": 1.4915, "step": 189 }, { "epoch": 0.01607581013622134, "grad_norm": 2.609375, "learning_rate": 0.000999517146389667, "loss": 1.4121, "step": 190 }, { "epoch": 0.016160419663254082, "grad_norm": 1.7578125, "learning_rate": 0.0009995112768394997, "loss": 1.4422, "step": 191 }, { "epoch": 0.016245029190286826, "grad_norm": 2.046875, "learning_rate": 0.0009995053718469592, "loss": 1.1671, "step": 192 }, { "epoch": 0.01632963871731957, "grad_norm": 2.125, "learning_rate": 0.0009994994314124646, "loss": 1.5485, "step": 193 }, { "epoch": 0.016414248244352315, "grad_norm": 2.140625, "learning_rate": 0.0009994934555364373, "loss": 1.5435, "step": 194 }, { "epoch": 0.01649885777138506, "grad_norm": 2.46875, "learning_rate": 0.0009994874442193014, "loss": 1.373, "step": 195 }, { "epoch": 0.016583467298417802, "grad_norm": 2.046875, "learning_rate": 0.0009994813974614834, "loss": 1.4975, "step": 196 }, { "epoch": 0.016668076825450545, "grad_norm": 2.03125, "learning_rate": 0.0009994753152634125, "loss": 1.294, "step": 197 }, { "epoch": 0.01675268635248329, "grad_norm": 1.578125, "learning_rate": 0.0009994691976255201, "loss": 1.1618, "step": 198 }, { "epoch": 0.016837295879516035, "grad_norm": 1.7890625, "learning_rate": 0.0009994630445482402, "loss": 1.3071, "step": 199 }, { "epoch": 0.016921905406548778, "grad_norm": 8.0625, "learning_rate": 0.0009994568560320095, "loss": 1.4145, "step": 200 }, { "epoch": 0.01700651493358152, "grad_norm": 2.125, "learning_rate": 0.0009994506320772669, "loss": 1.3224, "step": 201 }, { "epoch": 0.017091124460614265, "grad_norm": 1.5859375, "learning_rate": 0.0009994443726844544, "loss": 1.2453, "step": 202 }, { "epoch": 0.017175733987647008, "grad_norm": 2.09375, "learning_rate": 0.000999438077854016, "loss": 2.112, "step": 203 }, { "epoch": 0.017260343514679755, "grad_norm": 1.875, "learning_rate": 0.000999431747586398, "loss": 1.336, "step": 204 }, { "epoch": 0.017344953041712498, "grad_norm": 1.7265625, "learning_rate": 0.00099942538188205, "loss": 1.6594, "step": 205 }, { "epoch": 0.01742956256874524, "grad_norm": 1.9375, "learning_rate": 0.0009994189807414236, "loss": 1.4261, "step": 206 }, { "epoch": 0.017514172095777984, "grad_norm": 2.859375, "learning_rate": 0.0009994125441649727, "loss": 1.8957, "step": 207 }, { "epoch": 0.017598781622810727, "grad_norm": 2.140625, "learning_rate": 0.000999406072153154, "loss": 1.7651, "step": 208 }, { "epoch": 0.01768339114984347, "grad_norm": 1.3203125, "learning_rate": 0.0009993995647064272, "loss": 1.1423, "step": 209 }, { "epoch": 0.017768000676876217, "grad_norm": 1.7890625, "learning_rate": 0.0009993930218252537, "loss": 1.2428, "step": 210 }, { "epoch": 0.01785261020390896, "grad_norm": 2.21875, "learning_rate": 0.0009993864435100977, "loss": 1.5758, "step": 211 }, { "epoch": 0.017937219730941704, "grad_norm": 1.7265625, "learning_rate": 0.0009993798297614261, "loss": 1.3686, "step": 212 }, { "epoch": 0.018021829257974447, "grad_norm": 1.625, "learning_rate": 0.0009993731805797079, "loss": 1.3278, "step": 213 }, { "epoch": 0.01810643878500719, "grad_norm": 2.109375, "learning_rate": 0.0009993664959654155, "loss": 1.4686, "step": 214 }, { "epoch": 0.018191048312039937, "grad_norm": 1.9921875, "learning_rate": 0.0009993597759190224, "loss": 1.4954, "step": 215 }, { "epoch": 0.01827565783907268, "grad_norm": 2.90625, "learning_rate": 0.000999353020441006, "loss": 2.0094, "step": 216 }, { "epoch": 0.018360267366105423, "grad_norm": 3.546875, "learning_rate": 0.0009993462295318453, "loss": 1.3573, "step": 217 }, { "epoch": 0.018444876893138167, "grad_norm": 2.828125, "learning_rate": 0.0009993394031920222, "loss": 1.5403, "step": 218 }, { "epoch": 0.01852948642017091, "grad_norm": 5.625, "learning_rate": 0.0009993325414220213, "loss": 1.6052, "step": 219 }, { "epoch": 0.018614095947203656, "grad_norm": 2.0, "learning_rate": 0.0009993256442223295, "loss": 1.1354, "step": 220 }, { "epoch": 0.0186987054742364, "grad_norm": 16.625, "learning_rate": 0.0009993187115934358, "loss": 1.6388, "step": 221 }, { "epoch": 0.018783315001269143, "grad_norm": 1.984375, "learning_rate": 0.0009993117435358322, "loss": 1.5376, "step": 222 }, { "epoch": 0.018867924528301886, "grad_norm": 2.5625, "learning_rate": 0.0009993047400500134, "loss": 1.6391, "step": 223 }, { "epoch": 0.01895253405533463, "grad_norm": 2.4375, "learning_rate": 0.0009992977011364758, "loss": 1.3547, "step": 224 }, { "epoch": 0.019037143582367376, "grad_norm": 3.078125, "learning_rate": 0.0009992906267957194, "loss": 1.6937, "step": 225 }, { "epoch": 0.01912175310940012, "grad_norm": 2.125, "learning_rate": 0.0009992835170282459, "loss": 1.5653, "step": 226 }, { "epoch": 0.019206362636432862, "grad_norm": 2.109375, "learning_rate": 0.0009992763718345596, "loss": 1.2953, "step": 227 }, { "epoch": 0.019290972163465606, "grad_norm": 1.9140625, "learning_rate": 0.000999269191215168, "loss": 1.3038, "step": 228 }, { "epoch": 0.01937558169049835, "grad_norm": 1.8359375, "learning_rate": 0.0009992619751705801, "loss": 1.3373, "step": 229 }, { "epoch": 0.019460191217531096, "grad_norm": 2.609375, "learning_rate": 0.000999254723701308, "loss": 1.4272, "step": 230 }, { "epoch": 0.01954480074456384, "grad_norm": 1.9375, "learning_rate": 0.0009992474368078663, "loss": 1.595, "step": 231 }, { "epoch": 0.019629410271596582, "grad_norm": 3.046875, "learning_rate": 0.000999240114490772, "loss": 1.5275, "step": 232 }, { "epoch": 0.019714019798629325, "grad_norm": 1.8046875, "learning_rate": 0.000999232756750545, "loss": 1.3026, "step": 233 }, { "epoch": 0.01979862932566207, "grad_norm": 2.21875, "learning_rate": 0.0009992253635877067, "loss": 2.0244, "step": 234 }, { "epoch": 0.019883238852694815, "grad_norm": 1.75, "learning_rate": 0.0009992179350027822, "loss": 1.7058, "step": 235 }, { "epoch": 0.01996784837972756, "grad_norm": 2.671875, "learning_rate": 0.0009992104709962981, "loss": 1.7791, "step": 236 }, { "epoch": 0.0200524579067603, "grad_norm": 1.578125, "learning_rate": 0.0009992029715687847, "loss": 1.261, "step": 237 }, { "epoch": 0.020137067433793045, "grad_norm": 1.7734375, "learning_rate": 0.0009991954367207733, "loss": 1.322, "step": 238 }, { "epoch": 0.020221676960825788, "grad_norm": 1.6015625, "learning_rate": 0.0009991878664527992, "loss": 1.3924, "step": 239 }, { "epoch": 0.020306286487858535, "grad_norm": 1.8203125, "learning_rate": 0.0009991802607653992, "loss": 1.3023, "step": 240 }, { "epoch": 0.020390896014891278, "grad_norm": 2.265625, "learning_rate": 0.0009991726196591133, "loss": 1.6224, "step": 241 }, { "epoch": 0.02047550554192402, "grad_norm": 2.15625, "learning_rate": 0.0009991649431344831, "loss": 1.2995, "step": 242 }, { "epoch": 0.020560115068956764, "grad_norm": 1.9453125, "learning_rate": 0.0009991572311920536, "loss": 1.1433, "step": 243 }, { "epoch": 0.020644724595989507, "grad_norm": 1.9453125, "learning_rate": 0.0009991494838323723, "loss": 1.2607, "step": 244 }, { "epoch": 0.02072933412302225, "grad_norm": 2.546875, "learning_rate": 0.0009991417010559883, "loss": 1.2243, "step": 245 }, { "epoch": 0.020813943650054997, "grad_norm": 1.390625, "learning_rate": 0.0009991338828634541, "loss": 1.0471, "step": 246 }, { "epoch": 0.02089855317708774, "grad_norm": 2.921875, "learning_rate": 0.0009991260292553246, "loss": 1.7105, "step": 247 }, { "epoch": 0.020983162704120484, "grad_norm": 2.21875, "learning_rate": 0.0009991181402321568, "loss": 1.4738, "step": 248 }, { "epoch": 0.021067772231153227, "grad_norm": 6.09375, "learning_rate": 0.0009991102157945106, "loss": 1.5079, "step": 249 }, { "epoch": 0.02115238175818597, "grad_norm": 1.5703125, "learning_rate": 0.000999102255942948, "loss": 1.2744, "step": 250 }, { "epoch": 0.021236991285218717, "grad_norm": 2.0625, "learning_rate": 0.0009990942606780343, "loss": 1.6377, "step": 251 }, { "epoch": 0.02132160081225146, "grad_norm": 1.84375, "learning_rate": 0.0009990862300003364, "loss": 1.9196, "step": 252 }, { "epoch": 0.021406210339284203, "grad_norm": 2.125, "learning_rate": 0.0009990781639104243, "loss": 1.5648, "step": 253 }, { "epoch": 0.021490819866316947, "grad_norm": 2.125, "learning_rate": 0.00099907006240887, "loss": 1.3309, "step": 254 }, { "epoch": 0.02157542939334969, "grad_norm": 1.390625, "learning_rate": 0.0009990619254962487, "loss": 1.1751, "step": 255 }, { "epoch": 0.021660038920382436, "grad_norm": 2.390625, "learning_rate": 0.0009990537531731375, "loss": 1.2317, "step": 256 }, { "epoch": 0.02174464844741518, "grad_norm": 1.640625, "learning_rate": 0.0009990455454401165, "loss": 1.2704, "step": 257 }, { "epoch": 0.021829257974447923, "grad_norm": 1.9140625, "learning_rate": 0.0009990373022977681, "loss": 1.214, "step": 258 }, { "epoch": 0.021913867501480666, "grad_norm": 1.875, "learning_rate": 0.0009990290237466767, "loss": 1.3812, "step": 259 }, { "epoch": 0.02199847702851341, "grad_norm": 2.625, "learning_rate": 0.0009990207097874304, "loss": 1.4461, "step": 260 }, { "epoch": 0.022083086555546156, "grad_norm": 2.609375, "learning_rate": 0.0009990123604206186, "loss": 1.7456, "step": 261 }, { "epoch": 0.0221676960825789, "grad_norm": 2.21875, "learning_rate": 0.0009990039756468335, "loss": 1.7591, "step": 262 }, { "epoch": 0.022252305609611642, "grad_norm": 4.1875, "learning_rate": 0.0009989955554666708, "loss": 1.3499, "step": 263 }, { "epoch": 0.022336915136644386, "grad_norm": 2.671875, "learning_rate": 0.0009989870998807276, "loss": 1.6826, "step": 264 }, { "epoch": 0.02242152466367713, "grad_norm": 1.765625, "learning_rate": 0.0009989786088896036, "loss": 1.1738, "step": 265 }, { "epoch": 0.022506134190709876, "grad_norm": 5.46875, "learning_rate": 0.0009989700824939013, "loss": 2.0315, "step": 266 }, { "epoch": 0.02259074371774262, "grad_norm": 1.9140625, "learning_rate": 0.0009989615206942263, "loss": 1.1226, "step": 267 }, { "epoch": 0.022675353244775362, "grad_norm": 3.40625, "learning_rate": 0.0009989529234911852, "loss": 1.2221, "step": 268 }, { "epoch": 0.022759962771808105, "grad_norm": 1.96875, "learning_rate": 0.0009989442908853888, "loss": 1.486, "step": 269 }, { "epoch": 0.02284457229884085, "grad_norm": 2.203125, "learning_rate": 0.0009989356228774491, "loss": 1.3799, "step": 270 }, { "epoch": 0.022929181825873595, "grad_norm": 1.7265625, "learning_rate": 0.0009989269194679814, "loss": 1.1169, "step": 271 }, { "epoch": 0.02301379135290634, "grad_norm": 3.328125, "learning_rate": 0.000998918180657603, "loss": 1.8589, "step": 272 }, { "epoch": 0.02309840087993908, "grad_norm": 2.890625, "learning_rate": 0.0009989094064469341, "loss": 1.758, "step": 273 }, { "epoch": 0.023183010406971825, "grad_norm": 2.28125, "learning_rate": 0.0009989005968365973, "loss": 1.4167, "step": 274 }, { "epoch": 0.023267619934004568, "grad_norm": 2.359375, "learning_rate": 0.0009988917518272177, "loss": 1.23, "step": 275 }, { "epoch": 0.02335222946103731, "grad_norm": 2.65625, "learning_rate": 0.000998882871419423, "loss": 1.5197, "step": 276 }, { "epoch": 0.023436838988070058, "grad_norm": 1.8828125, "learning_rate": 0.0009988739556138428, "loss": 1.192, "step": 277 }, { "epoch": 0.0235214485151028, "grad_norm": 2.375, "learning_rate": 0.0009988650044111102, "loss": 1.3941, "step": 278 }, { "epoch": 0.023606058042135544, "grad_norm": 4.03125, "learning_rate": 0.0009988560178118603, "loss": 1.9178, "step": 279 }, { "epoch": 0.023690667569168287, "grad_norm": 2.421875, "learning_rate": 0.0009988469958167304, "loss": 1.6953, "step": 280 }, { "epoch": 0.02377527709620103, "grad_norm": 2.28125, "learning_rate": 0.0009988379384263607, "loss": 1.6486, "step": 281 }, { "epoch": 0.023859886623233777, "grad_norm": 2.359375, "learning_rate": 0.0009988288456413944, "loss": 1.6849, "step": 282 }, { "epoch": 0.02394449615026652, "grad_norm": 9.4375, "learning_rate": 0.0009988197174624761, "loss": 1.8455, "step": 283 }, { "epoch": 0.024029105677299264, "grad_norm": 2.75, "learning_rate": 0.0009988105538902537, "loss": 1.6421, "step": 284 }, { "epoch": 0.024113715204332007, "grad_norm": 2.203125, "learning_rate": 0.0009988013549253774, "loss": 1.4932, "step": 285 }, { "epoch": 0.02419832473136475, "grad_norm": 2.046875, "learning_rate": 0.0009987921205684997, "loss": 1.1856, "step": 286 }, { "epoch": 0.024282934258397497, "grad_norm": 1.984375, "learning_rate": 0.0009987828508202764, "loss": 1.236, "step": 287 }, { "epoch": 0.02436754378543024, "grad_norm": 2.390625, "learning_rate": 0.0009987735456813644, "loss": 1.5252, "step": 288 }, { "epoch": 0.024452153312462983, "grad_norm": 1.921875, "learning_rate": 0.0009987642051524244, "loss": 1.1868, "step": 289 }, { "epoch": 0.024536762839495727, "grad_norm": 2.953125, "learning_rate": 0.0009987548292341192, "loss": 1.9085, "step": 290 }, { "epoch": 0.02462137236652847, "grad_norm": 2.265625, "learning_rate": 0.0009987454179271138, "loss": 1.4697, "step": 291 }, { "epoch": 0.024705981893561216, "grad_norm": 2.125, "learning_rate": 0.0009987359712320763, "loss": 1.3808, "step": 292 }, { "epoch": 0.02479059142059396, "grad_norm": 2.59375, "learning_rate": 0.0009987264891496768, "loss": 1.4499, "step": 293 }, { "epoch": 0.024875200947626703, "grad_norm": 2.59375, "learning_rate": 0.000998716971680588, "loss": 1.3049, "step": 294 }, { "epoch": 0.024959810474659446, "grad_norm": 3.28125, "learning_rate": 0.0009987074188254853, "loss": 1.3704, "step": 295 }, { "epoch": 0.02504442000169219, "grad_norm": 2.265625, "learning_rate": 0.0009986978305850467, "loss": 1.6277, "step": 296 }, { "epoch": 0.025129029528724936, "grad_norm": 1.9296875, "learning_rate": 0.0009986882069599522, "loss": 1.4311, "step": 297 }, { "epoch": 0.02521363905575768, "grad_norm": 1.78125, "learning_rate": 0.0009986785479508846, "loss": 1.0015, "step": 298 }, { "epoch": 0.025298248582790422, "grad_norm": 2.484375, "learning_rate": 0.0009986688535585297, "loss": 1.2818, "step": 299 }, { "epoch": 0.025382858109823166, "grad_norm": 3.390625, "learning_rate": 0.000998659123783575, "loss": 1.459, "step": 300 }, { "epoch": 0.02546746763685591, "grad_norm": 3.453125, "learning_rate": 0.000998649358626711, "loss": 1.597, "step": 301 }, { "epoch": 0.025552077163888656, "grad_norm": 3.265625, "learning_rate": 0.0009986395580886304, "loss": 1.6184, "step": 302 }, { "epoch": 0.0256366866909214, "grad_norm": 3.171875, "learning_rate": 0.0009986297221700286, "loss": 1.6303, "step": 303 }, { "epoch": 0.025721296217954142, "grad_norm": 1.984375, "learning_rate": 0.0009986198508716037, "loss": 1.2013, "step": 304 }, { "epoch": 0.025805905744986885, "grad_norm": 2.71875, "learning_rate": 0.0009986099441940562, "loss": 1.5271, "step": 305 }, { "epoch": 0.02589051527201963, "grad_norm": 2.0625, "learning_rate": 0.0009986000021380888, "loss": 1.5848, "step": 306 }, { "epoch": 0.02597512479905237, "grad_norm": 2.34375, "learning_rate": 0.0009985900247044069, "loss": 1.5211, "step": 307 }, { "epoch": 0.02605973432608512, "grad_norm": 2.359375, "learning_rate": 0.0009985800118937184, "loss": 1.5063, "step": 308 }, { "epoch": 0.02614434385311786, "grad_norm": 3.171875, "learning_rate": 0.0009985699637067341, "loss": 1.4056, "step": 309 }, { "epoch": 0.026228953380150605, "grad_norm": 12.1875, "learning_rate": 0.0009985598801441665, "loss": 1.1841, "step": 310 }, { "epoch": 0.026313562907183348, "grad_norm": 3.609375, "learning_rate": 0.0009985497612067314, "loss": 1.5763, "step": 311 }, { "epoch": 0.02639817243421609, "grad_norm": 2.890625, "learning_rate": 0.0009985396068951467, "loss": 1.4368, "step": 312 }, { "epoch": 0.026482781961248838, "grad_norm": 2.53125, "learning_rate": 0.0009985294172101327, "loss": 1.4364, "step": 313 }, { "epoch": 0.02656739148828158, "grad_norm": 2.703125, "learning_rate": 0.000998519192152413, "loss": 1.2049, "step": 314 }, { "epoch": 0.026652001015314324, "grad_norm": 2.734375, "learning_rate": 0.0009985089317227122, "loss": 1.4638, "step": 315 }, { "epoch": 0.026736610542347067, "grad_norm": 1.859375, "learning_rate": 0.0009984986359217588, "loss": 1.3997, "step": 316 }, { "epoch": 0.02682122006937981, "grad_norm": 2.6875, "learning_rate": 0.0009984883047502835, "loss": 1.2781, "step": 317 }, { "epoch": 0.026905829596412557, "grad_norm": 2.078125, "learning_rate": 0.0009984779382090192, "loss": 1.2534, "step": 318 }, { "epoch": 0.0269904391234453, "grad_norm": 2.09375, "learning_rate": 0.0009984675362987013, "loss": 1.3566, "step": 319 }, { "epoch": 0.027075048650478044, "grad_norm": 2.59375, "learning_rate": 0.000998457099020068, "loss": 1.7275, "step": 320 }, { "epoch": 0.027159658177510787, "grad_norm": 13.25, "learning_rate": 0.00099844662637386, "loss": 1.0102, "step": 321 }, { "epoch": 0.02724426770454353, "grad_norm": 2.703125, "learning_rate": 0.0009984361183608197, "loss": 1.6163, "step": 322 }, { "epoch": 0.027328877231576277, "grad_norm": 2.578125, "learning_rate": 0.0009984255749816938, "loss": 1.983, "step": 323 }, { "epoch": 0.02741348675860902, "grad_norm": 3.21875, "learning_rate": 0.0009984149962372294, "loss": 2.1572, "step": 324 }, { "epoch": 0.027498096285641763, "grad_norm": 2.265625, "learning_rate": 0.0009984043821281777, "loss": 1.4192, "step": 325 }, { "epoch": 0.027582705812674507, "grad_norm": 3.5625, "learning_rate": 0.0009983937326552915, "loss": 1.873, "step": 326 }, { "epoch": 0.02766731533970725, "grad_norm": 2.15625, "learning_rate": 0.0009983830478193264, "loss": 1.554, "step": 327 }, { "epoch": 0.027751924866739996, "grad_norm": 2.453125, "learning_rate": 0.000998372327621041, "loss": 1.6128, "step": 328 }, { "epoch": 0.02783653439377274, "grad_norm": 1.625, "learning_rate": 0.0009983615720611954, "loss": 1.2974, "step": 329 }, { "epoch": 0.027921143920805483, "grad_norm": 7.65625, "learning_rate": 0.000998350781140553, "loss": 1.3522, "step": 330 }, { "epoch": 0.028005753447838226, "grad_norm": 4.0625, "learning_rate": 0.0009983399548598795, "loss": 1.6402, "step": 331 }, { "epoch": 0.02809036297487097, "grad_norm": 55.5, "learning_rate": 0.0009983290932199429, "loss": 1.5473, "step": 332 }, { "epoch": 0.028174972501903716, "grad_norm": 13.0625, "learning_rate": 0.000998318196221514, "loss": 1.3685, "step": 333 }, { "epoch": 0.02825958202893646, "grad_norm": 3.21875, "learning_rate": 0.000998307263865366, "loss": 1.3612, "step": 334 }, { "epoch": 0.028344191555969202, "grad_norm": 4.0625, "learning_rate": 0.0009982962961522744, "loss": 1.4535, "step": 335 }, { "epoch": 0.028428801083001946, "grad_norm": 74.0, "learning_rate": 0.0009982852930830177, "loss": 1.45, "step": 336 }, { "epoch": 0.02851341061003469, "grad_norm": 4.3125, "learning_rate": 0.0009982742546583764, "loss": 1.8693, "step": 337 }, { "epoch": 0.028598020137067432, "grad_norm": 5.65625, "learning_rate": 0.0009982631808791338, "loss": 1.6081, "step": 338 }, { "epoch": 0.02868262966410018, "grad_norm": 86.0, "learning_rate": 0.0009982520717460757, "loss": 1.2881, "step": 339 }, { "epoch": 0.028767239191132922, "grad_norm": 56.25, "learning_rate": 0.0009982409272599902, "loss": 1.5507, "step": 340 }, { "epoch": 0.028851848718165665, "grad_norm": 6.1875, "learning_rate": 0.000998229747421668, "loss": 1.7009, "step": 341 }, { "epoch": 0.02893645824519841, "grad_norm": 4.96875, "learning_rate": 0.0009982185322319026, "loss": 1.7586, "step": 342 }, { "epoch": 0.02902106777223115, "grad_norm": 35.0, "learning_rate": 0.0009982072816914894, "loss": 1.3968, "step": 343 }, { "epoch": 0.0291056772992639, "grad_norm": 3.046875, "learning_rate": 0.0009981959958012272, "loss": 1.335, "step": 344 }, { "epoch": 0.02919028682629664, "grad_norm": 4.5, "learning_rate": 0.0009981846745619164, "loss": 1.4444, "step": 345 }, { "epoch": 0.029274896353329385, "grad_norm": 28.25, "learning_rate": 0.0009981733179743604, "loss": 1.1728, "step": 346 }, { "epoch": 0.029359505880362128, "grad_norm": 256.0, "learning_rate": 0.0009981619260393648, "loss": 1.2869, "step": 347 }, { "epoch": 0.02944411540739487, "grad_norm": 3056.0, "learning_rate": 0.0009981504987577382, "loss": 1.3753, "step": 348 }, { "epoch": 0.029528724934427618, "grad_norm": 764.0, "learning_rate": 0.0009981390361302911, "loss": 1.794, "step": 349 }, { "epoch": 0.02961333446146036, "grad_norm": 1184.0, "learning_rate": 0.0009981275381578372, "loss": 2.0408, "step": 350 }, { "epoch": 0.029697943988493104, "grad_norm": 612.0, "learning_rate": 0.000998116004841192, "loss": 2.8433, "step": 351 }, { "epoch": 0.029782553515525848, "grad_norm": 544.0, "learning_rate": 0.000998104436181174, "loss": 3.2391, "step": 352 }, { "epoch": 0.02986716304255859, "grad_norm": 1808.0, "learning_rate": 0.000998092832178604, "loss": 2.8884, "step": 353 }, { "epoch": 0.029951772569591337, "grad_norm": 3456.0, "learning_rate": 0.0009980811928343054, "loss": 3.3631, "step": 354 }, { "epoch": 0.03003638209662408, "grad_norm": 478.0, "learning_rate": 0.000998069518149104, "loss": 3.3154, "step": 355 }, { "epoch": 0.030120991623656824, "grad_norm": 880.0, "learning_rate": 0.0009980578081238284, "loss": 3.3374, "step": 356 }, { "epoch": 0.030205601150689567, "grad_norm": 2608.0, "learning_rate": 0.0009980460627593089, "loss": 2.6807, "step": 357 }, { "epoch": 0.03029021067772231, "grad_norm": 83456.0, "learning_rate": 0.0009980342820563794, "loss": 2.9398, "step": 358 }, { "epoch": 0.030374820204755057, "grad_norm": 9600.0, "learning_rate": 0.0009980224660158757, "loss": 3.1462, "step": 359 }, { "epoch": 0.0304594297317878, "grad_norm": 7072.0, "learning_rate": 0.000998010614638636, "loss": 2.514, "step": 360 }, { "epoch": 0.030544039258820543, "grad_norm": 419840.0, "learning_rate": 0.0009979987279255013, "loss": 2.7013, "step": 361 }, { "epoch": 0.030628648785853287, "grad_norm": 9216.0, "learning_rate": 0.0009979868058773152, "loss": 2.6713, "step": 362 }, { "epoch": 0.03071325831288603, "grad_norm": 15232.0, "learning_rate": 0.0009979748484949232, "loss": 2.321, "step": 363 }, { "epoch": 0.030797867839918776, "grad_norm": 2768.0, "learning_rate": 0.0009979628557791743, "loss": 2.851, "step": 364 }, { "epoch": 0.03088247736695152, "grad_norm": 1440.0, "learning_rate": 0.0009979508277309188, "loss": 2.9006, "step": 365 }, { "epoch": 0.030967086893984263, "grad_norm": 1192.0, "learning_rate": 0.0009979387643510107, "loss": 2.954, "step": 366 }, { "epoch": 0.031051696421017006, "grad_norm": 1020.0, "learning_rate": 0.0009979266656403056, "loss": 2.7029, "step": 367 }, { "epoch": 0.03113630594804975, "grad_norm": 368.0, "learning_rate": 0.000997914531599662, "loss": 3.0723, "step": 368 }, { "epoch": 0.031220915475082493, "grad_norm": 118.5, "learning_rate": 0.000997902362229941, "loss": 3.4108, "step": 369 }, { "epoch": 0.03130552500211524, "grad_norm": 171.0, "learning_rate": 0.000997890157532006, "loss": 3.0421, "step": 370 }, { "epoch": 0.03139013452914798, "grad_norm": 63.75, "learning_rate": 0.000997877917506723, "loss": 3.0251, "step": 371 }, { "epoch": 0.031474744056180726, "grad_norm": 20.0, "learning_rate": 0.0009978656421549604, "loss": 2.686, "step": 372 }, { "epoch": 0.03155935358321347, "grad_norm": 49.0, "learning_rate": 0.0009978533314775892, "loss": 2.9034, "step": 373 }, { "epoch": 0.03164396311024621, "grad_norm": 14.5, "learning_rate": 0.000997840985475483, "loss": 2.2301, "step": 374 }, { "epoch": 0.031728572637278955, "grad_norm": 15.4375, "learning_rate": 0.0009978286041495174, "loss": 2.1013, "step": 375 }, { "epoch": 0.0318131821643117, "grad_norm": 211.0, "learning_rate": 0.0009978161875005716, "loss": 1.7976, "step": 376 }, { "epoch": 0.03189779169134445, "grad_norm": 34.5, "learning_rate": 0.000997803735529526, "loss": 2.2469, "step": 377 }, { "epoch": 0.03198240121837719, "grad_norm": 51.75, "learning_rate": 0.0009977912482372646, "loss": 1.7826, "step": 378 }, { "epoch": 0.032067010745409935, "grad_norm": 31.375, "learning_rate": 0.000997778725624673, "loss": 1.8353, "step": 379 }, { "epoch": 0.03215162027244268, "grad_norm": 25.25, "learning_rate": 0.00099776616769264, "loss": 1.7702, "step": 380 }, { "epoch": 0.03223622979947542, "grad_norm": 9.0625, "learning_rate": 0.0009977535744420563, "loss": 2.3, "step": 381 }, { "epoch": 0.032320839326508165, "grad_norm": 7.09375, "learning_rate": 0.000997740945873816, "loss": 1.9972, "step": 382 }, { "epoch": 0.03240544885354091, "grad_norm": 5.0625, "learning_rate": 0.0009977282819888145, "loss": 1.7951, "step": 383 }, { "epoch": 0.03249005838057365, "grad_norm": 5.15625, "learning_rate": 0.0009977155827879509, "loss": 1.5886, "step": 384 }, { "epoch": 0.032574667907606394, "grad_norm": 9.375, "learning_rate": 0.000997702848272126, "loss": 1.5342, "step": 385 }, { "epoch": 0.03265927743463914, "grad_norm": 7.34375, "learning_rate": 0.0009976900784422434, "loss": 1.9163, "step": 386 }, { "epoch": 0.03274388696167188, "grad_norm": 6.84375, "learning_rate": 0.0009976772732992092, "loss": 1.9442, "step": 387 }, { "epoch": 0.03282849648870463, "grad_norm": 5.78125, "learning_rate": 0.000997664432843932, "loss": 2.1904, "step": 388 }, { "epoch": 0.032913106015737374, "grad_norm": 9.875, "learning_rate": 0.0009976515570773229, "loss": 1.5792, "step": 389 }, { "epoch": 0.03299771554277012, "grad_norm": 6.84375, "learning_rate": 0.0009976386460002953, "loss": 2.18, "step": 390 }, { "epoch": 0.03308232506980286, "grad_norm": 6.78125, "learning_rate": 0.0009976256996137656, "loss": 1.4938, "step": 391 }, { "epoch": 0.033166934596835604, "grad_norm": 4.34375, "learning_rate": 0.000997612717918652, "loss": 1.3875, "step": 392 }, { "epoch": 0.03325154412386835, "grad_norm": 4.28125, "learning_rate": 0.000997599700915876, "loss": 1.724, "step": 393 }, { "epoch": 0.03333615365090109, "grad_norm": 3.296875, "learning_rate": 0.000997586648606361, "loss": 1.7405, "step": 394 }, { "epoch": 0.033420763177933833, "grad_norm": 3.25, "learning_rate": 0.0009975735609910332, "loss": 1.8569, "step": 395 }, { "epoch": 0.03350537270496658, "grad_norm": 5.03125, "learning_rate": 0.0009975604380708211, "loss": 2.2328, "step": 396 }, { "epoch": 0.03358998223199932, "grad_norm": 35.25, "learning_rate": 0.0009975472798466562, "loss": 1.3729, "step": 397 }, { "epoch": 0.03367459175903207, "grad_norm": 55.25, "learning_rate": 0.0009975340863194713, "loss": 1.679, "step": 398 }, { "epoch": 0.03375920128606481, "grad_norm": 21.375, "learning_rate": 0.0009975208574902034, "loss": 1.6629, "step": 399 }, { "epoch": 0.033843810813097557, "grad_norm": 5.09375, "learning_rate": 0.000997507593359791, "loss": 1.5835, "step": 400 }, { "epoch": 0.0339284203401303, "grad_norm": 4.53125, "learning_rate": 0.0009974942939291746, "loss": 2.1673, "step": 401 }, { "epoch": 0.03401302986716304, "grad_norm": 11.875, "learning_rate": 0.0009974809591992986, "loss": 1.9143, "step": 402 }, { "epoch": 0.034097639394195786, "grad_norm": 6.375, "learning_rate": 0.0009974675891711087, "loss": 1.2408, "step": 403 }, { "epoch": 0.03418224892122853, "grad_norm": 6.4375, "learning_rate": 0.0009974541838455538, "loss": 1.779, "step": 404 }, { "epoch": 0.03426685844826127, "grad_norm": 9.6875, "learning_rate": 0.000997440743223585, "loss": 1.5609, "step": 405 }, { "epoch": 0.034351467975294016, "grad_norm": 10.8125, "learning_rate": 0.0009974272673061557, "loss": 1.7199, "step": 406 }, { "epoch": 0.03443607750232676, "grad_norm": 11.1875, "learning_rate": 0.0009974137560942228, "loss": 1.6702, "step": 407 }, { "epoch": 0.03452068702935951, "grad_norm": 6.5625, "learning_rate": 0.000997400209588744, "loss": 1.49, "step": 408 }, { "epoch": 0.03460529655639225, "grad_norm": 21.875, "learning_rate": 0.000997386627790681, "loss": 2.2663, "step": 409 }, { "epoch": 0.034689906083424996, "grad_norm": 7.125, "learning_rate": 0.0009973730107009978, "loss": 1.6606, "step": 410 }, { "epoch": 0.03477451561045774, "grad_norm": 15.125, "learning_rate": 0.0009973593583206602, "loss": 1.6024, "step": 411 }, { "epoch": 0.03485912513749048, "grad_norm": 9.5625, "learning_rate": 0.0009973456706506368, "loss": 1.6296, "step": 412 }, { "epoch": 0.034943734664523225, "grad_norm": 7.8125, "learning_rate": 0.0009973319476918989, "loss": 1.5316, "step": 413 }, { "epoch": 0.03502834419155597, "grad_norm": 6.0, "learning_rate": 0.0009973181894454201, "loss": 1.6886, "step": 414 }, { "epoch": 0.03511295371858871, "grad_norm": 7.34375, "learning_rate": 0.000997304395912177, "loss": 1.5605, "step": 415 }, { "epoch": 0.035197563245621455, "grad_norm": 5.96875, "learning_rate": 0.0009972905670931481, "loss": 1.6617, "step": 416 }, { "epoch": 0.0352821727726542, "grad_norm": 115.0, "learning_rate": 0.0009972767029893144, "loss": 1.4393, "step": 417 }, { "epoch": 0.03536678229968694, "grad_norm": 7.3125, "learning_rate": 0.0009972628036016597, "loss": 1.658, "step": 418 }, { "epoch": 0.03545139182671969, "grad_norm": 4.34375, "learning_rate": 0.0009972488689311703, "loss": 1.4791, "step": 419 }, { "epoch": 0.035536001353752435, "grad_norm": 4.34375, "learning_rate": 0.0009972348989788352, "loss": 1.9138, "step": 420 }, { "epoch": 0.03562061088078518, "grad_norm": 3.9375, "learning_rate": 0.000997220893745645, "loss": 1.6945, "step": 421 }, { "epoch": 0.03570522040781792, "grad_norm": 3.640625, "learning_rate": 0.0009972068532325938, "loss": 1.531, "step": 422 }, { "epoch": 0.035789829934850664, "grad_norm": 6.375, "learning_rate": 0.0009971927774406779, "loss": 1.2508, "step": 423 }, { "epoch": 0.03587443946188341, "grad_norm": 4.15625, "learning_rate": 0.000997178666370896, "loss": 1.8392, "step": 424 }, { "epoch": 0.03595904898891615, "grad_norm": 6.21875, "learning_rate": 0.0009971645200242492, "loss": 2.0438, "step": 425 }, { "epoch": 0.036043658515948894, "grad_norm": 2.890625, "learning_rate": 0.0009971503384017413, "loss": 1.1425, "step": 426 }, { "epoch": 0.03612826804298164, "grad_norm": 2.796875, "learning_rate": 0.0009971361215043784, "loss": 1.6439, "step": 427 }, { "epoch": 0.03621287757001438, "grad_norm": 3.484375, "learning_rate": 0.0009971218693331696, "loss": 1.4733, "step": 428 }, { "epoch": 0.03629748709704713, "grad_norm": 2.75, "learning_rate": 0.000997107581889126, "loss": 1.4184, "step": 429 }, { "epoch": 0.036382096624079874, "grad_norm": 3.78125, "learning_rate": 0.0009970932591732613, "loss": 1.8347, "step": 430 }, { "epoch": 0.03646670615111262, "grad_norm": 2.328125, "learning_rate": 0.0009970789011865915, "loss": 1.5557, "step": 431 }, { "epoch": 0.03655131567814536, "grad_norm": 2.875, "learning_rate": 0.0009970645079301359, "loss": 1.624, "step": 432 }, { "epoch": 0.0366359252051781, "grad_norm": 2.984375, "learning_rate": 0.0009970500794049153, "loss": 1.3857, "step": 433 }, { "epoch": 0.03672053473221085, "grad_norm": 2.8125, "learning_rate": 0.0009970356156119538, "loss": 1.4808, "step": 434 }, { "epoch": 0.03680514425924359, "grad_norm": 5.34375, "learning_rate": 0.0009970211165522774, "loss": 1.2682, "step": 435 }, { "epoch": 0.03688975378627633, "grad_norm": 7.84375, "learning_rate": 0.000997006582226915, "loss": 1.4656, "step": 436 }, { "epoch": 0.036974363313309076, "grad_norm": 4.21875, "learning_rate": 0.000996992012636898, "loss": 1.4898, "step": 437 }, { "epoch": 0.03705897284034182, "grad_norm": 6.46875, "learning_rate": 0.0009969774077832597, "loss": 1.5521, "step": 438 }, { "epoch": 0.03714358236737457, "grad_norm": 2.921875, "learning_rate": 0.000996962767667037, "loss": 1.1943, "step": 439 }, { "epoch": 0.03722819189440731, "grad_norm": 2.421875, "learning_rate": 0.0009969480922892681, "loss": 1.3804, "step": 440 }, { "epoch": 0.037312801421440056, "grad_norm": 3.8125, "learning_rate": 0.0009969333816509946, "loss": 1.6806, "step": 441 }, { "epoch": 0.0373974109484728, "grad_norm": 10.1875, "learning_rate": 0.0009969186357532604, "loss": 2.4735, "step": 442 }, { "epoch": 0.03748202047550554, "grad_norm": 4.5625, "learning_rate": 0.0009969038545971115, "loss": 1.4318, "step": 443 }, { "epoch": 0.037566630002538286, "grad_norm": 6.40625, "learning_rate": 0.0009968890381835968, "loss": 1.7157, "step": 444 }, { "epoch": 0.03765123952957103, "grad_norm": 2.953125, "learning_rate": 0.0009968741865137676, "loss": 2.0397, "step": 445 }, { "epoch": 0.03773584905660377, "grad_norm": 3.984375, "learning_rate": 0.0009968592995886778, "loss": 1.4634, "step": 446 }, { "epoch": 0.037820458583636515, "grad_norm": 4.28125, "learning_rate": 0.0009968443774093835, "loss": 1.4054, "step": 447 }, { "epoch": 0.03790506811066926, "grad_norm": 2.984375, "learning_rate": 0.0009968294199769433, "loss": 1.3067, "step": 448 }, { "epoch": 0.037989677637702, "grad_norm": 2.421875, "learning_rate": 0.0009968144272924188, "loss": 1.1902, "step": 449 }, { "epoch": 0.03807428716473475, "grad_norm": 2.953125, "learning_rate": 0.0009967993993568738, "loss": 1.284, "step": 450 }, { "epoch": 0.038158896691767495, "grad_norm": 3.28125, "learning_rate": 0.0009967843361713747, "loss": 1.7728, "step": 451 }, { "epoch": 0.03824350621880024, "grad_norm": 2.890625, "learning_rate": 0.0009967692377369898, "loss": 2.0907, "step": 452 }, { "epoch": 0.03832811574583298, "grad_norm": 2.1875, "learning_rate": 0.000996754104054791, "loss": 1.5884, "step": 453 }, { "epoch": 0.038412725272865725, "grad_norm": 2.640625, "learning_rate": 0.0009967389351258516, "loss": 1.4421, "step": 454 }, { "epoch": 0.03849733479989847, "grad_norm": 3.1875, "learning_rate": 0.0009967237309512482, "loss": 1.4248, "step": 455 }, { "epoch": 0.03858194432693121, "grad_norm": 7.09375, "learning_rate": 0.0009967084915320594, "loss": 1.588, "step": 456 }, { "epoch": 0.038666553853963954, "grad_norm": 1112.0, "learning_rate": 0.0009966932168693668, "loss": 1.2521, "step": 457 }, { "epoch": 0.0387511633809967, "grad_norm": 266.0, "learning_rate": 0.0009966779069642538, "loss": 1.5216, "step": 458 }, { "epoch": 0.03883577290802944, "grad_norm": 5.5625, "learning_rate": 0.0009966625618178068, "loss": 2.8209, "step": 459 }, { "epoch": 0.03892038243506219, "grad_norm": 3.171875, "learning_rate": 0.0009966471814311152, "loss": 1.7797, "step": 460 }, { "epoch": 0.039004991962094934, "grad_norm": 3.828125, "learning_rate": 0.0009966317658052695, "loss": 2.0008, "step": 461 }, { "epoch": 0.03908960148912768, "grad_norm": 3.578125, "learning_rate": 0.0009966163149413637, "loss": 1.947, "step": 462 }, { "epoch": 0.03917421101616042, "grad_norm": 2.6875, "learning_rate": 0.0009966008288404943, "loss": 1.8034, "step": 463 }, { "epoch": 0.039258820543193164, "grad_norm": 2.71875, "learning_rate": 0.00099658530750376, "loss": 1.4772, "step": 464 }, { "epoch": 0.03934343007022591, "grad_norm": 2.421875, "learning_rate": 0.0009965697509322622, "loss": 1.603, "step": 465 }, { "epoch": 0.03942803959725865, "grad_norm": 2.1875, "learning_rate": 0.0009965541591271045, "loss": 1.4341, "step": 466 }, { "epoch": 0.039512649124291394, "grad_norm": 2.328125, "learning_rate": 0.0009965385320893933, "loss": 1.594, "step": 467 }, { "epoch": 0.03959725865132414, "grad_norm": 2.21875, "learning_rate": 0.0009965228698202374, "loss": 1.7873, "step": 468 }, { "epoch": 0.03968186817835688, "grad_norm": 2.0, "learning_rate": 0.0009965071723207484, "loss": 1.5389, "step": 469 }, { "epoch": 0.03976647770538963, "grad_norm": 1.4609375, "learning_rate": 0.0009964914395920396, "loss": 1.3452, "step": 470 }, { "epoch": 0.03985108723242237, "grad_norm": 1.7421875, "learning_rate": 0.0009964756716352276, "loss": 1.5352, "step": 471 }, { "epoch": 0.03993569675945512, "grad_norm": 2.265625, "learning_rate": 0.0009964598684514313, "loss": 1.0926, "step": 472 }, { "epoch": 0.04002030628648786, "grad_norm": 2.15625, "learning_rate": 0.0009964440300417716, "loss": 1.4208, "step": 473 }, { "epoch": 0.0401049158135206, "grad_norm": 2.53125, "learning_rate": 0.0009964281564073725, "loss": 1.2632, "step": 474 }, { "epoch": 0.040189525340553346, "grad_norm": 3.828125, "learning_rate": 0.0009964122475493605, "loss": 1.5355, "step": 475 }, { "epoch": 0.04027413486758609, "grad_norm": 2.546875, "learning_rate": 0.000996396303468864, "loss": 1.8134, "step": 476 }, { "epoch": 0.04035874439461883, "grad_norm": 3.3125, "learning_rate": 0.0009963803241670147, "loss": 1.6089, "step": 477 }, { "epoch": 0.040443353921651576, "grad_norm": 2.796875, "learning_rate": 0.0009963643096449462, "loss": 1.5777, "step": 478 }, { "epoch": 0.04052796344868432, "grad_norm": 3.71875, "learning_rate": 0.0009963482599037946, "loss": 1.6556, "step": 479 }, { "epoch": 0.04061257297571707, "grad_norm": 18.625, "learning_rate": 0.0009963321749446993, "loss": 1.472, "step": 480 }, { "epoch": 0.04069718250274981, "grad_norm": 2.203125, "learning_rate": 0.000996316054768801, "loss": 1.401, "step": 481 }, { "epoch": 0.040781792029782556, "grad_norm": 2.578125, "learning_rate": 0.0009962998993772439, "loss": 1.4282, "step": 482 }, { "epoch": 0.0408664015568153, "grad_norm": 3.09375, "learning_rate": 0.0009962837087711738, "loss": 1.6441, "step": 483 }, { "epoch": 0.04095101108384804, "grad_norm": 3.6875, "learning_rate": 0.00099626748295174, "loss": 2.364, "step": 484 }, { "epoch": 0.041035620610880785, "grad_norm": 4.21875, "learning_rate": 0.0009962512219200935, "loss": 2.0495, "step": 485 }, { "epoch": 0.04112023013791353, "grad_norm": 3.34375, "learning_rate": 0.000996234925677388, "loss": 1.4075, "step": 486 }, { "epoch": 0.04120483966494627, "grad_norm": 4.34375, "learning_rate": 0.0009962185942247802, "loss": 1.7289, "step": 487 }, { "epoch": 0.041289449191979015, "grad_norm": 4.15625, "learning_rate": 0.0009962022275634285, "loss": 1.3854, "step": 488 }, { "epoch": 0.04137405871901176, "grad_norm": 3.421875, "learning_rate": 0.0009961858256944942, "loss": 1.6097, "step": 489 }, { "epoch": 0.0414586682460445, "grad_norm": 6.03125, "learning_rate": 0.0009961693886191413, "loss": 1.5453, "step": 490 }, { "epoch": 0.04154327777307725, "grad_norm": 3.203125, "learning_rate": 0.0009961529163385358, "loss": 1.6348, "step": 491 }, { "epoch": 0.041627887300109995, "grad_norm": 2.015625, "learning_rate": 0.000996136408853847, "loss": 1.3966, "step": 492 }, { "epoch": 0.04171249682714274, "grad_norm": 2.75, "learning_rate": 0.0009961198661662455, "loss": 1.8525, "step": 493 }, { "epoch": 0.04179710635417548, "grad_norm": 2.078125, "learning_rate": 0.0009961032882769055, "loss": 1.5937, "step": 494 }, { "epoch": 0.041881715881208224, "grad_norm": 2.828125, "learning_rate": 0.0009960866751870033, "loss": 1.9505, "step": 495 }, { "epoch": 0.04196632540824097, "grad_norm": 3.640625, "learning_rate": 0.0009960700268977173, "loss": 1.7248, "step": 496 }, { "epoch": 0.04205093493527371, "grad_norm": 2.875, "learning_rate": 0.000996053343410229, "loss": 1.8714, "step": 497 }, { "epoch": 0.042135544462306454, "grad_norm": 2.578125, "learning_rate": 0.0009960366247257222, "loss": 1.2586, "step": 498 }, { "epoch": 0.0422201539893392, "grad_norm": 2.71875, "learning_rate": 0.0009960198708453831, "loss": 1.796, "step": 499 }, { "epoch": 0.04230476351637194, "grad_norm": 2.4375, "learning_rate": 0.0009960030817704004, "loss": 1.4065, "step": 500 }, { "epoch": 0.04238937304340469, "grad_norm": 3.296875, "learning_rate": 0.0009959862575019656, "loss": 1.2108, "step": 501 }, { "epoch": 0.042473982570437434, "grad_norm": 2.4375, "learning_rate": 0.0009959693980412722, "loss": 1.6824, "step": 502 }, { "epoch": 0.04255859209747018, "grad_norm": 2.390625, "learning_rate": 0.0009959525033895165, "loss": 1.4973, "step": 503 }, { "epoch": 0.04264320162450292, "grad_norm": 9.9375, "learning_rate": 0.0009959355735478972, "loss": 1.399, "step": 504 }, { "epoch": 0.04272781115153566, "grad_norm": 2.6875, "learning_rate": 0.0009959186085176158, "loss": 2.0055, "step": 505 }, { "epoch": 0.04281242067856841, "grad_norm": 2.28125, "learning_rate": 0.0009959016082998755, "loss": 1.3033, "step": 506 }, { "epoch": 0.04289703020560115, "grad_norm": 2.625, "learning_rate": 0.0009958845728958831, "loss": 2.1366, "step": 507 }, { "epoch": 0.04298163973263389, "grad_norm": 2.15625, "learning_rate": 0.000995867502306847, "loss": 1.4802, "step": 508 }, { "epoch": 0.043066249259666636, "grad_norm": 3.0625, "learning_rate": 0.0009958503965339788, "loss": 2.423, "step": 509 }, { "epoch": 0.04315085878669938, "grad_norm": 2.34375, "learning_rate": 0.0009958332555784918, "loss": 1.7215, "step": 510 }, { "epoch": 0.04323546831373213, "grad_norm": 2.578125, "learning_rate": 0.0009958160794416022, "loss": 1.6019, "step": 511 }, { "epoch": 0.04332007784076487, "grad_norm": 2.109375, "learning_rate": 0.000995798868124529, "loss": 1.3312, "step": 512 }, { "epoch": 0.043404687367797616, "grad_norm": 2.515625, "learning_rate": 0.0009957816216284935, "loss": 1.6573, "step": 513 }, { "epoch": 0.04348929689483036, "grad_norm": 2.21875, "learning_rate": 0.0009957643399547192, "loss": 1.3591, "step": 514 }, { "epoch": 0.0435739064218631, "grad_norm": 3.046875, "learning_rate": 0.0009957470231044322, "loss": 1.4667, "step": 515 }, { "epoch": 0.043658515948895846, "grad_norm": 2.125, "learning_rate": 0.0009957296710788612, "loss": 1.4413, "step": 516 }, { "epoch": 0.04374312547592859, "grad_norm": 2.3125, "learning_rate": 0.0009957122838792378, "loss": 1.1131, "step": 517 }, { "epoch": 0.04382773500296133, "grad_norm": 2.078125, "learning_rate": 0.0009956948615067952, "loss": 1.2172, "step": 518 }, { "epoch": 0.043912344529994075, "grad_norm": 2.75, "learning_rate": 0.00099567740396277, "loss": 1.8067, "step": 519 }, { "epoch": 0.04399695405702682, "grad_norm": 2.5625, "learning_rate": 0.0009956599112484006, "loss": 1.5125, "step": 520 }, { "epoch": 0.04408156358405956, "grad_norm": 1.984375, "learning_rate": 0.0009956423833649283, "loss": 1.5085, "step": 521 }, { "epoch": 0.04416617311109231, "grad_norm": 2.421875, "learning_rate": 0.0009956248203135966, "loss": 1.5852, "step": 522 }, { "epoch": 0.044250782638125055, "grad_norm": 2.796875, "learning_rate": 0.0009956072220956521, "loss": 1.1601, "step": 523 }, { "epoch": 0.0443353921651578, "grad_norm": 2.953125, "learning_rate": 0.000995589588712343, "loss": 1.6977, "step": 524 }, { "epoch": 0.04442000169219054, "grad_norm": 3.359375, "learning_rate": 0.0009955719201649206, "loss": 1.9036, "step": 525 }, { "epoch": 0.044504611219223285, "grad_norm": 2.5, "learning_rate": 0.0009955542164546388, "loss": 1.4687, "step": 526 }, { "epoch": 0.04458922074625603, "grad_norm": 3.0625, "learning_rate": 0.0009955364775827535, "loss": 1.9439, "step": 527 }, { "epoch": 0.04467383027328877, "grad_norm": 466.0, "learning_rate": 0.0009955187035505233, "loss": 1.6088, "step": 528 }, { "epoch": 0.044758439800321514, "grad_norm": 2.90625, "learning_rate": 0.0009955008943592094, "loss": 1.9631, "step": 529 }, { "epoch": 0.04484304932735426, "grad_norm": 2.40625, "learning_rate": 0.0009954830500100756, "loss": 1.3634, "step": 530 }, { "epoch": 0.044927658854387, "grad_norm": 3.140625, "learning_rate": 0.0009954651705043878, "loss": 1.8, "step": 531 }, { "epoch": 0.04501226838141975, "grad_norm": 6.59375, "learning_rate": 0.0009954472558434148, "loss": 1.5134, "step": 532 }, { "epoch": 0.045096877908452494, "grad_norm": 3.046875, "learning_rate": 0.0009954293060284276, "loss": 1.6015, "step": 533 }, { "epoch": 0.04518148743548524, "grad_norm": 2.578125, "learning_rate": 0.0009954113210606998, "loss": 1.4167, "step": 534 }, { "epoch": 0.04526609696251798, "grad_norm": 4.65625, "learning_rate": 0.0009953933009415076, "loss": 1.9478, "step": 535 }, { "epoch": 0.045350706489550724, "grad_norm": 3.59375, "learning_rate": 0.0009953752456721297, "loss": 1.6296, "step": 536 }, { "epoch": 0.04543531601658347, "grad_norm": 2.640625, "learning_rate": 0.000995357155253847, "loss": 1.7564, "step": 537 }, { "epoch": 0.04551992554361621, "grad_norm": 3.890625, "learning_rate": 0.0009953390296879431, "loss": 1.3776, "step": 538 }, { "epoch": 0.045604535070648954, "grad_norm": 3.984375, "learning_rate": 0.0009953208689757044, "loss": 1.5307, "step": 539 }, { "epoch": 0.0456891445976817, "grad_norm": 3.03125, "learning_rate": 0.0009953026731184189, "loss": 1.4973, "step": 540 }, { "epoch": 0.04577375412471444, "grad_norm": 2.296875, "learning_rate": 0.0009952844421173782, "loss": 1.9153, "step": 541 }, { "epoch": 0.04585836365174719, "grad_norm": 3.296875, "learning_rate": 0.0009952661759738753, "loss": 1.9208, "step": 542 }, { "epoch": 0.04594297317877993, "grad_norm": 4.5, "learning_rate": 0.0009952478746892068, "loss": 1.2596, "step": 543 }, { "epoch": 0.04602758270581268, "grad_norm": 46.75, "learning_rate": 0.0009952295382646712, "loss": 1.3377, "step": 544 }, { "epoch": 0.04611219223284542, "grad_norm": 10.125, "learning_rate": 0.0009952111667015694, "loss": 2.066, "step": 545 }, { "epoch": 0.04619680175987816, "grad_norm": 2.96875, "learning_rate": 0.0009951927600012048, "loss": 1.248, "step": 546 }, { "epoch": 0.046281411286910906, "grad_norm": 4.78125, "learning_rate": 0.0009951743181648835, "loss": 1.3788, "step": 547 }, { "epoch": 0.04636602081394365, "grad_norm": 2.828125, "learning_rate": 0.0009951558411939142, "loss": 1.508, "step": 548 }, { "epoch": 0.04645063034097639, "grad_norm": 2.953125, "learning_rate": 0.0009951373290896077, "loss": 1.1485, "step": 549 }, { "epoch": 0.046535239868009136, "grad_norm": 5.25, "learning_rate": 0.0009951187818532778, "loss": 1.14, "step": 550 }, { "epoch": 0.04661984939504188, "grad_norm": 2.796875, "learning_rate": 0.00099510019948624, "loss": 1.6612, "step": 551 }, { "epoch": 0.04670445892207462, "grad_norm": 2.640625, "learning_rate": 0.0009950815819898136, "loss": 1.694, "step": 552 }, { "epoch": 0.04678906844910737, "grad_norm": 2.359375, "learning_rate": 0.0009950629293653188, "loss": 1.5725, "step": 553 }, { "epoch": 0.046873677976140116, "grad_norm": 2.328125, "learning_rate": 0.0009950442416140795, "loss": 1.4235, "step": 554 }, { "epoch": 0.04695828750317286, "grad_norm": 2.234375, "learning_rate": 0.0009950255187374216, "loss": 1.1938, "step": 555 }, { "epoch": 0.0470428970302056, "grad_norm": 2.234375, "learning_rate": 0.0009950067607366732, "loss": 1.648, "step": 556 }, { "epoch": 0.047127506557238345, "grad_norm": 2.09375, "learning_rate": 0.000994987967613166, "loss": 1.5901, "step": 557 }, { "epoch": 0.04721211608427109, "grad_norm": 3.125, "learning_rate": 0.0009949691393682327, "loss": 2.1817, "step": 558 }, { "epoch": 0.04729672561130383, "grad_norm": 2.390625, "learning_rate": 0.0009949502760032097, "loss": 1.7478, "step": 559 }, { "epoch": 0.047381335138336575, "grad_norm": 3.28125, "learning_rate": 0.0009949313775194353, "loss": 1.5831, "step": 560 }, { "epoch": 0.04746594466536932, "grad_norm": 2.4375, "learning_rate": 0.0009949124439182506, "loss": 1.1959, "step": 561 }, { "epoch": 0.04755055419240206, "grad_norm": 5.40625, "learning_rate": 0.0009948934752009985, "loss": 1.5904, "step": 562 }, { "epoch": 0.04763516371943481, "grad_norm": 2.90625, "learning_rate": 0.0009948744713690256, "loss": 1.669, "step": 563 }, { "epoch": 0.047719773246467555, "grad_norm": 3.453125, "learning_rate": 0.0009948554324236798, "loss": 1.5525, "step": 564 }, { "epoch": 0.0478043827735003, "grad_norm": 2.765625, "learning_rate": 0.0009948363583663121, "loss": 1.3652, "step": 565 }, { "epoch": 0.04788899230053304, "grad_norm": 23.75, "learning_rate": 0.000994817249198276, "loss": 1.809, "step": 566 }, { "epoch": 0.047973601827565784, "grad_norm": 6.9375, "learning_rate": 0.000994798104920927, "loss": 2.4332, "step": 567 }, { "epoch": 0.04805821135459853, "grad_norm": 14.375, "learning_rate": 0.0009947789255356243, "loss": 1.3239, "step": 568 }, { "epoch": 0.04814282088163127, "grad_norm": 5.0, "learning_rate": 0.0009947597110437278, "loss": 1.5916, "step": 569 }, { "epoch": 0.048227430408664014, "grad_norm": 2.96875, "learning_rate": 0.0009947404614466017, "loss": 1.8177, "step": 570 }, { "epoch": 0.04831203993569676, "grad_norm": 10.9375, "learning_rate": 0.000994721176745611, "loss": 1.3777, "step": 571 }, { "epoch": 0.0483966494627295, "grad_norm": 3.28125, "learning_rate": 0.0009947018569421246, "loss": 2.0263, "step": 572 }, { "epoch": 0.04848125898976225, "grad_norm": 4.03125, "learning_rate": 0.0009946825020375131, "loss": 1.9254, "step": 573 }, { "epoch": 0.048565868516794994, "grad_norm": 3.34375, "learning_rate": 0.00099466311203315, "loss": 1.3581, "step": 574 }, { "epoch": 0.04865047804382774, "grad_norm": 3.75, "learning_rate": 0.0009946436869304107, "loss": 2.6882, "step": 575 }, { "epoch": 0.04873508757086048, "grad_norm": 4.6875, "learning_rate": 0.0009946242267306739, "loss": 1.1264, "step": 576 }, { "epoch": 0.048819697097893223, "grad_norm": 6.34375, "learning_rate": 0.0009946047314353204, "loss": 1.405, "step": 577 }, { "epoch": 0.04890430662492597, "grad_norm": 2.484375, "learning_rate": 0.000994585201045733, "loss": 1.4567, "step": 578 }, { "epoch": 0.04898891615195871, "grad_norm": 2.65625, "learning_rate": 0.000994565635563298, "loss": 1.2529, "step": 579 }, { "epoch": 0.04907352567899145, "grad_norm": 2.765625, "learning_rate": 0.0009945460349894033, "loss": 1.725, "step": 580 }, { "epoch": 0.049158135206024196, "grad_norm": 3.203125, "learning_rate": 0.0009945263993254398, "loss": 1.7213, "step": 581 }, { "epoch": 0.04924274473305694, "grad_norm": 3.8125, "learning_rate": 0.0009945067285728007, "loss": 2.0613, "step": 582 }, { "epoch": 0.04932735426008968, "grad_norm": 3.453125, "learning_rate": 0.0009944870227328816, "loss": 1.7861, "step": 583 }, { "epoch": 0.04941196378712243, "grad_norm": 3.09375, "learning_rate": 0.000994467281807081, "loss": 1.6237, "step": 584 }, { "epoch": 0.049496573314155176, "grad_norm": 4.125, "learning_rate": 0.0009944475057967995, "loss": 1.434, "step": 585 }, { "epoch": 0.04958118284118792, "grad_norm": 2.5, "learning_rate": 0.00099442769470344, "loss": 1.276, "step": 586 }, { "epoch": 0.04966579236822066, "grad_norm": 2.921875, "learning_rate": 0.0009944078485284084, "loss": 1.6379, "step": 587 }, { "epoch": 0.049750401895253406, "grad_norm": 2.90625, "learning_rate": 0.000994387967273113, "loss": 2.2163, "step": 588 }, { "epoch": 0.04983501142228615, "grad_norm": 2.53125, "learning_rate": 0.000994368050938964, "loss": 1.1714, "step": 589 }, { "epoch": 0.04991962094931889, "grad_norm": 2.265625, "learning_rate": 0.0009943480995273752, "loss": 1.6011, "step": 590 }, { "epoch": 0.050004230476351635, "grad_norm": 2.53125, "learning_rate": 0.0009943281130397618, "loss": 1.8517, "step": 591 }, { "epoch": 0.05008884000338438, "grad_norm": 2.328125, "learning_rate": 0.000994308091477542, "loss": 1.7369, "step": 592 }, { "epoch": 0.05017344953041712, "grad_norm": 2.34375, "learning_rate": 0.0009942880348421364, "loss": 1.5299, "step": 593 }, { "epoch": 0.05025805905744987, "grad_norm": 3.71875, "learning_rate": 0.0009942679431349682, "loss": 1.8462, "step": 594 }, { "epoch": 0.050342668584482615, "grad_norm": 1.9921875, "learning_rate": 0.000994247816357463, "loss": 1.1763, "step": 595 }, { "epoch": 0.05042727811151536, "grad_norm": 2.125, "learning_rate": 0.0009942276545110485, "loss": 1.6205, "step": 596 }, { "epoch": 0.0505118876385481, "grad_norm": 2.109375, "learning_rate": 0.0009942074575971557, "loss": 1.674, "step": 597 }, { "epoch": 0.050596497165580845, "grad_norm": 2.015625, "learning_rate": 0.0009941872256172175, "loss": 1.2339, "step": 598 }, { "epoch": 0.05068110669261359, "grad_norm": 1.9296875, "learning_rate": 0.0009941669585726697, "loss": 1.473, "step": 599 }, { "epoch": 0.05076571621964633, "grad_norm": 2.609375, "learning_rate": 0.0009941466564649497, "loss": 1.6188, "step": 600 }, { "epoch": 0.050850325746679075, "grad_norm": 2.21875, "learning_rate": 0.0009941263192954986, "loss": 1.3364, "step": 601 }, { "epoch": 0.05093493527371182, "grad_norm": 2.625, "learning_rate": 0.0009941059470657593, "loss": 1.3214, "step": 602 }, { "epoch": 0.05101954480074456, "grad_norm": 4.6875, "learning_rate": 0.0009940855397771772, "loss": 1.5279, "step": 603 }, { "epoch": 0.05110415432777731, "grad_norm": 2.78125, "learning_rate": 0.0009940650974312, "loss": 1.4447, "step": 604 }, { "epoch": 0.051188763854810054, "grad_norm": 3.34375, "learning_rate": 0.0009940446200292787, "loss": 1.9854, "step": 605 }, { "epoch": 0.0512733733818428, "grad_norm": 2.890625, "learning_rate": 0.000994024107572866, "loss": 2.004, "step": 606 }, { "epoch": 0.05135798290887554, "grad_norm": 2.484375, "learning_rate": 0.0009940035600634171, "loss": 1.4318, "step": 607 }, { "epoch": 0.051442592435908284, "grad_norm": 2.8125, "learning_rate": 0.0009939829775023905, "loss": 2.39, "step": 608 }, { "epoch": 0.05152720196294103, "grad_norm": 2.640625, "learning_rate": 0.000993962359891246, "loss": 1.607, "step": 609 }, { "epoch": 0.05161181148997377, "grad_norm": 2.546875, "learning_rate": 0.0009939417072314472, "loss": 1.7664, "step": 610 }, { "epoch": 0.051696421017006514, "grad_norm": 2.03125, "learning_rate": 0.000993921019524459, "loss": 1.537, "step": 611 }, { "epoch": 0.05178103054403926, "grad_norm": 2.5625, "learning_rate": 0.0009939002967717491, "loss": 1.7743, "step": 612 }, { "epoch": 0.051865640071072, "grad_norm": 2.671875, "learning_rate": 0.0009938795389747884, "loss": 1.266, "step": 613 }, { "epoch": 0.05195024959810474, "grad_norm": 6.75, "learning_rate": 0.0009938587461350492, "loss": 2.0028, "step": 614 }, { "epoch": 0.05203485912513749, "grad_norm": 2.828125, "learning_rate": 0.0009938379182540074, "loss": 1.4361, "step": 615 }, { "epoch": 0.05211946865217024, "grad_norm": 3.296875, "learning_rate": 0.0009938170553331406, "loss": 1.9379, "step": 616 }, { "epoch": 0.05220407817920298, "grad_norm": 2.84375, "learning_rate": 0.0009937961573739287, "loss": 2.0253, "step": 617 }, { "epoch": 0.05228868770623572, "grad_norm": 2.515625, "learning_rate": 0.000993775224377855, "loss": 1.8613, "step": 618 }, { "epoch": 0.052373297233268466, "grad_norm": 1.8984375, "learning_rate": 0.0009937542563464047, "loss": 1.1693, "step": 619 }, { "epoch": 0.05245790676030121, "grad_norm": 2.484375, "learning_rate": 0.0009937332532810655, "loss": 1.6421, "step": 620 }, { "epoch": 0.05254251628733395, "grad_norm": 1.96875, "learning_rate": 0.0009937122151833279, "loss": 1.3297, "step": 621 }, { "epoch": 0.052627125814366696, "grad_norm": 2.34375, "learning_rate": 0.0009936911420546839, "loss": 2.0026, "step": 622 }, { "epoch": 0.05271173534139944, "grad_norm": 2.34375, "learning_rate": 0.0009936700338966295, "loss": 1.5152, "step": 623 }, { "epoch": 0.05279634486843218, "grad_norm": 2.25, "learning_rate": 0.000993648890710662, "loss": 1.6053, "step": 624 }, { "epoch": 0.05288095439546493, "grad_norm": 3.40625, "learning_rate": 0.000993627712498282, "loss": 1.2965, "step": 625 }, { "epoch": 0.052965563922497676, "grad_norm": 2.984375, "learning_rate": 0.0009936064992609917, "loss": 1.3589, "step": 626 }, { "epoch": 0.05305017344953042, "grad_norm": 5.09375, "learning_rate": 0.0009935852510002965, "loss": 2.1085, "step": 627 }, { "epoch": 0.05313478297656316, "grad_norm": 3.21875, "learning_rate": 0.000993563967717704, "loss": 1.7981, "step": 628 }, { "epoch": 0.053219392503595905, "grad_norm": 2.40625, "learning_rate": 0.0009935426494147243, "loss": 1.3866, "step": 629 }, { "epoch": 0.05330400203062865, "grad_norm": 3.078125, "learning_rate": 0.00099352129609287, "loss": 1.7455, "step": 630 }, { "epoch": 0.05338861155766139, "grad_norm": 2.421875, "learning_rate": 0.0009934999077536566, "loss": 1.417, "step": 631 }, { "epoch": 0.053473221084694135, "grad_norm": 3.4375, "learning_rate": 0.0009934784843986014, "loss": 1.609, "step": 632 }, { "epoch": 0.05355783061172688, "grad_norm": 4.15625, "learning_rate": 0.0009934570260292242, "loss": 1.5699, "step": 633 }, { "epoch": 0.05364244013875962, "grad_norm": 4.125, "learning_rate": 0.000993435532647048, "loss": 1.3726, "step": 634 }, { "epoch": 0.05372704966579237, "grad_norm": 3.046875, "learning_rate": 0.0009934140042535974, "loss": 1.7899, "step": 635 }, { "epoch": 0.053811659192825115, "grad_norm": 2.515625, "learning_rate": 0.0009933924408504003, "loss": 1.9819, "step": 636 }, { "epoch": 0.05389626871985786, "grad_norm": 2.765625, "learning_rate": 0.0009933708424389868, "loss": 2.1737, "step": 637 }, { "epoch": 0.0539808782468906, "grad_norm": 2.203125, "learning_rate": 0.0009933492090208888, "loss": 1.491, "step": 638 }, { "epoch": 0.054065487773923344, "grad_norm": 2.234375, "learning_rate": 0.000993327540597642, "loss": 1.7265, "step": 639 }, { "epoch": 0.05415009730095609, "grad_norm": 2.34375, "learning_rate": 0.0009933058371707833, "loss": 1.9788, "step": 640 }, { "epoch": 0.05423470682798883, "grad_norm": 2.046875, "learning_rate": 0.0009932840987418531, "loss": 1.5674, "step": 641 }, { "epoch": 0.054319316355021574, "grad_norm": 2.484375, "learning_rate": 0.0009932623253123935, "loss": 1.7566, "step": 642 }, { "epoch": 0.05440392588205432, "grad_norm": 2.3125, "learning_rate": 0.0009932405168839495, "loss": 1.6038, "step": 643 }, { "epoch": 0.05448853540908706, "grad_norm": 2.625, "learning_rate": 0.0009932186734580685, "loss": 1.7238, "step": 644 }, { "epoch": 0.054573144936119804, "grad_norm": 2.125, "learning_rate": 0.0009931967950363005, "loss": 1.1529, "step": 645 }, { "epoch": 0.054657754463152554, "grad_norm": 2.46875, "learning_rate": 0.0009931748816201977, "loss": 1.6076, "step": 646 }, { "epoch": 0.0547423639901853, "grad_norm": 2.375, "learning_rate": 0.0009931529332113153, "loss": 1.53, "step": 647 }, { "epoch": 0.05482697351721804, "grad_norm": 2.109375, "learning_rate": 0.00099313094981121, "loss": 1.2956, "step": 648 }, { "epoch": 0.054911583044250784, "grad_norm": 2.15625, "learning_rate": 0.000993108931421442, "loss": 1.419, "step": 649 }, { "epoch": 0.05499619257128353, "grad_norm": 3.203125, "learning_rate": 0.0009930868780435736, "loss": 1.7091, "step": 650 }, { "epoch": 0.05508080209831627, "grad_norm": 2.703125, "learning_rate": 0.0009930647896791696, "loss": 1.5703, "step": 651 }, { "epoch": 0.05516541162534901, "grad_norm": 1.9921875, "learning_rate": 0.000993042666329797, "loss": 1.4627, "step": 652 }, { "epoch": 0.055250021152381756, "grad_norm": 1.6328125, "learning_rate": 0.000993020507997026, "loss": 1.1559, "step": 653 }, { "epoch": 0.0553346306794145, "grad_norm": 2.0625, "learning_rate": 0.0009929983146824285, "loss": 1.4138, "step": 654 }, { "epoch": 0.05541924020644724, "grad_norm": 2.484375, "learning_rate": 0.000992976086387579, "loss": 1.4449, "step": 655 }, { "epoch": 0.05550384973347999, "grad_norm": 3.515625, "learning_rate": 0.0009929538231140552, "loss": 1.5909, "step": 656 }, { "epoch": 0.055588459260512736, "grad_norm": 1.8515625, "learning_rate": 0.0009929315248634362, "loss": 1.1685, "step": 657 }, { "epoch": 0.05567306878754548, "grad_norm": 1.7890625, "learning_rate": 0.000992909191637305, "loss": 1.3446, "step": 658 }, { "epoch": 0.05575767831457822, "grad_norm": 2.890625, "learning_rate": 0.0009928868234372452, "loss": 1.6815, "step": 659 }, { "epoch": 0.055842287841610966, "grad_norm": 2.078125, "learning_rate": 0.0009928644202648446, "loss": 1.393, "step": 660 }, { "epoch": 0.05592689736864371, "grad_norm": 5.28125, "learning_rate": 0.000992841982121693, "loss": 2.9083, "step": 661 }, { "epoch": 0.05601150689567645, "grad_norm": 2.015625, "learning_rate": 0.0009928195090093818, "loss": 1.6078, "step": 662 }, { "epoch": 0.056096116422709195, "grad_norm": 1.9375, "learning_rate": 0.0009927970009295057, "loss": 1.7307, "step": 663 }, { "epoch": 0.05618072594974194, "grad_norm": 2.171875, "learning_rate": 0.0009927744578836618, "loss": 1.7337, "step": 664 }, { "epoch": 0.05626533547677468, "grad_norm": 1.8046875, "learning_rate": 0.00099275187987345, "loss": 1.6907, "step": 665 }, { "epoch": 0.05634994500380743, "grad_norm": 2.09375, "learning_rate": 0.0009927292669004721, "loss": 1.4351, "step": 666 }, { "epoch": 0.056434554530840175, "grad_norm": 2.765625, "learning_rate": 0.0009927066189663323, "loss": 1.7726, "step": 667 }, { "epoch": 0.05651916405787292, "grad_norm": 2.40625, "learning_rate": 0.0009926839360726379, "loss": 1.2951, "step": 668 }, { "epoch": 0.05660377358490566, "grad_norm": 2.109375, "learning_rate": 0.0009926612182209983, "loss": 1.2836, "step": 669 }, { "epoch": 0.056688383111938405, "grad_norm": 2.65625, "learning_rate": 0.000992638465413025, "loss": 1.9349, "step": 670 }, { "epoch": 0.05677299263897115, "grad_norm": 2.3125, "learning_rate": 0.000992615677650333, "loss": 1.4464, "step": 671 }, { "epoch": 0.05685760216600389, "grad_norm": 2.71875, "learning_rate": 0.0009925928549345388, "loss": 1.9914, "step": 672 }, { "epoch": 0.056942211693036635, "grad_norm": 1.8984375, "learning_rate": 0.000992569997267262, "loss": 1.186, "step": 673 }, { "epoch": 0.05702682122006938, "grad_norm": 21.875, "learning_rate": 0.0009925471046501244, "loss": 1.4508, "step": 674 }, { "epoch": 0.05711143074710212, "grad_norm": 4.1875, "learning_rate": 0.0009925241770847501, "loss": 1.4313, "step": 675 }, { "epoch": 0.057196040274134864, "grad_norm": 2.078125, "learning_rate": 0.000992501214572766, "loss": 1.356, "step": 676 }, { "epoch": 0.057280649801167614, "grad_norm": 2.109375, "learning_rate": 0.0009924782171158019, "loss": 1.6079, "step": 677 }, { "epoch": 0.05736525932820036, "grad_norm": 2.546875, "learning_rate": 0.0009924551847154885, "loss": 1.2967, "step": 678 }, { "epoch": 0.0574498688552331, "grad_norm": 2.609375, "learning_rate": 0.0009924321173734611, "loss": 1.246, "step": 679 }, { "epoch": 0.057534478382265844, "grad_norm": 2.484375, "learning_rate": 0.0009924090150913558, "loss": 1.5219, "step": 680 }, { "epoch": 0.05761908790929859, "grad_norm": 2.875, "learning_rate": 0.0009923858778708122, "loss": 1.6032, "step": 681 }, { "epoch": 0.05770369743633133, "grad_norm": 2.53125, "learning_rate": 0.0009923627057134715, "loss": 2.0224, "step": 682 }, { "epoch": 0.057788306963364074, "grad_norm": 1.796875, "learning_rate": 0.0009923394986209781, "loss": 1.0825, "step": 683 }, { "epoch": 0.05787291649039682, "grad_norm": 2.828125, "learning_rate": 0.000992316256594979, "loss": 1.7923, "step": 684 }, { "epoch": 0.05795752601742956, "grad_norm": 1.8828125, "learning_rate": 0.0009922929796371228, "loss": 1.2554, "step": 685 }, { "epoch": 0.0580421355444623, "grad_norm": 2.109375, "learning_rate": 0.0009922696677490612, "loss": 1.477, "step": 686 }, { "epoch": 0.05812674507149505, "grad_norm": 2.234375, "learning_rate": 0.0009922463209324484, "loss": 1.9843, "step": 687 }, { "epoch": 0.0582113545985278, "grad_norm": 1.71875, "learning_rate": 0.0009922229391889409, "loss": 1.2976, "step": 688 }, { "epoch": 0.05829596412556054, "grad_norm": 2.40625, "learning_rate": 0.0009921995225201977, "loss": 1.3527, "step": 689 }, { "epoch": 0.05838057365259328, "grad_norm": 2.125, "learning_rate": 0.0009921760709278803, "loss": 1.4138, "step": 690 }, { "epoch": 0.058465183179626026, "grad_norm": 2.03125, "learning_rate": 0.000992152584413653, "loss": 1.2841, "step": 691 }, { "epoch": 0.05854979270665877, "grad_norm": 2.109375, "learning_rate": 0.0009921290629791818, "loss": 1.4886, "step": 692 }, { "epoch": 0.05863440223369151, "grad_norm": 1.9296875, "learning_rate": 0.0009921055066261356, "loss": 1.489, "step": 693 }, { "epoch": 0.058719011760724256, "grad_norm": 2.359375, "learning_rate": 0.0009920819153561864, "loss": 1.6962, "step": 694 }, { "epoch": 0.058803621287757, "grad_norm": 2.234375, "learning_rate": 0.0009920582891710075, "loss": 1.5303, "step": 695 }, { "epoch": 0.05888823081478974, "grad_norm": 2.078125, "learning_rate": 0.0009920346280722758, "loss": 1.5211, "step": 696 }, { "epoch": 0.05897284034182249, "grad_norm": 1.828125, "learning_rate": 0.0009920109320616698, "loss": 1.1595, "step": 697 }, { "epoch": 0.059057449868855236, "grad_norm": 2.265625, "learning_rate": 0.0009919872011408708, "loss": 1.8798, "step": 698 }, { "epoch": 0.05914205939588798, "grad_norm": 3.09375, "learning_rate": 0.0009919634353115628, "loss": 2.0343, "step": 699 }, { "epoch": 0.05922666892292072, "grad_norm": 2.734375, "learning_rate": 0.000991939634575432, "loss": 1.7153, "step": 700 }, { "epoch": 0.059311278449953465, "grad_norm": 2.59375, "learning_rate": 0.000991915798934167, "loss": 1.6021, "step": 701 }, { "epoch": 0.05939588797698621, "grad_norm": 2.75, "learning_rate": 0.0009918919283894594, "loss": 2.0145, "step": 702 }, { "epoch": 0.05948049750401895, "grad_norm": 10.8125, "learning_rate": 0.0009918680229430027, "loss": 1.7758, "step": 703 }, { "epoch": 0.059565107031051695, "grad_norm": 87.0, "learning_rate": 0.000991844082596493, "loss": 1.5483, "step": 704 }, { "epoch": 0.05964971655808444, "grad_norm": 17.0, "learning_rate": 0.000991820107351629, "loss": 1.4835, "step": 705 }, { "epoch": 0.05973432608511718, "grad_norm": 3.0, "learning_rate": 0.0009917960972101118, "loss": 1.5788, "step": 706 }, { "epoch": 0.059818935612149925, "grad_norm": 3.203125, "learning_rate": 0.0009917720521736453, "loss": 1.7573, "step": 707 }, { "epoch": 0.059903545139182675, "grad_norm": 4.625, "learning_rate": 0.0009917479722439356, "loss": 1.4461, "step": 708 }, { "epoch": 0.05998815466621542, "grad_norm": 3.140625, "learning_rate": 0.0009917238574226907, "loss": 2.0573, "step": 709 }, { "epoch": 0.06007276419324816, "grad_norm": 9.0625, "learning_rate": 0.0009916997077116223, "loss": 1.5221, "step": 710 }, { "epoch": 0.060157373720280904, "grad_norm": 20.625, "learning_rate": 0.0009916755231124438, "loss": 1.5474, "step": 711 }, { "epoch": 0.06024198324731365, "grad_norm": 37.5, "learning_rate": 0.0009916513036268707, "loss": 1.5596, "step": 712 }, { "epoch": 0.06032659277434639, "grad_norm": 9.0625, "learning_rate": 0.000991627049256622, "loss": 1.5698, "step": 713 }, { "epoch": 0.060411202301379134, "grad_norm": 5.5625, "learning_rate": 0.0009916027600034183, "loss": 1.4073, "step": 714 }, { "epoch": 0.06049581182841188, "grad_norm": 11.1875, "learning_rate": 0.0009915784358689834, "loss": 2.1335, "step": 715 }, { "epoch": 0.06058042135544462, "grad_norm": 6.34375, "learning_rate": 0.000991554076855043, "loss": 1.6378, "step": 716 }, { "epoch": 0.060665030882477364, "grad_norm": 2.484375, "learning_rate": 0.0009915296829633253, "loss": 1.3378, "step": 717 }, { "epoch": 0.060749640409510114, "grad_norm": 2.703125, "learning_rate": 0.0009915052541955612, "loss": 1.7154, "step": 718 }, { "epoch": 0.06083424993654286, "grad_norm": 2.3125, "learning_rate": 0.0009914807905534843, "loss": 1.574, "step": 719 }, { "epoch": 0.0609188594635756, "grad_norm": 5.125, "learning_rate": 0.0009914562920388303, "loss": 1.0806, "step": 720 }, { "epoch": 0.061003468990608344, "grad_norm": 2.5625, "learning_rate": 0.0009914317586533373, "loss": 1.4744, "step": 721 }, { "epoch": 0.06108807851764109, "grad_norm": 2.390625, "learning_rate": 0.000991407190398746, "loss": 1.752, "step": 722 }, { "epoch": 0.06117268804467383, "grad_norm": 4.25, "learning_rate": 0.0009913825872768, "loss": 1.405, "step": 723 }, { "epoch": 0.06125729757170657, "grad_norm": 3.828125, "learning_rate": 0.0009913579492892447, "loss": 1.5219, "step": 724 }, { "epoch": 0.061341907098739316, "grad_norm": 2.546875, "learning_rate": 0.0009913332764378282, "loss": 1.3847, "step": 725 }, { "epoch": 0.06142651662577206, "grad_norm": 3.046875, "learning_rate": 0.0009913085687243014, "loss": 2.0721, "step": 726 }, { "epoch": 0.0615111261528048, "grad_norm": 4.75, "learning_rate": 0.000991283826150417, "loss": 1.373, "step": 727 }, { "epoch": 0.06159573567983755, "grad_norm": 2.453125, "learning_rate": 0.0009912590487179311, "loss": 1.8838, "step": 728 }, { "epoch": 0.061680345206870296, "grad_norm": 2.125, "learning_rate": 0.0009912342364286014, "loss": 1.6944, "step": 729 }, { "epoch": 0.06176495473390304, "grad_norm": 2.453125, "learning_rate": 0.0009912093892841885, "loss": 1.4671, "step": 730 }, { "epoch": 0.06184956426093578, "grad_norm": 2.0625, "learning_rate": 0.0009911845072864557, "loss": 1.8892, "step": 731 }, { "epoch": 0.061934173787968526, "grad_norm": 2.453125, "learning_rate": 0.000991159590437168, "loss": 1.6079, "step": 732 }, { "epoch": 0.06201878331500127, "grad_norm": 1.7421875, "learning_rate": 0.0009911346387380935, "loss": 1.2397, "step": 733 }, { "epoch": 0.06210339284203401, "grad_norm": 1.9765625, "learning_rate": 0.000991109652191003, "loss": 1.5609, "step": 734 }, { "epoch": 0.062188002369066755, "grad_norm": 1.640625, "learning_rate": 0.000991084630797669, "loss": 1.1347, "step": 735 }, { "epoch": 0.0622726118960995, "grad_norm": 2.703125, "learning_rate": 0.000991059574559867, "loss": 1.2393, "step": 736 }, { "epoch": 0.06235722142313224, "grad_norm": 2.0, "learning_rate": 0.0009910344834793748, "loss": 1.5733, "step": 737 }, { "epoch": 0.062441830950164985, "grad_norm": 2.296875, "learning_rate": 0.0009910093575579725, "loss": 1.396, "step": 738 }, { "epoch": 0.06252644047719773, "grad_norm": 2.328125, "learning_rate": 0.0009909841967974435, "loss": 1.7831, "step": 739 }, { "epoch": 0.06261105000423048, "grad_norm": 1.921875, "learning_rate": 0.0009909590011995726, "loss": 1.3194, "step": 740 }, { "epoch": 0.06269565953126321, "grad_norm": 1.734375, "learning_rate": 0.0009909337707661474, "loss": 1.2303, "step": 741 }, { "epoch": 0.06278026905829596, "grad_norm": 3.96875, "learning_rate": 0.0009909085054989584, "loss": 2.2183, "step": 742 }, { "epoch": 0.0628648785853287, "grad_norm": 1.6171875, "learning_rate": 0.0009908832053997984, "loss": 1.1788, "step": 743 }, { "epoch": 0.06294948811236145, "grad_norm": 2.390625, "learning_rate": 0.0009908578704704621, "loss": 1.4907, "step": 744 }, { "epoch": 0.0630340976393942, "grad_norm": 2.171875, "learning_rate": 0.0009908325007127474, "loss": 1.3706, "step": 745 }, { "epoch": 0.06311870716642694, "grad_norm": 2.015625, "learning_rate": 0.0009908070961284546, "loss": 1.5826, "step": 746 }, { "epoch": 0.06320331669345969, "grad_norm": 2.390625, "learning_rate": 0.0009907816567193857, "loss": 1.4454, "step": 747 }, { "epoch": 0.06328792622049242, "grad_norm": 1.9375, "learning_rate": 0.0009907561824873462, "loss": 1.2485, "step": 748 }, { "epoch": 0.06337253574752517, "grad_norm": 1.6171875, "learning_rate": 0.0009907306734341433, "loss": 1.3329, "step": 749 }, { "epoch": 0.06345714527455791, "grad_norm": 41.0, "learning_rate": 0.0009907051295615873, "loss": 1.9753, "step": 750 }, { "epoch": 0.06354175480159066, "grad_norm": 1.875, "learning_rate": 0.00099067955087149, "loss": 1.1702, "step": 751 }, { "epoch": 0.0636263643286234, "grad_norm": 2.0625, "learning_rate": 0.0009906539373656673, "loss": 1.4072, "step": 752 }, { "epoch": 0.06371097385565615, "grad_norm": 1.828125, "learning_rate": 0.0009906282890459357, "loss": 1.3549, "step": 753 }, { "epoch": 0.0637955833826889, "grad_norm": 1.78125, "learning_rate": 0.0009906026059141155, "loss": 1.5511, "step": 754 }, { "epoch": 0.06388019290972163, "grad_norm": 2.828125, "learning_rate": 0.000990576887972029, "loss": 1.7139, "step": 755 }, { "epoch": 0.06396480243675438, "grad_norm": 2.0625, "learning_rate": 0.0009905511352215008, "loss": 1.4449, "step": 756 }, { "epoch": 0.06404941196378712, "grad_norm": 2.0, "learning_rate": 0.0009905253476643586, "loss": 1.7991, "step": 757 }, { "epoch": 0.06413402149081987, "grad_norm": 2.296875, "learning_rate": 0.0009904995253024313, "loss": 2.0944, "step": 758 }, { "epoch": 0.0642186310178526, "grad_norm": 2.65625, "learning_rate": 0.000990473668137552, "loss": 1.7127, "step": 759 }, { "epoch": 0.06430324054488536, "grad_norm": 1.9453125, "learning_rate": 0.000990447776171555, "loss": 1.3375, "step": 760 }, { "epoch": 0.06438785007191809, "grad_norm": 1.953125, "learning_rate": 0.000990421849406277, "loss": 1.7721, "step": 761 }, { "epoch": 0.06447245959895084, "grad_norm": 2.375, "learning_rate": 0.0009903958878435585, "loss": 1.3878, "step": 762 }, { "epoch": 0.06455706912598358, "grad_norm": 1.8046875, "learning_rate": 0.0009903698914852407, "loss": 1.4458, "step": 763 }, { "epoch": 0.06464167865301633, "grad_norm": 1.9765625, "learning_rate": 0.000990343860333169, "loss": 1.2819, "step": 764 }, { "epoch": 0.06472628818004908, "grad_norm": 2.75, "learning_rate": 0.0009903177943891897, "loss": 1.6695, "step": 765 }, { "epoch": 0.06481089770708182, "grad_norm": 2.953125, "learning_rate": 0.0009902916936551528, "loss": 1.7797, "step": 766 }, { "epoch": 0.06489550723411457, "grad_norm": 2.4375, "learning_rate": 0.0009902655581329098, "loss": 1.3855, "step": 767 }, { "epoch": 0.0649801167611473, "grad_norm": 2.0, "learning_rate": 0.0009902393878243154, "loss": 1.3078, "step": 768 }, { "epoch": 0.06506472628818005, "grad_norm": 3.25, "learning_rate": 0.0009902131827312264, "loss": 1.7763, "step": 769 }, { "epoch": 0.06514933581521279, "grad_norm": 2.359375, "learning_rate": 0.0009901869428555023, "loss": 1.8768, "step": 770 }, { "epoch": 0.06523394534224554, "grad_norm": 1.71875, "learning_rate": 0.0009901606681990047, "loss": 1.3035, "step": 771 }, { "epoch": 0.06531855486927828, "grad_norm": 1.9140625, "learning_rate": 0.0009901343587635982, "loss": 1.7163, "step": 772 }, { "epoch": 0.06540316439631103, "grad_norm": 1.6171875, "learning_rate": 0.000990108014551149, "loss": 1.1899, "step": 773 }, { "epoch": 0.06548777392334376, "grad_norm": 1.609375, "learning_rate": 0.0009900816355635272, "loss": 1.1712, "step": 774 }, { "epoch": 0.06557238345037651, "grad_norm": 2.078125, "learning_rate": 0.0009900552218026037, "loss": 1.4713, "step": 775 }, { "epoch": 0.06565699297740926, "grad_norm": 1.9765625, "learning_rate": 0.0009900287732702532, "loss": 1.2781, "step": 776 }, { "epoch": 0.065741602504442, "grad_norm": 3.1875, "learning_rate": 0.0009900022899683519, "loss": 1.8668, "step": 777 }, { "epoch": 0.06582621203147475, "grad_norm": 2.359375, "learning_rate": 0.000989975771898779, "loss": 1.7424, "step": 778 }, { "epoch": 0.06591082155850748, "grad_norm": 2.609375, "learning_rate": 0.0009899492190634164, "loss": 2.0886, "step": 779 }, { "epoch": 0.06599543108554023, "grad_norm": 2.3125, "learning_rate": 0.0009899226314641477, "loss": 1.5442, "step": 780 }, { "epoch": 0.06608004061257297, "grad_norm": 1.9921875, "learning_rate": 0.0009898960091028596, "loss": 1.5617, "step": 781 }, { "epoch": 0.06616465013960572, "grad_norm": 2.03125, "learning_rate": 0.0009898693519814414, "loss": 1.5265, "step": 782 }, { "epoch": 0.06624925966663846, "grad_norm": 1.8828125, "learning_rate": 0.0009898426601017839, "loss": 1.3403, "step": 783 }, { "epoch": 0.06633386919367121, "grad_norm": 1.5625, "learning_rate": 0.0009898159334657815, "loss": 1.2974, "step": 784 }, { "epoch": 0.06641847872070396, "grad_norm": 2.140625, "learning_rate": 0.00098978917207533, "loss": 1.6667, "step": 785 }, { "epoch": 0.0665030882477367, "grad_norm": 2.59375, "learning_rate": 0.0009897623759323288, "loss": 1.6101, "step": 786 }, { "epoch": 0.06658769777476944, "grad_norm": 2.765625, "learning_rate": 0.0009897355450386792, "loss": 1.5363, "step": 787 }, { "epoch": 0.06667230730180218, "grad_norm": 2.125, "learning_rate": 0.0009897086793962844, "loss": 1.3417, "step": 788 }, { "epoch": 0.06675691682883493, "grad_norm": 2.109375, "learning_rate": 0.0009896817790070512, "loss": 1.5175, "step": 789 }, { "epoch": 0.06684152635586767, "grad_norm": 2.5625, "learning_rate": 0.0009896548438728878, "loss": 2.0061, "step": 790 }, { "epoch": 0.06692613588290042, "grad_norm": 2.0625, "learning_rate": 0.000989627873995706, "loss": 1.3307, "step": 791 }, { "epoch": 0.06701074540993315, "grad_norm": 2.015625, "learning_rate": 0.0009896008693774188, "loss": 1.243, "step": 792 }, { "epoch": 0.0670953549369659, "grad_norm": 1.84375, "learning_rate": 0.0009895738300199424, "loss": 1.1859, "step": 793 }, { "epoch": 0.06717996446399864, "grad_norm": 2.234375, "learning_rate": 0.000989546755925196, "loss": 1.9148, "step": 794 }, { "epoch": 0.06726457399103139, "grad_norm": 3.4375, "learning_rate": 0.0009895196470950996, "loss": 1.9708, "step": 795 }, { "epoch": 0.06734918351806414, "grad_norm": 2.125, "learning_rate": 0.0009894925035315773, "loss": 1.4743, "step": 796 }, { "epoch": 0.06743379304509688, "grad_norm": 2.09375, "learning_rate": 0.000989465325236555, "loss": 1.3024, "step": 797 }, { "epoch": 0.06751840257212963, "grad_norm": 1.875, "learning_rate": 0.0009894381122119611, "loss": 1.3952, "step": 798 }, { "epoch": 0.06760301209916236, "grad_norm": 2.09375, "learning_rate": 0.0009894108644597265, "loss": 1.6025, "step": 799 }, { "epoch": 0.06768762162619511, "grad_norm": 1.8828125, "learning_rate": 0.0009893835819817843, "loss": 1.4616, "step": 800 }, { "epoch": 0.06777223115322785, "grad_norm": 2.109375, "learning_rate": 0.0009893562647800705, "loss": 1.4377, "step": 801 }, { "epoch": 0.0678568406802606, "grad_norm": 1.6953125, "learning_rate": 0.0009893289128565233, "loss": 1.1917, "step": 802 }, { "epoch": 0.06794145020729334, "grad_norm": 1.9765625, "learning_rate": 0.0009893015262130835, "loss": 1.6836, "step": 803 }, { "epoch": 0.06802605973432609, "grad_norm": 2.9375, "learning_rate": 0.0009892741048516942, "loss": 1.8366, "step": 804 }, { "epoch": 0.06811066926135882, "grad_norm": 1.9375, "learning_rate": 0.0009892466487743013, "loss": 1.2407, "step": 805 }, { "epoch": 0.06819527878839157, "grad_norm": 1.9921875, "learning_rate": 0.0009892191579828527, "loss": 1.3242, "step": 806 }, { "epoch": 0.06827988831542432, "grad_norm": 2.328125, "learning_rate": 0.000989191632479299, "loss": 1.3561, "step": 807 }, { "epoch": 0.06836449784245706, "grad_norm": 2.765625, "learning_rate": 0.000989164072265593, "loss": 1.7873, "step": 808 }, { "epoch": 0.06844910736948981, "grad_norm": 2.609375, "learning_rate": 0.0009891364773436909, "loss": 1.866, "step": 809 }, { "epoch": 0.06853371689652255, "grad_norm": 1.7734375, "learning_rate": 0.0009891088477155501, "loss": 1.301, "step": 810 }, { "epoch": 0.0686183264235553, "grad_norm": 2.203125, "learning_rate": 0.000989081183383131, "loss": 1.9391, "step": 811 }, { "epoch": 0.06870293595058803, "grad_norm": 1.984375, "learning_rate": 0.0009890534843483968, "loss": 1.538, "step": 812 }, { "epoch": 0.06878754547762078, "grad_norm": 2.703125, "learning_rate": 0.000989025750613313, "loss": 1.7845, "step": 813 }, { "epoch": 0.06887215500465352, "grad_norm": 2.375, "learning_rate": 0.000988997982179847, "loss": 1.6156, "step": 814 }, { "epoch": 0.06895676453168627, "grad_norm": 4.15625, "learning_rate": 0.0009889701790499691, "loss": 1.8843, "step": 815 }, { "epoch": 0.06904137405871902, "grad_norm": 2.515625, "learning_rate": 0.0009889423412256524, "loss": 2.4142, "step": 816 }, { "epoch": 0.06912598358575175, "grad_norm": 2.390625, "learning_rate": 0.0009889144687088719, "loss": 1.3258, "step": 817 }, { "epoch": 0.0692105931127845, "grad_norm": 2.09375, "learning_rate": 0.0009888865615016052, "loss": 1.7502, "step": 818 }, { "epoch": 0.06929520263981724, "grad_norm": 256.0, "learning_rate": 0.0009888586196058324, "loss": 2.3116, "step": 819 }, { "epoch": 0.06937981216684999, "grad_norm": 3200.0, "learning_rate": 0.0009888306430235363, "loss": 1.4503, "step": 820 }, { "epoch": 0.06946442169388273, "grad_norm": 1336.0, "learning_rate": 0.000988802631756702, "loss": 1.4526, "step": 821 }, { "epoch": 0.06954903122091548, "grad_norm": 239.0, "learning_rate": 0.0009887745858073167, "loss": 1.7473, "step": 822 }, { "epoch": 0.06963364074794821, "grad_norm": 8896.0, "learning_rate": 0.0009887465051773708, "loss": 1.654, "step": 823 }, { "epoch": 0.06971825027498096, "grad_norm": 260.0, "learning_rate": 0.0009887183898688561, "loss": 1.7425, "step": 824 }, { "epoch": 0.0698028598020137, "grad_norm": 82.5, "learning_rate": 0.000988690239883768, "loss": 1.2647, "step": 825 }, { "epoch": 0.06988746932904645, "grad_norm": 7.21875, "learning_rate": 0.000988662055224104, "loss": 1.8347, "step": 826 }, { "epoch": 0.0699720788560792, "grad_norm": 3.765625, "learning_rate": 0.0009886338358918634, "loss": 1.7102, "step": 827 }, { "epoch": 0.07005668838311194, "grad_norm": 1.96875, "learning_rate": 0.0009886055818890487, "loss": 1.2199, "step": 828 }, { "epoch": 0.07014129791014469, "grad_norm": 2.125, "learning_rate": 0.0009885772932176646, "loss": 1.6728, "step": 829 }, { "epoch": 0.07022590743717742, "grad_norm": 2.96875, "learning_rate": 0.0009885489698797185, "loss": 1.6751, "step": 830 }, { "epoch": 0.07031051696421017, "grad_norm": 2.96875, "learning_rate": 0.00098852061187722, "loss": 2.0866, "step": 831 }, { "epoch": 0.07039512649124291, "grad_norm": 2.140625, "learning_rate": 0.000988492219212181, "loss": 1.8536, "step": 832 }, { "epoch": 0.07047973601827566, "grad_norm": 2.3125, "learning_rate": 0.0009884637918866164, "loss": 1.3587, "step": 833 }, { "epoch": 0.0705643455453084, "grad_norm": 2.109375, "learning_rate": 0.0009884353299025427, "loss": 1.8319, "step": 834 }, { "epoch": 0.07064895507234115, "grad_norm": 2.53125, "learning_rate": 0.00098840683326198, "loss": 1.502, "step": 835 }, { "epoch": 0.07073356459937388, "grad_norm": 2.078125, "learning_rate": 0.00098837830196695, "loss": 1.3094, "step": 836 }, { "epoch": 0.07081817412640663, "grad_norm": 2.296875, "learning_rate": 0.000988349736019477, "loss": 1.7868, "step": 837 }, { "epoch": 0.07090278365343938, "grad_norm": 1.984375, "learning_rate": 0.0009883211354215881, "loss": 1.7548, "step": 838 }, { "epoch": 0.07098739318047212, "grad_norm": 19.5, "learning_rate": 0.0009882925001753124, "loss": 1.4973, "step": 839 }, { "epoch": 0.07107200270750487, "grad_norm": 2.28125, "learning_rate": 0.0009882638302826819, "loss": 1.4404, "step": 840 }, { "epoch": 0.0711566122345376, "grad_norm": 2.25, "learning_rate": 0.0009882351257457305, "loss": 1.2324, "step": 841 }, { "epoch": 0.07124122176157036, "grad_norm": 2.765625, "learning_rate": 0.0009882063865664953, "loss": 1.2775, "step": 842 }, { "epoch": 0.07132583128860309, "grad_norm": 1.9921875, "learning_rate": 0.0009881776127470155, "loss": 1.2671, "step": 843 }, { "epoch": 0.07141044081563584, "grad_norm": 2.578125, "learning_rate": 0.0009881488042893323, "loss": 1.5563, "step": 844 }, { "epoch": 0.07149505034266858, "grad_norm": 3.703125, "learning_rate": 0.0009881199611954901, "loss": 1.2645, "step": 845 }, { "epoch": 0.07157965986970133, "grad_norm": 2.484375, "learning_rate": 0.0009880910834675352, "loss": 1.1689, "step": 846 }, { "epoch": 0.07166426939673408, "grad_norm": 7.90625, "learning_rate": 0.0009880621711075169, "loss": 2.4346, "step": 847 }, { "epoch": 0.07174887892376682, "grad_norm": 2.15625, "learning_rate": 0.0009880332241174864, "loss": 1.6908, "step": 848 }, { "epoch": 0.07183348845079957, "grad_norm": 15.9375, "learning_rate": 0.0009880042424994977, "loss": 1.3413, "step": 849 }, { "epoch": 0.0719180979778323, "grad_norm": 2.375, "learning_rate": 0.0009879752262556072, "loss": 1.2217, "step": 850 }, { "epoch": 0.07200270750486505, "grad_norm": 16.375, "learning_rate": 0.000987946175387874, "loss": 1.9302, "step": 851 }, { "epoch": 0.07208731703189779, "grad_norm": 1.96875, "learning_rate": 0.0009879170898983586, "loss": 1.5108, "step": 852 }, { "epoch": 0.07217192655893054, "grad_norm": 2.625, "learning_rate": 0.000987887969789125, "loss": 1.8845, "step": 853 }, { "epoch": 0.07225653608596327, "grad_norm": 2.0, "learning_rate": 0.00098785881506224, "loss": 1.4357, "step": 854 }, { "epoch": 0.07234114561299602, "grad_norm": 2.703125, "learning_rate": 0.000987829625719772, "loss": 1.9615, "step": 855 }, { "epoch": 0.07242575514002876, "grad_norm": 3.296875, "learning_rate": 0.0009878004017637916, "loss": 1.3379, "step": 856 }, { "epoch": 0.07251036466706151, "grad_norm": 3.0625, "learning_rate": 0.0009877711431963726, "loss": 1.2542, "step": 857 }, { "epoch": 0.07259497419409426, "grad_norm": 3.546875, "learning_rate": 0.0009877418500195915, "loss": 1.6995, "step": 858 }, { "epoch": 0.072679583721127, "grad_norm": 2.34375, "learning_rate": 0.000987712522235526, "loss": 1.3208, "step": 859 }, { "epoch": 0.07276419324815975, "grad_norm": 2.171875, "learning_rate": 0.0009876831598462576, "loss": 1.4122, "step": 860 }, { "epoch": 0.07284880277519248, "grad_norm": 2.484375, "learning_rate": 0.0009876537628538695, "loss": 1.3085, "step": 861 }, { "epoch": 0.07293341230222523, "grad_norm": 3.5, "learning_rate": 0.0009876243312604476, "loss": 1.6158, "step": 862 }, { "epoch": 0.07301802182925797, "grad_norm": 2.71875, "learning_rate": 0.00098759486506808, "loss": 2.0759, "step": 863 }, { "epoch": 0.07310263135629072, "grad_norm": 1.65625, "learning_rate": 0.0009875653642788574, "loss": 1.1766, "step": 864 }, { "epoch": 0.07318724088332346, "grad_norm": 2.421875, "learning_rate": 0.0009875358288948734, "loss": 1.644, "step": 865 }, { "epoch": 0.0732718504103562, "grad_norm": 2.109375, "learning_rate": 0.0009875062589182234, "loss": 1.6867, "step": 866 }, { "epoch": 0.07335645993738894, "grad_norm": 2.078125, "learning_rate": 0.0009874766543510056, "loss": 1.5969, "step": 867 }, { "epoch": 0.0734410694644217, "grad_norm": 2.359375, "learning_rate": 0.0009874470151953203, "loss": 1.4714, "step": 868 }, { "epoch": 0.07352567899145444, "grad_norm": 2.578125, "learning_rate": 0.0009874173414532708, "loss": 1.4911, "step": 869 }, { "epoch": 0.07361028851848718, "grad_norm": 2.359375, "learning_rate": 0.0009873876331269627, "loss": 1.256, "step": 870 }, { "epoch": 0.07369489804551993, "grad_norm": 2.0, "learning_rate": 0.0009873578902185034, "loss": 1.3236, "step": 871 }, { "epoch": 0.07377950757255267, "grad_norm": 2.546875, "learning_rate": 0.0009873281127300037, "loss": 1.5322, "step": 872 }, { "epoch": 0.07386411709958542, "grad_norm": 2.78125, "learning_rate": 0.0009872983006635765, "loss": 1.3789, "step": 873 }, { "epoch": 0.07394872662661815, "grad_norm": 2.0, "learning_rate": 0.0009872684540213367, "loss": 1.5366, "step": 874 }, { "epoch": 0.0740333361536509, "grad_norm": 2.90625, "learning_rate": 0.0009872385728054026, "loss": 1.3476, "step": 875 }, { "epoch": 0.07411794568068364, "grad_norm": 1.7265625, "learning_rate": 0.0009872086570178937, "loss": 1.3212, "step": 876 }, { "epoch": 0.07420255520771639, "grad_norm": 2.0, "learning_rate": 0.0009871787066609333, "loss": 1.5854, "step": 877 }, { "epoch": 0.07428716473474914, "grad_norm": 1.8671875, "learning_rate": 0.0009871487217366461, "loss": 1.9873, "step": 878 }, { "epoch": 0.07437177426178188, "grad_norm": 2.140625, "learning_rate": 0.0009871187022471597, "loss": 1.6567, "step": 879 }, { "epoch": 0.07445638378881463, "grad_norm": 2.171875, "learning_rate": 0.0009870886481946042, "loss": 2.2102, "step": 880 }, { "epoch": 0.07454099331584736, "grad_norm": 2.078125, "learning_rate": 0.0009870585595811122, "loss": 1.6831, "step": 881 }, { "epoch": 0.07462560284288011, "grad_norm": 1.84375, "learning_rate": 0.0009870284364088182, "loss": 1.3647, "step": 882 }, { "epoch": 0.07471021236991285, "grad_norm": 1.921875, "learning_rate": 0.00098699827867986, "loss": 1.6757, "step": 883 }, { "epoch": 0.0747948218969456, "grad_norm": 2.25, "learning_rate": 0.000986968086396377, "loss": 1.7809, "step": 884 }, { "epoch": 0.07487943142397833, "grad_norm": 88.0, "learning_rate": 0.0009869378595605118, "loss": 1.5137, "step": 885 }, { "epoch": 0.07496404095101109, "grad_norm": 2.046875, "learning_rate": 0.0009869075981744089, "loss": 1.2163, "step": 886 }, { "epoch": 0.07504865047804382, "grad_norm": 2.15625, "learning_rate": 0.0009868773022402159, "loss": 2.0105, "step": 887 }, { "epoch": 0.07513326000507657, "grad_norm": 1.9609375, "learning_rate": 0.0009868469717600817, "loss": 1.2363, "step": 888 }, { "epoch": 0.07521786953210932, "grad_norm": 2.453125, "learning_rate": 0.0009868166067361589, "loss": 1.373, "step": 889 }, { "epoch": 0.07530247905914206, "grad_norm": 2.765625, "learning_rate": 0.000986786207170602, "loss": 1.7164, "step": 890 }, { "epoch": 0.07538708858617481, "grad_norm": 2.1875, "learning_rate": 0.0009867557730655677, "loss": 1.5228, "step": 891 }, { "epoch": 0.07547169811320754, "grad_norm": 2.5625, "learning_rate": 0.0009867253044232155, "loss": 1.3065, "step": 892 }, { "epoch": 0.0755563076402403, "grad_norm": 2.28125, "learning_rate": 0.0009866948012457078, "loss": 1.2471, "step": 893 }, { "epoch": 0.07564091716727303, "grad_norm": 2.078125, "learning_rate": 0.0009866642635352081, "loss": 1.4287, "step": 894 }, { "epoch": 0.07572552669430578, "grad_norm": 2.40625, "learning_rate": 0.0009866336912938837, "loss": 1.2643, "step": 895 }, { "epoch": 0.07581013622133852, "grad_norm": 2.21875, "learning_rate": 0.0009866030845239037, "loss": 1.5419, "step": 896 }, { "epoch": 0.07589474574837127, "grad_norm": 3.171875, "learning_rate": 0.0009865724432274396, "loss": 2.086, "step": 897 }, { "epoch": 0.075979355275404, "grad_norm": 2.15625, "learning_rate": 0.0009865417674066658, "loss": 1.5489, "step": 898 }, { "epoch": 0.07606396480243675, "grad_norm": 2.796875, "learning_rate": 0.0009865110570637587, "loss": 1.5994, "step": 899 }, { "epoch": 0.0761485743294695, "grad_norm": 2.546875, "learning_rate": 0.0009864803122008976, "loss": 1.5424, "step": 900 }, { "epoch": 0.07623318385650224, "grad_norm": 5.0, "learning_rate": 0.0009864495328202635, "loss": 1.8551, "step": 901 }, { "epoch": 0.07631779338353499, "grad_norm": 1.8515625, "learning_rate": 0.0009864187189240407, "loss": 1.3119, "step": 902 }, { "epoch": 0.07640240291056773, "grad_norm": 2.03125, "learning_rate": 0.0009863878705144156, "loss": 1.3557, "step": 903 }, { "epoch": 0.07648701243760048, "grad_norm": 2.359375, "learning_rate": 0.0009863569875935767, "loss": 1.1767, "step": 904 }, { "epoch": 0.07657162196463321, "grad_norm": 2.09375, "learning_rate": 0.0009863260701637154, "loss": 1.6619, "step": 905 }, { "epoch": 0.07665623149166596, "grad_norm": 2.078125, "learning_rate": 0.0009862951182270257, "loss": 1.4369, "step": 906 }, { "epoch": 0.0767408410186987, "grad_norm": 1.859375, "learning_rate": 0.0009862641317857034, "loss": 1.2868, "step": 907 }, { "epoch": 0.07682545054573145, "grad_norm": 2.6875, "learning_rate": 0.0009862331108419472, "loss": 1.4289, "step": 908 }, { "epoch": 0.0769100600727642, "grad_norm": 3.734375, "learning_rate": 0.0009862020553979582, "loss": 1.4375, "step": 909 }, { "epoch": 0.07699466959979694, "grad_norm": 3.484375, "learning_rate": 0.00098617096545594, "loss": 1.2853, "step": 910 }, { "epoch": 0.07707927912682969, "grad_norm": 2.546875, "learning_rate": 0.0009861398410180983, "loss": 1.5263, "step": 911 }, { "epoch": 0.07716388865386242, "grad_norm": 4.4375, "learning_rate": 0.000986108682086642, "loss": 1.3333, "step": 912 }, { "epoch": 0.07724849818089517, "grad_norm": 2.75, "learning_rate": 0.0009860774886637814, "loss": 1.8517, "step": 913 }, { "epoch": 0.07733310770792791, "grad_norm": 2.34375, "learning_rate": 0.0009860462607517303, "loss": 1.933, "step": 914 }, { "epoch": 0.07741771723496066, "grad_norm": 3.15625, "learning_rate": 0.000986014998352704, "loss": 1.8137, "step": 915 }, { "epoch": 0.0775023267619934, "grad_norm": 1.7421875, "learning_rate": 0.0009859837014689209, "loss": 1.5091, "step": 916 }, { "epoch": 0.07758693628902615, "grad_norm": 2.0625, "learning_rate": 0.0009859523701026016, "loss": 1.5885, "step": 917 }, { "epoch": 0.07767154581605888, "grad_norm": 3.484375, "learning_rate": 0.0009859210042559693, "loss": 1.7174, "step": 918 }, { "epoch": 0.07775615534309163, "grad_norm": 1.9296875, "learning_rate": 0.0009858896039312494, "loss": 1.4618, "step": 919 }, { "epoch": 0.07784076487012438, "grad_norm": 2.125, "learning_rate": 0.00098585816913067, "loss": 1.5152, "step": 920 }, { "epoch": 0.07792537439715712, "grad_norm": 2.796875, "learning_rate": 0.0009858266998564614, "loss": 1.9132, "step": 921 }, { "epoch": 0.07800998392418987, "grad_norm": 2.28125, "learning_rate": 0.0009857951961108565, "loss": 1.9144, "step": 922 }, { "epoch": 0.0780945934512226, "grad_norm": 2.328125, "learning_rate": 0.0009857636578960909, "loss": 1.5054, "step": 923 }, { "epoch": 0.07817920297825535, "grad_norm": 2.453125, "learning_rate": 0.0009857320852144017, "loss": 1.6493, "step": 924 }, { "epoch": 0.07826381250528809, "grad_norm": 2.171875, "learning_rate": 0.0009857004780680298, "loss": 1.305, "step": 925 }, { "epoch": 0.07834842203232084, "grad_norm": 1.9375, "learning_rate": 0.0009856688364592177, "loss": 1.5856, "step": 926 }, { "epoch": 0.07843303155935358, "grad_norm": 2.1875, "learning_rate": 0.0009856371603902104, "loss": 1.8333, "step": 927 }, { "epoch": 0.07851764108638633, "grad_norm": 2.0, "learning_rate": 0.0009856054498632552, "loss": 1.3659, "step": 928 }, { "epoch": 0.07860225061341906, "grad_norm": 1.7109375, "learning_rate": 0.0009855737048806025, "loss": 1.6661, "step": 929 }, { "epoch": 0.07868686014045181, "grad_norm": 1.9375, "learning_rate": 0.0009855419254445044, "loss": 1.576, "step": 930 }, { "epoch": 0.07877146966748456, "grad_norm": 1.75, "learning_rate": 0.0009855101115572161, "loss": 1.5323, "step": 931 }, { "epoch": 0.0788560791945173, "grad_norm": 1.53125, "learning_rate": 0.0009854782632209946, "loss": 1.1929, "step": 932 }, { "epoch": 0.07894068872155005, "grad_norm": 2.5, "learning_rate": 0.0009854463804381001, "loss": 1.8038, "step": 933 }, { "epoch": 0.07902529824858279, "grad_norm": 1.7421875, "learning_rate": 0.0009854144632107944, "loss": 1.2447, "step": 934 }, { "epoch": 0.07910990777561554, "grad_norm": 6.90625, "learning_rate": 0.0009853825115413424, "loss": 1.8586, "step": 935 }, { "epoch": 0.07919451730264827, "grad_norm": 1.671875, "learning_rate": 0.0009853505254320111, "loss": 1.1978, "step": 936 }, { "epoch": 0.07927912682968102, "grad_norm": 2.21875, "learning_rate": 0.00098531850488507, "loss": 1.8378, "step": 937 }, { "epoch": 0.07936373635671376, "grad_norm": 2.046875, "learning_rate": 0.0009852864499027912, "loss": 1.6816, "step": 938 }, { "epoch": 0.07944834588374651, "grad_norm": 2.234375, "learning_rate": 0.000985254360487449, "loss": 2.4547, "step": 939 }, { "epoch": 0.07953295541077926, "grad_norm": 1.5859375, "learning_rate": 0.0009852222366413207, "loss": 1.337, "step": 940 }, { "epoch": 0.079617564937812, "grad_norm": 1.8203125, "learning_rate": 0.000985190078366685, "loss": 1.3058, "step": 941 }, { "epoch": 0.07970217446484475, "grad_norm": 1.703125, "learning_rate": 0.0009851578856658238, "loss": 1.9367, "step": 942 }, { "epoch": 0.07978678399187748, "grad_norm": 1.8828125, "learning_rate": 0.0009851256585410217, "loss": 1.9327, "step": 943 }, { "epoch": 0.07987139351891023, "grad_norm": 1.78125, "learning_rate": 0.000985093396994565, "loss": 1.3476, "step": 944 }, { "epoch": 0.07995600304594297, "grad_norm": 1.9921875, "learning_rate": 0.0009850611010287428, "loss": 1.6144, "step": 945 }, { "epoch": 0.08004061257297572, "grad_norm": 2.5, "learning_rate": 0.000985028770645847, "loss": 1.2468, "step": 946 }, { "epoch": 0.08012522210000846, "grad_norm": 1.875, "learning_rate": 0.0009849964058481712, "loss": 1.4379, "step": 947 }, { "epoch": 0.0802098316270412, "grad_norm": 1.5546875, "learning_rate": 0.0009849640066380116, "loss": 1.1792, "step": 948 }, { "epoch": 0.08029444115407394, "grad_norm": 1.8359375, "learning_rate": 0.0009849315730176675, "loss": 1.2601, "step": 949 }, { "epoch": 0.08037905068110669, "grad_norm": 2.265625, "learning_rate": 0.0009848991049894403, "loss": 1.5897, "step": 950 }, { "epoch": 0.08046366020813944, "grad_norm": 1.6875, "learning_rate": 0.0009848666025556333, "loss": 1.3158, "step": 951 }, { "epoch": 0.08054826973517218, "grad_norm": 1.75, "learning_rate": 0.0009848340657185527, "loss": 1.5459, "step": 952 }, { "epoch": 0.08063287926220493, "grad_norm": 1.671875, "learning_rate": 0.0009848014944805077, "loss": 1.8678, "step": 953 }, { "epoch": 0.08071748878923767, "grad_norm": 1.96875, "learning_rate": 0.0009847688888438087, "loss": 1.9256, "step": 954 }, { "epoch": 0.08080209831627042, "grad_norm": 1.9765625, "learning_rate": 0.0009847362488107697, "loss": 1.6365, "step": 955 }, { "epoch": 0.08088670784330315, "grad_norm": 2.5, "learning_rate": 0.0009847035743837061, "loss": 2.1988, "step": 956 }, { "epoch": 0.0809713173703359, "grad_norm": 2.328125, "learning_rate": 0.0009846708655649368, "loss": 1.7661, "step": 957 }, { "epoch": 0.08105592689736864, "grad_norm": 1.9765625, "learning_rate": 0.0009846381223567824, "loss": 1.6287, "step": 958 }, { "epoch": 0.08114053642440139, "grad_norm": 1.7734375, "learning_rate": 0.000984605344761566, "loss": 1.3312, "step": 959 }, { "epoch": 0.08122514595143414, "grad_norm": 2.03125, "learning_rate": 0.0009845725327816137, "loss": 1.4061, "step": 960 }, { "epoch": 0.08130975547846687, "grad_norm": 3.625, "learning_rate": 0.0009845396864192532, "loss": 1.923, "step": 961 }, { "epoch": 0.08139436500549962, "grad_norm": 2.578125, "learning_rate": 0.0009845068056768156, "loss": 2.2527, "step": 962 }, { "epoch": 0.08147897453253236, "grad_norm": 2.296875, "learning_rate": 0.0009844738905566335, "loss": 2.0868, "step": 963 }, { "epoch": 0.08156358405956511, "grad_norm": 2.140625, "learning_rate": 0.0009844409410610427, "loss": 1.3897, "step": 964 }, { "epoch": 0.08164819358659785, "grad_norm": 1.96875, "learning_rate": 0.0009844079571923808, "loss": 1.3911, "step": 965 }, { "epoch": 0.0817328031136306, "grad_norm": 2.4375, "learning_rate": 0.0009843749389529882, "loss": 1.8, "step": 966 }, { "epoch": 0.08181741264066333, "grad_norm": 2.15625, "learning_rate": 0.0009843418863452076, "loss": 1.6755, "step": 967 }, { "epoch": 0.08190202216769608, "grad_norm": 2.484375, "learning_rate": 0.0009843087993713843, "loss": 1.3793, "step": 968 }, { "epoch": 0.08198663169472882, "grad_norm": 2.671875, "learning_rate": 0.0009842756780338662, "loss": 2.2142, "step": 969 }, { "epoch": 0.08207124122176157, "grad_norm": 3.078125, "learning_rate": 0.0009842425223350031, "loss": 1.5566, "step": 970 }, { "epoch": 0.08215585074879432, "grad_norm": 1.5078125, "learning_rate": 0.0009842093322771478, "loss": 1.2594, "step": 971 }, { "epoch": 0.08224046027582706, "grad_norm": 1.8828125, "learning_rate": 0.000984176107862655, "loss": 1.4557, "step": 972 }, { "epoch": 0.08232506980285981, "grad_norm": 1.6953125, "learning_rate": 0.0009841428490938821, "loss": 1.5143, "step": 973 }, { "epoch": 0.08240967932989254, "grad_norm": 1.96875, "learning_rate": 0.000984109555973189, "loss": 1.453, "step": 974 }, { "epoch": 0.0824942888569253, "grad_norm": 2.140625, "learning_rate": 0.0009840762285029385, "loss": 1.7673, "step": 975 }, { "epoch": 0.08257889838395803, "grad_norm": 1.671875, "learning_rate": 0.0009840428666854945, "loss": 1.4655, "step": 976 }, { "epoch": 0.08266350791099078, "grad_norm": 1.859375, "learning_rate": 0.0009840094705232244, "loss": 1.4066, "step": 977 }, { "epoch": 0.08274811743802352, "grad_norm": 1.671875, "learning_rate": 0.0009839760400184982, "loss": 1.7262, "step": 978 }, { "epoch": 0.08283272696505627, "grad_norm": 2.09375, "learning_rate": 0.0009839425751736876, "loss": 1.7313, "step": 979 }, { "epoch": 0.082917336492089, "grad_norm": 1.640625, "learning_rate": 0.000983909075991167, "loss": 1.2565, "step": 980 }, { "epoch": 0.08300194601912175, "grad_norm": 2.296875, "learning_rate": 0.0009838755424733134, "loss": 1.6774, "step": 981 }, { "epoch": 0.0830865555461545, "grad_norm": 2.765625, "learning_rate": 0.0009838419746225062, "loss": 2.0531, "step": 982 }, { "epoch": 0.08317116507318724, "grad_norm": 1.421875, "learning_rate": 0.000983808372441127, "loss": 1.2864, "step": 983 }, { "epoch": 0.08325577460021999, "grad_norm": 1.5390625, "learning_rate": 0.0009837747359315602, "loss": 1.2051, "step": 984 }, { "epoch": 0.08334038412725273, "grad_norm": 1.6953125, "learning_rate": 0.0009837410650961928, "loss": 1.4152, "step": 985 }, { "epoch": 0.08342499365428548, "grad_norm": 2.296875, "learning_rate": 0.0009837073599374129, "loss": 1.3432, "step": 986 }, { "epoch": 0.08350960318131821, "grad_norm": 1.59375, "learning_rate": 0.0009836736204576128, "loss": 1.134, "step": 987 }, { "epoch": 0.08359421270835096, "grad_norm": 1.7578125, "learning_rate": 0.0009836398466591862, "loss": 1.2807, "step": 988 }, { "epoch": 0.0836788222353837, "grad_norm": 1.7734375, "learning_rate": 0.0009836060385445297, "loss": 1.7219, "step": 989 }, { "epoch": 0.08376343176241645, "grad_norm": 2.078125, "learning_rate": 0.000983572196116042, "loss": 1.4981, "step": 990 }, { "epoch": 0.0838480412894492, "grad_norm": 1.7890625, "learning_rate": 0.000983538319376124, "loss": 1.4966, "step": 991 }, { "epoch": 0.08393265081648194, "grad_norm": 1.7265625, "learning_rate": 0.00098350440832718, "loss": 1.1321, "step": 992 }, { "epoch": 0.08401726034351469, "grad_norm": 2.703125, "learning_rate": 0.0009834704629716159, "loss": 1.5848, "step": 993 }, { "epoch": 0.08410186987054742, "grad_norm": 1.90625, "learning_rate": 0.0009834364833118398, "loss": 1.4115, "step": 994 }, { "epoch": 0.08418647939758017, "grad_norm": 2.140625, "learning_rate": 0.0009834024693502635, "loss": 1.1149, "step": 995 }, { "epoch": 0.08427108892461291, "grad_norm": 1.9453125, "learning_rate": 0.0009833684210893, "loss": 1.2468, "step": 996 }, { "epoch": 0.08435569845164566, "grad_norm": 2.65625, "learning_rate": 0.000983334338531365, "loss": 1.7233, "step": 997 }, { "epoch": 0.0844403079786784, "grad_norm": 2.171875, "learning_rate": 0.0009833002216788772, "loss": 1.279, "step": 998 }, { "epoch": 0.08452491750571114, "grad_norm": 2.703125, "learning_rate": 0.0009832660705342568, "loss": 1.492, "step": 999 }, { "epoch": 0.08460952703274388, "grad_norm": 2.671875, "learning_rate": 0.0009832318850999276, "loss": 1.5952, "step": 1000 }, { "epoch": 0.08469413655977663, "grad_norm": 2.8125, "learning_rate": 0.0009831976653783148, "loss": 2.1617, "step": 1001 }, { "epoch": 0.08477874608680938, "grad_norm": 2.21875, "learning_rate": 0.0009831634113718465, "loss": 1.8623, "step": 1002 }, { "epoch": 0.08486335561384212, "grad_norm": 2.09375, "learning_rate": 0.0009831291230829532, "loss": 1.2179, "step": 1003 }, { "epoch": 0.08494796514087487, "grad_norm": 2.953125, "learning_rate": 0.0009830948005140677, "loss": 2.7212, "step": 1004 }, { "epoch": 0.0850325746679076, "grad_norm": 2.09375, "learning_rate": 0.0009830604436676254, "loss": 1.3704, "step": 1005 }, { "epoch": 0.08511718419494035, "grad_norm": 1.9921875, "learning_rate": 0.0009830260525460642, "loss": 1.4233, "step": 1006 }, { "epoch": 0.08520179372197309, "grad_norm": 1.7265625, "learning_rate": 0.000982991627151824, "loss": 1.2145, "step": 1007 }, { "epoch": 0.08528640324900584, "grad_norm": 2.21875, "learning_rate": 0.0009829571674873476, "loss": 1.7418, "step": 1008 }, { "epoch": 0.08537101277603858, "grad_norm": 1.6953125, "learning_rate": 0.00098292267355508, "loss": 1.3812, "step": 1009 }, { "epoch": 0.08545562230307133, "grad_norm": 1.9609375, "learning_rate": 0.0009828881453574688, "loss": 1.4297, "step": 1010 }, { "epoch": 0.08554023183010406, "grad_norm": 1.9609375, "learning_rate": 0.0009828535828969639, "loss": 1.407, "step": 1011 }, { "epoch": 0.08562484135713681, "grad_norm": 2.515625, "learning_rate": 0.0009828189861760175, "loss": 2.0553, "step": 1012 }, { "epoch": 0.08570945088416956, "grad_norm": 2.078125, "learning_rate": 0.0009827843551970844, "loss": 1.5334, "step": 1013 }, { "epoch": 0.0857940604112023, "grad_norm": 2.546875, "learning_rate": 0.0009827496899626218, "loss": 1.6022, "step": 1014 }, { "epoch": 0.08587866993823505, "grad_norm": 37.25, "learning_rate": 0.0009827149904750895, "loss": 1.5598, "step": 1015 }, { "epoch": 0.08596327946526779, "grad_norm": 12416.0, "learning_rate": 0.0009826802567369495, "loss": 2.3789, "step": 1016 }, { "epoch": 0.08604788899230054, "grad_norm": 24832.0, "learning_rate": 0.0009826454887506661, "loss": 1.453, "step": 1017 }, { "epoch": 0.08613249851933327, "grad_norm": 1520.0, "learning_rate": 0.0009826106865187068, "loss": 2.0715, "step": 1018 }, { "epoch": 0.08621710804636602, "grad_norm": 268288.0, "learning_rate": 0.0009825758500435402, "loss": 1.5979, "step": 1019 }, { "epoch": 0.08630171757339876, "grad_norm": 724.0, "learning_rate": 0.0009825409793276386, "loss": 2.3891, "step": 1020 }, { "epoch": 0.08638632710043151, "grad_norm": 1032.0, "learning_rate": 0.0009825060743734759, "loss": 1.74, "step": 1021 }, { "epoch": 0.08647093662746426, "grad_norm": 532.0, "learning_rate": 0.0009824711351835293, "loss": 1.8621, "step": 1022 }, { "epoch": 0.086555546154497, "grad_norm": 41472.0, "learning_rate": 0.0009824361617602772, "loss": 2.0616, "step": 1023 }, { "epoch": 0.08664015568152975, "grad_norm": 5632.0, "learning_rate": 0.0009824011541062015, "loss": 2.4814, "step": 1024 }, { "epoch": 0.08672476520856248, "grad_norm": 1728.0, "learning_rate": 0.0009823661122237862, "loss": 2.1939, "step": 1025 }, { "epoch": 0.08680937473559523, "grad_norm": 230.0, "learning_rate": 0.0009823310361155174, "loss": 2.3367, "step": 1026 }, { "epoch": 0.08689398426262797, "grad_norm": 4672.0, "learning_rate": 0.000982295925783884, "loss": 1.7879, "step": 1027 }, { "epoch": 0.08697859378966072, "grad_norm": 258.0, "learning_rate": 0.000982260781231377, "loss": 2.3785, "step": 1028 }, { "epoch": 0.08706320331669345, "grad_norm": 6080.0, "learning_rate": 0.0009822256024604908, "loss": 1.9995, "step": 1029 }, { "epoch": 0.0871478128437262, "grad_norm": 209.0, "learning_rate": 0.0009821903894737206, "loss": 2.6536, "step": 1030 }, { "epoch": 0.08723242237075894, "grad_norm": 416.0, "learning_rate": 0.0009821551422735653, "loss": 2.2265, "step": 1031 }, { "epoch": 0.08731703189779169, "grad_norm": 354.0, "learning_rate": 0.0009821198608625258, "loss": 2.0657, "step": 1032 }, { "epoch": 0.08740164142482444, "grad_norm": 6.875, "learning_rate": 0.0009820845452431057, "loss": 2.2475, "step": 1033 }, { "epoch": 0.08748625095185718, "grad_norm": 27.875, "learning_rate": 0.0009820491954178104, "loss": 2.2301, "step": 1034 }, { "epoch": 0.08757086047888993, "grad_norm": 28.0, "learning_rate": 0.000982013811389148, "loss": 1.9133, "step": 1035 }, { "epoch": 0.08765547000592266, "grad_norm": 3.8125, "learning_rate": 0.00098197839315963, "loss": 1.8696, "step": 1036 }, { "epoch": 0.08774007953295541, "grad_norm": 3.703125, "learning_rate": 0.0009819429407317683, "loss": 2.1083, "step": 1037 }, { "epoch": 0.08782468905998815, "grad_norm": 2.90625, "learning_rate": 0.0009819074541080792, "loss": 1.8446, "step": 1038 }, { "epoch": 0.0879092985870209, "grad_norm": 4.09375, "learning_rate": 0.0009818719332910805, "loss": 1.4616, "step": 1039 }, { "epoch": 0.08799390811405364, "grad_norm": 3.765625, "learning_rate": 0.0009818363782832923, "loss": 1.2599, "step": 1040 }, { "epoch": 0.08807851764108639, "grad_norm": 1.96875, "learning_rate": 0.0009818007890872375, "loss": 1.5326, "step": 1041 }, { "epoch": 0.08816312716811912, "grad_norm": 2.53125, "learning_rate": 0.0009817651657054415, "loss": 2.1217, "step": 1042 }, { "epoch": 0.08824773669515187, "grad_norm": 2.046875, "learning_rate": 0.0009817295081404316, "loss": 1.3865, "step": 1043 }, { "epoch": 0.08833234622218462, "grad_norm": 1.6640625, "learning_rate": 0.000981693816394738, "loss": 1.0872, "step": 1044 }, { "epoch": 0.08841695574921736, "grad_norm": 1.8515625, "learning_rate": 0.0009816580904708932, "loss": 1.2501, "step": 1045 }, { "epoch": 0.08850156527625011, "grad_norm": 2.25, "learning_rate": 0.000981622330371432, "loss": 1.747, "step": 1046 }, { "epoch": 0.08858617480328285, "grad_norm": 2.375, "learning_rate": 0.0009815865360988918, "loss": 1.8884, "step": 1047 }, { "epoch": 0.0886707843303156, "grad_norm": 1.8203125, "learning_rate": 0.0009815507076558123, "loss": 1.4927, "step": 1048 }, { "epoch": 0.08875539385734833, "grad_norm": 1.8515625, "learning_rate": 0.0009815148450447357, "loss": 1.63, "step": 1049 }, { "epoch": 0.08884000338438108, "grad_norm": 1.8046875, "learning_rate": 0.0009814789482682068, "loss": 1.4192, "step": 1050 }, { "epoch": 0.08892461291141382, "grad_norm": 2.03125, "learning_rate": 0.0009814430173287723, "loss": 1.3324, "step": 1051 }, { "epoch": 0.08900922243844657, "grad_norm": 1.9453125, "learning_rate": 0.0009814070522289819, "loss": 1.3373, "step": 1052 }, { "epoch": 0.08909383196547932, "grad_norm": 2.3125, "learning_rate": 0.0009813710529713872, "loss": 1.2407, "step": 1053 }, { "epoch": 0.08917844149251206, "grad_norm": 5.3125, "learning_rate": 0.0009813350195585428, "loss": 1.6743, "step": 1054 }, { "epoch": 0.0892630510195448, "grad_norm": 7.9375, "learning_rate": 0.000981298951993005, "loss": 1.708, "step": 1055 }, { "epoch": 0.08934766054657754, "grad_norm": 3.5625, "learning_rate": 0.0009812628502773335, "loss": 1.9571, "step": 1056 }, { "epoch": 0.08943227007361029, "grad_norm": 4.71875, "learning_rate": 0.0009812267144140897, "loss": 1.6252, "step": 1057 }, { "epoch": 0.08951687960064303, "grad_norm": 3.71875, "learning_rate": 0.000981190544405837, "loss": 1.5166, "step": 1058 }, { "epoch": 0.08960148912767578, "grad_norm": 1.9140625, "learning_rate": 0.0009811543402551426, "loss": 1.2253, "step": 1059 }, { "epoch": 0.08968609865470852, "grad_norm": 8.9375, "learning_rate": 0.000981118101964575, "loss": 1.6067, "step": 1060 }, { "epoch": 0.08977070818174127, "grad_norm": 4.78125, "learning_rate": 0.0009810818295367054, "loss": 2.1231, "step": 1061 }, { "epoch": 0.089855317708774, "grad_norm": 2.265625, "learning_rate": 0.0009810455229741076, "loss": 2.0207, "step": 1062 }, { "epoch": 0.08993992723580675, "grad_norm": 2.390625, "learning_rate": 0.0009810091822793575, "loss": 1.3291, "step": 1063 }, { "epoch": 0.0900245367628395, "grad_norm": 2.140625, "learning_rate": 0.000980972807455034, "loss": 1.372, "step": 1064 }, { "epoch": 0.09010914628987224, "grad_norm": 2.703125, "learning_rate": 0.0009809363985037176, "loss": 1.6328, "step": 1065 }, { "epoch": 0.09019375581690499, "grad_norm": 1.6640625, "learning_rate": 0.0009808999554279919, "loss": 1.2573, "step": 1066 }, { "epoch": 0.09027836534393772, "grad_norm": 2.65625, "learning_rate": 0.0009808634782304427, "loss": 1.8395, "step": 1067 }, { "epoch": 0.09036297487097047, "grad_norm": 2.5625, "learning_rate": 0.0009808269669136583, "loss": 1.7144, "step": 1068 }, { "epoch": 0.09044758439800321, "grad_norm": 2.28125, "learning_rate": 0.0009807904214802288, "loss": 1.6188, "step": 1069 }, { "epoch": 0.09053219392503596, "grad_norm": 2.125, "learning_rate": 0.000980753841932748, "loss": 1.5533, "step": 1070 }, { "epoch": 0.0906168034520687, "grad_norm": 2.59375, "learning_rate": 0.0009807172282738108, "loss": 1.3053, "step": 1071 }, { "epoch": 0.09070141297910145, "grad_norm": 1.8671875, "learning_rate": 0.0009806805805060155, "loss": 1.2844, "step": 1072 }, { "epoch": 0.09078602250613418, "grad_norm": 2.25, "learning_rate": 0.000980643898631962, "loss": 1.3895, "step": 1073 }, { "epoch": 0.09087063203316693, "grad_norm": 2.8125, "learning_rate": 0.0009806071826542534, "loss": 2.4032, "step": 1074 }, { "epoch": 0.09095524156019968, "grad_norm": 2.34375, "learning_rate": 0.0009805704325754946, "loss": 1.4699, "step": 1075 }, { "epoch": 0.09103985108723242, "grad_norm": 2.0, "learning_rate": 0.0009805336483982932, "loss": 1.5715, "step": 1076 }, { "epoch": 0.09112446061426517, "grad_norm": 2.890625, "learning_rate": 0.0009804968301252593, "loss": 1.7278, "step": 1077 }, { "epoch": 0.09120907014129791, "grad_norm": 2.484375, "learning_rate": 0.0009804599777590052, "loss": 1.8499, "step": 1078 }, { "epoch": 0.09129367966833066, "grad_norm": 1.9296875, "learning_rate": 0.000980423091302146, "loss": 1.4527, "step": 1079 }, { "epoch": 0.0913782891953634, "grad_norm": 1.8515625, "learning_rate": 0.0009803861707572985, "loss": 1.2488, "step": 1080 }, { "epoch": 0.09146289872239614, "grad_norm": 1.546875, "learning_rate": 0.0009803492161270825, "loss": 1.1751, "step": 1081 }, { "epoch": 0.09154750824942888, "grad_norm": 2.15625, "learning_rate": 0.0009803122274141202, "loss": 1.6812, "step": 1082 }, { "epoch": 0.09163211777646163, "grad_norm": 1.8359375, "learning_rate": 0.0009802752046210363, "loss": 1.3578, "step": 1083 }, { "epoch": 0.09171672730349438, "grad_norm": 2.328125, "learning_rate": 0.0009802381477504573, "loss": 1.748, "step": 1084 }, { "epoch": 0.09180133683052712, "grad_norm": 2.3125, "learning_rate": 0.0009802010568050127, "loss": 1.4377, "step": 1085 }, { "epoch": 0.09188594635755987, "grad_norm": 1.375, "learning_rate": 0.0009801639317873346, "loss": 1.1001, "step": 1086 }, { "epoch": 0.0919705558845926, "grad_norm": 2.046875, "learning_rate": 0.0009801267727000565, "loss": 1.5289, "step": 1087 }, { "epoch": 0.09205516541162535, "grad_norm": 1.734375, "learning_rate": 0.0009800895795458154, "loss": 1.2743, "step": 1088 }, { "epoch": 0.09213977493865809, "grad_norm": 2.0, "learning_rate": 0.0009800523523272504, "loss": 1.3311, "step": 1089 }, { "epoch": 0.09222438446569084, "grad_norm": 2.5625, "learning_rate": 0.0009800150910470025, "loss": 1.9524, "step": 1090 }, { "epoch": 0.09230899399272358, "grad_norm": 2.59375, "learning_rate": 0.000979977795707716, "loss": 1.273, "step": 1091 }, { "epoch": 0.09239360351975633, "grad_norm": 2.78125, "learning_rate": 0.000979940466312037, "loss": 1.3509, "step": 1092 }, { "epoch": 0.09247821304678906, "grad_norm": 3.53125, "learning_rate": 0.000979903102862614, "loss": 2.1755, "step": 1093 }, { "epoch": 0.09256282257382181, "grad_norm": 3.28125, "learning_rate": 0.0009798657053620983, "loss": 1.535, "step": 1094 }, { "epoch": 0.09264743210085456, "grad_norm": 2.6875, "learning_rate": 0.0009798282738131435, "loss": 1.8625, "step": 1095 }, { "epoch": 0.0927320416278873, "grad_norm": 2.203125, "learning_rate": 0.000979790808218405, "loss": 1.3442, "step": 1096 }, { "epoch": 0.09281665115492005, "grad_norm": 2.0625, "learning_rate": 0.0009797533085805419, "loss": 1.3641, "step": 1097 }, { "epoch": 0.09290126068195279, "grad_norm": 1.96875, "learning_rate": 0.0009797157749022144, "loss": 1.4349, "step": 1098 }, { "epoch": 0.09298587020898554, "grad_norm": 2.5, "learning_rate": 0.0009796782071860857, "loss": 1.5218, "step": 1099 }, { "epoch": 0.09307047973601827, "grad_norm": 2.59375, "learning_rate": 0.0009796406054348216, "loss": 1.777, "step": 1100 }, { "epoch": 0.09315508926305102, "grad_norm": 2.453125, "learning_rate": 0.0009796029696510899, "loss": 1.616, "step": 1101 }, { "epoch": 0.09323969879008376, "grad_norm": 5.5, "learning_rate": 0.0009795652998375612, "loss": 1.91, "step": 1102 }, { "epoch": 0.09332430831711651, "grad_norm": 62.75, "learning_rate": 0.0009795275959969083, "loss": 1.4356, "step": 1103 }, { "epoch": 0.09340891784414924, "grad_norm": 4.125, "learning_rate": 0.0009794898581318063, "loss": 1.2968, "step": 1104 }, { "epoch": 0.093493527371182, "grad_norm": 2.34375, "learning_rate": 0.000979452086244933, "loss": 1.3711, "step": 1105 }, { "epoch": 0.09357813689821474, "grad_norm": 2.015625, "learning_rate": 0.0009794142803389683, "loss": 1.4057, "step": 1106 }, { "epoch": 0.09366274642524748, "grad_norm": 33.75, "learning_rate": 0.0009793764404165948, "loss": 1.554, "step": 1107 }, { "epoch": 0.09374735595228023, "grad_norm": 42.5, "learning_rate": 0.0009793385664804974, "loss": 1.7888, "step": 1108 }, { "epoch": 0.09383196547931297, "grad_norm": 16.25, "learning_rate": 0.0009793006585333635, "loss": 2.6923, "step": 1109 }, { "epoch": 0.09391657500634572, "grad_norm": 65.0, "learning_rate": 0.0009792627165778826, "loss": 1.4619, "step": 1110 }, { "epoch": 0.09400118453337845, "grad_norm": 2.78125, "learning_rate": 0.000979224740616747, "loss": 1.7485, "step": 1111 }, { "epoch": 0.0940857940604112, "grad_norm": 2.8125, "learning_rate": 0.0009791867306526514, "loss": 1.4483, "step": 1112 }, { "epoch": 0.09417040358744394, "grad_norm": 2.109375, "learning_rate": 0.0009791486866882926, "loss": 1.5252, "step": 1113 }, { "epoch": 0.09425501311447669, "grad_norm": 2.4375, "learning_rate": 0.0009791106087263697, "loss": 1.3683, "step": 1114 }, { "epoch": 0.09433962264150944, "grad_norm": 2.359375, "learning_rate": 0.0009790724967695848, "loss": 1.6058, "step": 1115 }, { "epoch": 0.09442423216854218, "grad_norm": 2.625, "learning_rate": 0.000979034350820642, "loss": 1.7291, "step": 1116 }, { "epoch": 0.09450884169557493, "grad_norm": 2.09375, "learning_rate": 0.0009789961708822482, "loss": 1.5204, "step": 1117 }, { "epoch": 0.09459345122260766, "grad_norm": 2.21875, "learning_rate": 0.000978957956957112, "loss": 1.8387, "step": 1118 }, { "epoch": 0.09467806074964041, "grad_norm": 2.109375, "learning_rate": 0.000978919709047945, "loss": 1.9336, "step": 1119 }, { "epoch": 0.09476267027667315, "grad_norm": 1.84375, "learning_rate": 0.0009788814271574611, "loss": 1.4482, "step": 1120 }, { "epoch": 0.0948472798037059, "grad_norm": 1.9140625, "learning_rate": 0.0009788431112883764, "loss": 1.6606, "step": 1121 }, { "epoch": 0.09493188933073864, "grad_norm": 1.7734375, "learning_rate": 0.0009788047614434098, "loss": 1.5378, "step": 1122 }, { "epoch": 0.09501649885777139, "grad_norm": 1.890625, "learning_rate": 0.0009787663776252822, "loss": 1.5033, "step": 1123 }, { "epoch": 0.09510110838480412, "grad_norm": 1.6171875, "learning_rate": 0.000978727959836717, "loss": 1.2545, "step": 1124 }, { "epoch": 0.09518571791183687, "grad_norm": 2.703125, "learning_rate": 0.0009786895080804406, "loss": 2.2442, "step": 1125 }, { "epoch": 0.09527032743886962, "grad_norm": 1.765625, "learning_rate": 0.0009786510223591808, "loss": 1.2678, "step": 1126 }, { "epoch": 0.09535493696590236, "grad_norm": 2.3125, "learning_rate": 0.0009786125026756685, "loss": 1.4403, "step": 1127 }, { "epoch": 0.09543954649293511, "grad_norm": 2.453125, "learning_rate": 0.0009785739490326367, "loss": 1.2975, "step": 1128 }, { "epoch": 0.09552415601996785, "grad_norm": 2.4375, "learning_rate": 0.0009785353614328212, "loss": 2.0455, "step": 1129 }, { "epoch": 0.0956087655470006, "grad_norm": 2.015625, "learning_rate": 0.0009784967398789597, "loss": 1.8203, "step": 1130 }, { "epoch": 0.09569337507403333, "grad_norm": 2.171875, "learning_rate": 0.0009784580843737924, "loss": 1.4559, "step": 1131 }, { "epoch": 0.09577798460106608, "grad_norm": 2.21875, "learning_rate": 0.0009784193949200627, "loss": 1.647, "step": 1132 }, { "epoch": 0.09586259412809882, "grad_norm": 2.15625, "learning_rate": 0.0009783806715205152, "loss": 1.4784, "step": 1133 }, { "epoch": 0.09594720365513157, "grad_norm": 2.375, "learning_rate": 0.0009783419141778979, "loss": 1.7829, "step": 1134 }, { "epoch": 0.0960318131821643, "grad_norm": 2.03125, "learning_rate": 0.0009783031228949603, "loss": 1.3547, "step": 1135 }, { "epoch": 0.09611642270919706, "grad_norm": 2.421875, "learning_rate": 0.0009782642976744551, "loss": 1.7349, "step": 1136 }, { "epoch": 0.0962010322362298, "grad_norm": 2.15625, "learning_rate": 0.0009782254385191373, "loss": 1.7968, "step": 1137 }, { "epoch": 0.09628564176326254, "grad_norm": 2.15625, "learning_rate": 0.0009781865454317638, "loss": 1.6307, "step": 1138 }, { "epoch": 0.09637025129029529, "grad_norm": 2.140625, "learning_rate": 0.000978147618415094, "loss": 1.742, "step": 1139 }, { "epoch": 0.09645486081732803, "grad_norm": 2.21875, "learning_rate": 0.0009781086574718906, "loss": 1.7787, "step": 1140 }, { "epoch": 0.09653947034436078, "grad_norm": 1.7421875, "learning_rate": 0.0009780696626049178, "loss": 1.4011, "step": 1141 }, { "epoch": 0.09662407987139351, "grad_norm": 1.8671875, "learning_rate": 0.000978030633816942, "loss": 1.3218, "step": 1142 }, { "epoch": 0.09670868939842626, "grad_norm": 2.015625, "learning_rate": 0.000977991571110733, "loss": 1.4626, "step": 1143 }, { "epoch": 0.096793298925459, "grad_norm": 2.34375, "learning_rate": 0.0009779524744890622, "loss": 1.7715, "step": 1144 }, { "epoch": 0.09687790845249175, "grad_norm": 2.71875, "learning_rate": 0.0009779133439547037, "loss": 1.9679, "step": 1145 }, { "epoch": 0.0969625179795245, "grad_norm": 1.703125, "learning_rate": 0.000977874179510434, "loss": 1.3441, "step": 1146 }, { "epoch": 0.09704712750655724, "grad_norm": 2.25, "learning_rate": 0.000977834981159032, "loss": 1.8169, "step": 1147 }, { "epoch": 0.09713173703358999, "grad_norm": 2.265625, "learning_rate": 0.0009777957489032789, "loss": 2.0271, "step": 1148 }, { "epoch": 0.09721634656062272, "grad_norm": 1.546875, "learning_rate": 0.0009777564827459584, "loss": 1.3061, "step": 1149 }, { "epoch": 0.09730095608765547, "grad_norm": 1.8828125, "learning_rate": 0.0009777171826898566, "loss": 1.3582, "step": 1150 }, { "epoch": 0.09738556561468821, "grad_norm": 2.625, "learning_rate": 0.000977677848737762, "loss": 1.9084, "step": 1151 }, { "epoch": 0.09747017514172096, "grad_norm": 1.953125, "learning_rate": 0.0009776384808924657, "loss": 1.3422, "step": 1152 }, { "epoch": 0.0975547846687537, "grad_norm": 2.296875, "learning_rate": 0.0009775990791567608, "loss": 1.7844, "step": 1153 }, { "epoch": 0.09763939419578645, "grad_norm": 2.03125, "learning_rate": 0.0009775596435334431, "loss": 1.7936, "step": 1154 }, { "epoch": 0.09772400372281918, "grad_norm": 1.640625, "learning_rate": 0.0009775201740253106, "loss": 1.4572, "step": 1155 }, { "epoch": 0.09780861324985193, "grad_norm": 2.109375, "learning_rate": 0.000977480670635164, "loss": 1.4436, "step": 1156 }, { "epoch": 0.09789322277688468, "grad_norm": 13.25, "learning_rate": 0.000977441133365806, "loss": 1.9157, "step": 1157 }, { "epoch": 0.09797783230391742, "grad_norm": 2.234375, "learning_rate": 0.0009774015622200421, "loss": 1.58, "step": 1158 }, { "epoch": 0.09806244183095017, "grad_norm": 3.484375, "learning_rate": 0.00097736195720068, "loss": 1.1432, "step": 1159 }, { "epoch": 0.0981470513579829, "grad_norm": 2.140625, "learning_rate": 0.0009773223183105297, "loss": 1.3685, "step": 1160 }, { "epoch": 0.09823166088501566, "grad_norm": 2.796875, "learning_rate": 0.0009772826455524042, "loss": 2.6012, "step": 1161 }, { "epoch": 0.09831627041204839, "grad_norm": 1.5390625, "learning_rate": 0.000977242938929118, "loss": 1.5334, "step": 1162 }, { "epoch": 0.09840087993908114, "grad_norm": 1.234375, "learning_rate": 0.0009772031984434885, "loss": 1.0355, "step": 1163 }, { "epoch": 0.09848548946611388, "grad_norm": 1.8359375, "learning_rate": 0.0009771634240983353, "loss": 1.378, "step": 1164 }, { "epoch": 0.09857009899314663, "grad_norm": 1.21875, "learning_rate": 0.000977123615896481, "loss": 1.0549, "step": 1165 }, { "epoch": 0.09865470852017937, "grad_norm": 1.8671875, "learning_rate": 0.00097708377384075, "loss": 1.7946, "step": 1166 }, { "epoch": 0.09873931804721212, "grad_norm": 20.625, "learning_rate": 0.0009770438979339693, "loss": 1.2534, "step": 1167 }, { "epoch": 0.09882392757424487, "grad_norm": 27.25, "learning_rate": 0.0009770039881789678, "loss": 1.6131, "step": 1168 }, { "epoch": 0.0989085371012776, "grad_norm": 161.0, "learning_rate": 0.000976964044578578, "loss": 1.7596, "step": 1169 }, { "epoch": 0.09899314662831035, "grad_norm": 222.0, "learning_rate": 0.000976924067135633, "loss": 1.6324, "step": 1170 }, { "epoch": 0.09907775615534309, "grad_norm": 212.0, "learning_rate": 0.0009768840558529708, "loss": 1.6193, "step": 1171 }, { "epoch": 0.09916236568237584, "grad_norm": 36.25, "learning_rate": 0.000976844010733429, "loss": 1.4593, "step": 1172 }, { "epoch": 0.09924697520940857, "grad_norm": 4.21875, "learning_rate": 0.0009768039317798502, "loss": 1.7706, "step": 1173 }, { "epoch": 0.09933158473644133, "grad_norm": 7.03125, "learning_rate": 0.000976763818995077, "loss": 1.3412, "step": 1174 }, { "epoch": 0.09941619426347406, "grad_norm": 4.5, "learning_rate": 0.0009767236723819562, "loss": 2.4674, "step": 1175 }, { "epoch": 0.09950080379050681, "grad_norm": 2.75, "learning_rate": 0.0009766834919433362, "loss": 2.0204, "step": 1176 }, { "epoch": 0.09958541331753956, "grad_norm": 2.890625, "learning_rate": 0.0009766432776820682, "loss": 1.6212, "step": 1177 }, { "epoch": 0.0996700228445723, "grad_norm": 4.6875, "learning_rate": 0.0009766030296010054, "loss": 1.5863, "step": 1178 }, { "epoch": 0.09975463237160505, "grad_norm": 5.03125, "learning_rate": 0.0009765627477030035, "loss": 1.8125, "step": 1179 }, { "epoch": 0.09983924189863778, "grad_norm": 5.21875, "learning_rate": 0.0009765224319909207, "loss": 1.5478, "step": 1180 }, { "epoch": 0.09992385142567053, "grad_norm": 3.140625, "learning_rate": 0.0009764820824676176, "loss": 1.7456, "step": 1181 }, { "epoch": 0.10000846095270327, "grad_norm": 2.640625, "learning_rate": 0.0009764416991359571, "loss": 1.5472, "step": 1182 }, { "epoch": 0.10009307047973602, "grad_norm": 3.71875, "learning_rate": 0.0009764012819988047, "loss": 1.5653, "step": 1183 }, { "epoch": 0.10017768000676876, "grad_norm": 1.953125, "learning_rate": 0.000976360831059028, "loss": 1.1331, "step": 1184 }, { "epoch": 0.10026228953380151, "grad_norm": 1.8359375, "learning_rate": 0.0009763203463194974, "loss": 1.5689, "step": 1185 }, { "epoch": 0.10034689906083424, "grad_norm": 2.390625, "learning_rate": 0.0009762798277830851, "loss": 1.5637, "step": 1186 }, { "epoch": 0.100431508587867, "grad_norm": 2.203125, "learning_rate": 0.0009762392754526665, "loss": 1.3388, "step": 1187 }, { "epoch": 0.10051611811489974, "grad_norm": 2.359375, "learning_rate": 0.0009761986893311185, "loss": 1.6358, "step": 1188 }, { "epoch": 0.10060072764193248, "grad_norm": 3.171875, "learning_rate": 0.0009761580694213211, "loss": 1.9164, "step": 1189 }, { "epoch": 0.10068533716896523, "grad_norm": 1.9921875, "learning_rate": 0.0009761174157261564, "loss": 1.2581, "step": 1190 }, { "epoch": 0.10076994669599797, "grad_norm": 1.9921875, "learning_rate": 0.000976076728248509, "loss": 1.7589, "step": 1191 }, { "epoch": 0.10085455622303072, "grad_norm": 8.875, "learning_rate": 0.0009760360069912659, "loss": 1.4762, "step": 1192 }, { "epoch": 0.10093916575006345, "grad_norm": 1.921875, "learning_rate": 0.0009759952519573161, "loss": 1.2083, "step": 1193 }, { "epoch": 0.1010237752770962, "grad_norm": 2.140625, "learning_rate": 0.0009759544631495518, "loss": 1.7694, "step": 1194 }, { "epoch": 0.10110838480412894, "grad_norm": 2.1875, "learning_rate": 0.0009759136405708669, "loss": 1.7721, "step": 1195 }, { "epoch": 0.10119299433116169, "grad_norm": 2.03125, "learning_rate": 0.0009758727842241578, "loss": 1.4142, "step": 1196 }, { "epoch": 0.10127760385819443, "grad_norm": 3.90625, "learning_rate": 0.0009758318941123235, "loss": 1.8119, "step": 1197 }, { "epoch": 0.10136221338522718, "grad_norm": 2.09375, "learning_rate": 0.0009757909702382654, "loss": 1.4722, "step": 1198 }, { "epoch": 0.10144682291225993, "grad_norm": 3.734375, "learning_rate": 0.0009757500126048875, "loss": 1.9893, "step": 1199 }, { "epoch": 0.10153143243929266, "grad_norm": 2.140625, "learning_rate": 0.0009757090212150954, "loss": 1.6535, "step": 1200 }, { "epoch": 0.10161604196632541, "grad_norm": 2.6875, "learning_rate": 0.0009756679960717978, "loss": 1.5505, "step": 1201 }, { "epoch": 0.10170065149335815, "grad_norm": 1.9140625, "learning_rate": 0.0009756269371779056, "loss": 1.2525, "step": 1202 }, { "epoch": 0.1017852610203909, "grad_norm": 2.3125, "learning_rate": 0.0009755858445363321, "loss": 1.2738, "step": 1203 }, { "epoch": 0.10186987054742364, "grad_norm": 7.9375, "learning_rate": 0.0009755447181499932, "loss": 2.0184, "step": 1204 }, { "epoch": 0.10195448007445639, "grad_norm": 2.09375, "learning_rate": 0.0009755035580218067, "loss": 1.5274, "step": 1205 }, { "epoch": 0.10203908960148912, "grad_norm": 1.96875, "learning_rate": 0.0009754623641546931, "loss": 1.43, "step": 1206 }, { "epoch": 0.10212369912852187, "grad_norm": 4.625, "learning_rate": 0.0009754211365515755, "loss": 1.9757, "step": 1207 }, { "epoch": 0.10220830865555462, "grad_norm": 2.703125, "learning_rate": 0.0009753798752153789, "loss": 1.4827, "step": 1208 }, { "epoch": 0.10229291818258736, "grad_norm": 2.296875, "learning_rate": 0.0009753385801490311, "loss": 1.4773, "step": 1209 }, { "epoch": 0.10237752770962011, "grad_norm": 2.171875, "learning_rate": 0.0009752972513554621, "loss": 1.0962, "step": 1210 }, { "epoch": 0.10246213723665284, "grad_norm": 2.953125, "learning_rate": 0.0009752558888376044, "loss": 1.6157, "step": 1211 }, { "epoch": 0.1025467467636856, "grad_norm": 3.03125, "learning_rate": 0.000975214492598393, "loss": 2.0855, "step": 1212 }, { "epoch": 0.10263135629071833, "grad_norm": 1.96875, "learning_rate": 0.0009751730626407646, "loss": 1.7336, "step": 1213 }, { "epoch": 0.10271596581775108, "grad_norm": 1.9296875, "learning_rate": 0.0009751315989676595, "loss": 1.2921, "step": 1214 }, { "epoch": 0.10280057534478382, "grad_norm": 2.171875, "learning_rate": 0.0009750901015820192, "loss": 1.7493, "step": 1215 }, { "epoch": 0.10288518487181657, "grad_norm": 1.9921875, "learning_rate": 0.0009750485704867884, "loss": 1.5447, "step": 1216 }, { "epoch": 0.1029697943988493, "grad_norm": 1.625, "learning_rate": 0.0009750070056849139, "loss": 1.2892, "step": 1217 }, { "epoch": 0.10305440392588205, "grad_norm": 1.9765625, "learning_rate": 0.0009749654071793445, "loss": 1.7133, "step": 1218 }, { "epoch": 0.1031390134529148, "grad_norm": 2.203125, "learning_rate": 0.0009749237749730324, "loss": 1.5131, "step": 1219 }, { "epoch": 0.10322362297994754, "grad_norm": 1.859375, "learning_rate": 0.0009748821090689311, "loss": 1.7161, "step": 1220 }, { "epoch": 0.10330823250698029, "grad_norm": 1.65625, "learning_rate": 0.0009748404094699972, "loss": 1.1276, "step": 1221 }, { "epoch": 0.10339284203401303, "grad_norm": 1.875, "learning_rate": 0.0009747986761791894, "loss": 1.2483, "step": 1222 }, { "epoch": 0.10347745156104578, "grad_norm": 2.984375, "learning_rate": 0.0009747569091994688, "loss": 1.5523, "step": 1223 }, { "epoch": 0.10356206108807851, "grad_norm": 2.28125, "learning_rate": 0.0009747151085337988, "loss": 1.4974, "step": 1224 }, { "epoch": 0.10364667061511126, "grad_norm": 3.78125, "learning_rate": 0.0009746732741851458, "loss": 1.578, "step": 1225 }, { "epoch": 0.103731280142144, "grad_norm": 2.203125, "learning_rate": 0.0009746314061564776, "loss": 1.4485, "step": 1226 }, { "epoch": 0.10381588966917675, "grad_norm": 3.0, "learning_rate": 0.0009745895044507653, "loss": 1.6686, "step": 1227 }, { "epoch": 0.10390049919620949, "grad_norm": 2.921875, "learning_rate": 0.0009745475690709817, "loss": 2.2154, "step": 1228 }, { "epoch": 0.10398510872324224, "grad_norm": 2.359375, "learning_rate": 0.0009745056000201025, "loss": 1.263, "step": 1229 }, { "epoch": 0.10406971825027499, "grad_norm": 2.40625, "learning_rate": 0.0009744635973011053, "loss": 1.6155, "step": 1230 }, { "epoch": 0.10415432777730772, "grad_norm": 1.921875, "learning_rate": 0.0009744215609169708, "loss": 1.6001, "step": 1231 }, { "epoch": 0.10423893730434047, "grad_norm": 3.75, "learning_rate": 0.0009743794908706813, "loss": 1.5411, "step": 1232 }, { "epoch": 0.10432354683137321, "grad_norm": 2.71875, "learning_rate": 0.0009743373871652219, "loss": 1.705, "step": 1233 }, { "epoch": 0.10440815635840596, "grad_norm": 2.46875, "learning_rate": 0.0009742952498035802, "loss": 1.4604, "step": 1234 }, { "epoch": 0.1044927658854387, "grad_norm": 2.984375, "learning_rate": 0.0009742530787887458, "loss": 1.7508, "step": 1235 }, { "epoch": 0.10457737541247145, "grad_norm": 2.4375, "learning_rate": 0.000974210874123711, "loss": 1.531, "step": 1236 }, { "epoch": 0.10466198493950418, "grad_norm": 12.4375, "learning_rate": 0.0009741686358114704, "loss": 1.711, "step": 1237 }, { "epoch": 0.10474659446653693, "grad_norm": 2.0, "learning_rate": 0.0009741263638550211, "loss": 1.2051, "step": 1238 }, { "epoch": 0.10483120399356968, "grad_norm": 2.546875, "learning_rate": 0.0009740840582573622, "loss": 1.6329, "step": 1239 }, { "epoch": 0.10491581352060242, "grad_norm": 2.390625, "learning_rate": 0.0009740417190214956, "loss": 1.3478, "step": 1240 }, { "epoch": 0.10500042304763517, "grad_norm": 3.5625, "learning_rate": 0.0009739993461504255, "loss": 2.0825, "step": 1241 }, { "epoch": 0.1050850325746679, "grad_norm": 3.484375, "learning_rate": 0.0009739569396471584, "loss": 1.5619, "step": 1242 }, { "epoch": 0.10516964210170066, "grad_norm": 4.6875, "learning_rate": 0.0009739144995147032, "loss": 1.6857, "step": 1243 }, { "epoch": 0.10525425162873339, "grad_norm": 2.734375, "learning_rate": 0.0009738720257560712, "loss": 2.055, "step": 1244 }, { "epoch": 0.10533886115576614, "grad_norm": 3.75, "learning_rate": 0.0009738295183742761, "loss": 1.5064, "step": 1245 }, { "epoch": 0.10542347068279888, "grad_norm": 3.09375, "learning_rate": 0.0009737869773723338, "loss": 1.6104, "step": 1246 }, { "epoch": 0.10550808020983163, "grad_norm": 2.421875, "learning_rate": 0.0009737444027532632, "loss": 1.384, "step": 1247 }, { "epoch": 0.10559268973686436, "grad_norm": 2.5625, "learning_rate": 0.0009737017945200846, "loss": 1.2434, "step": 1248 }, { "epoch": 0.10567729926389711, "grad_norm": 10.5, "learning_rate": 0.0009736591526758215, "loss": 1.3119, "step": 1249 }, { "epoch": 0.10576190879092986, "grad_norm": 4.46875, "learning_rate": 0.0009736164772234996, "loss": 1.5795, "step": 1250 }, { "epoch": 0.1058465183179626, "grad_norm": 2.125, "learning_rate": 0.0009735737681661467, "loss": 1.509, "step": 1251 }, { "epoch": 0.10593112784499535, "grad_norm": 2.109375, "learning_rate": 0.0009735310255067933, "loss": 1.8188, "step": 1252 }, { "epoch": 0.10601573737202809, "grad_norm": 2.03125, "learning_rate": 0.0009734882492484722, "loss": 1.4994, "step": 1253 }, { "epoch": 0.10610034689906084, "grad_norm": 2.21875, "learning_rate": 0.0009734454393942185, "loss": 1.4028, "step": 1254 }, { "epoch": 0.10618495642609357, "grad_norm": 1.890625, "learning_rate": 0.0009734025959470697, "loss": 1.4703, "step": 1255 }, { "epoch": 0.10626956595312632, "grad_norm": 2.296875, "learning_rate": 0.0009733597189100658, "loss": 1.5421, "step": 1256 }, { "epoch": 0.10635417548015906, "grad_norm": 2.03125, "learning_rate": 0.0009733168082862489, "loss": 1.2942, "step": 1257 }, { "epoch": 0.10643878500719181, "grad_norm": 60.25, "learning_rate": 0.0009732738640786641, "loss": 1.1292, "step": 1258 }, { "epoch": 0.10652339453422455, "grad_norm": 7.40625, "learning_rate": 0.000973230886290358, "loss": 1.8368, "step": 1259 }, { "epoch": 0.1066080040612573, "grad_norm": 2.40625, "learning_rate": 0.0009731878749243804, "loss": 1.3996, "step": 1260 }, { "epoch": 0.10669261358829005, "grad_norm": 1.78125, "learning_rate": 0.0009731448299837829, "loss": 1.2125, "step": 1261 }, { "epoch": 0.10677722311532278, "grad_norm": 2.71875, "learning_rate": 0.0009731017514716196, "loss": 2.307, "step": 1262 }, { "epoch": 0.10686183264235553, "grad_norm": 3.0, "learning_rate": 0.0009730586393909475, "loss": 1.9541, "step": 1263 }, { "epoch": 0.10694644216938827, "grad_norm": 2.203125, "learning_rate": 0.0009730154937448254, "loss": 1.3867, "step": 1264 }, { "epoch": 0.10703105169642102, "grad_norm": 1.9140625, "learning_rate": 0.0009729723145363146, "loss": 1.5045, "step": 1265 }, { "epoch": 0.10711566122345376, "grad_norm": 2.1875, "learning_rate": 0.0009729291017684789, "loss": 1.5067, "step": 1266 }, { "epoch": 0.1072002707504865, "grad_norm": 3.75, "learning_rate": 0.0009728858554443844, "loss": 1.6486, "step": 1267 }, { "epoch": 0.10728488027751924, "grad_norm": 1.875, "learning_rate": 0.0009728425755670995, "loss": 1.3745, "step": 1268 }, { "epoch": 0.10736948980455199, "grad_norm": 2.546875, "learning_rate": 0.0009727992621396954, "loss": 1.9907, "step": 1269 }, { "epoch": 0.10745409933158474, "grad_norm": 2.3125, "learning_rate": 0.0009727559151652451, "loss": 1.2928, "step": 1270 }, { "epoch": 0.10753870885861748, "grad_norm": 2.671875, "learning_rate": 0.0009727125346468241, "loss": 1.7407, "step": 1271 }, { "epoch": 0.10762331838565023, "grad_norm": 2.984375, "learning_rate": 0.0009726691205875109, "loss": 1.6605, "step": 1272 }, { "epoch": 0.10770792791268297, "grad_norm": 2.78125, "learning_rate": 0.0009726256729903855, "loss": 2.0841, "step": 1273 }, { "epoch": 0.10779253743971572, "grad_norm": 2.875, "learning_rate": 0.000972582191858531, "loss": 2.1977, "step": 1274 }, { "epoch": 0.10787714696674845, "grad_norm": 1.921875, "learning_rate": 0.000972538677195032, "loss": 1.8519, "step": 1275 }, { "epoch": 0.1079617564937812, "grad_norm": 2.4375, "learning_rate": 0.0009724951290029767, "loss": 2.0959, "step": 1276 }, { "epoch": 0.10804636602081394, "grad_norm": 2.4375, "learning_rate": 0.0009724515472854547, "loss": 2.4057, "step": 1277 }, { "epoch": 0.10813097554784669, "grad_norm": 2.578125, "learning_rate": 0.0009724079320455584, "loss": 1.0793, "step": 1278 }, { "epoch": 0.10821558507487943, "grad_norm": 2.5, "learning_rate": 0.0009723642832863824, "loss": 1.5975, "step": 1279 }, { "epoch": 0.10830019460191218, "grad_norm": 1.90625, "learning_rate": 0.0009723206010110238, "loss": 1.6228, "step": 1280 }, { "epoch": 0.10838480412894493, "grad_norm": 1.75, "learning_rate": 0.0009722768852225819, "loss": 1.3604, "step": 1281 }, { "epoch": 0.10846941365597766, "grad_norm": 2.015625, "learning_rate": 0.0009722331359241587, "loss": 1.8574, "step": 1282 }, { "epoch": 0.10855402318301041, "grad_norm": 3.046875, "learning_rate": 0.0009721893531188584, "loss": 1.2707, "step": 1283 }, { "epoch": 0.10863863271004315, "grad_norm": 2.921875, "learning_rate": 0.0009721455368097876, "loss": 1.3574, "step": 1284 }, { "epoch": 0.1087232422370759, "grad_norm": 9.5625, "learning_rate": 0.000972101687000055, "loss": 1.623, "step": 1285 }, { "epoch": 0.10880785176410863, "grad_norm": 3.59375, "learning_rate": 0.0009720578036927721, "loss": 1.2452, "step": 1286 }, { "epoch": 0.10889246129114138, "grad_norm": 2.1875, "learning_rate": 0.0009720138868910524, "loss": 1.2837, "step": 1287 }, { "epoch": 0.10897707081817412, "grad_norm": 3.046875, "learning_rate": 0.0009719699365980124, "loss": 1.3889, "step": 1288 }, { "epoch": 0.10906168034520687, "grad_norm": 3.03125, "learning_rate": 0.0009719259528167704, "loss": 2.0841, "step": 1289 }, { "epoch": 0.10914628987223961, "grad_norm": 1.8046875, "learning_rate": 0.0009718819355504468, "loss": 1.3764, "step": 1290 }, { "epoch": 0.10923089939927236, "grad_norm": 2.1875, "learning_rate": 0.0009718378848021654, "loss": 1.4692, "step": 1291 }, { "epoch": 0.10931550892630511, "grad_norm": 1.890625, "learning_rate": 0.0009717938005750513, "loss": 1.4256, "step": 1292 }, { "epoch": 0.10940011845333784, "grad_norm": 2.0625, "learning_rate": 0.0009717496828722329, "loss": 1.5266, "step": 1293 }, { "epoch": 0.1094847279803706, "grad_norm": 1.6875, "learning_rate": 0.0009717055316968402, "loss": 1.4898, "step": 1294 }, { "epoch": 0.10956933750740333, "grad_norm": 1.71875, "learning_rate": 0.0009716613470520059, "loss": 1.4224, "step": 1295 }, { "epoch": 0.10965394703443608, "grad_norm": 2.4375, "learning_rate": 0.0009716171289408654, "loss": 1.3272, "step": 1296 }, { "epoch": 0.10973855656146882, "grad_norm": 2.15625, "learning_rate": 0.000971572877366556, "loss": 2.1372, "step": 1297 }, { "epoch": 0.10982316608850157, "grad_norm": 2.109375, "learning_rate": 0.0009715285923322173, "loss": 1.3308, "step": 1298 }, { "epoch": 0.1099077756155343, "grad_norm": 1.8046875, "learning_rate": 0.0009714842738409917, "loss": 1.1388, "step": 1299 }, { "epoch": 0.10999238514256705, "grad_norm": 1.6484375, "learning_rate": 0.000971439921896024, "loss": 1.3074, "step": 1300 }, { "epoch": 0.1100769946695998, "grad_norm": 2.03125, "learning_rate": 0.0009713955365004605, "loss": 1.4713, "step": 1301 }, { "epoch": 0.11016160419663254, "grad_norm": 1.8359375, "learning_rate": 0.0009713511176574513, "loss": 1.4709, "step": 1302 }, { "epoch": 0.11024621372366529, "grad_norm": 2.34375, "learning_rate": 0.0009713066653701477, "loss": 2.0952, "step": 1303 }, { "epoch": 0.11033082325069803, "grad_norm": 2.09375, "learning_rate": 0.0009712621796417037, "loss": 1.7456, "step": 1304 }, { "epoch": 0.11041543277773078, "grad_norm": 2.078125, "learning_rate": 0.000971217660475276, "loss": 1.4681, "step": 1305 }, { "epoch": 0.11050004230476351, "grad_norm": 1.6171875, "learning_rate": 0.0009711731078740232, "loss": 1.3407, "step": 1306 }, { "epoch": 0.11058465183179626, "grad_norm": 1.828125, "learning_rate": 0.0009711285218411066, "loss": 1.6956, "step": 1307 }, { "epoch": 0.110669261358829, "grad_norm": 1.8046875, "learning_rate": 0.0009710839023796896, "loss": 1.3339, "step": 1308 }, { "epoch": 0.11075387088586175, "grad_norm": 1.984375, "learning_rate": 0.0009710392494929383, "loss": 1.8294, "step": 1309 }, { "epoch": 0.11083848041289449, "grad_norm": 1.90625, "learning_rate": 0.0009709945631840211, "loss": 1.6254, "step": 1310 }, { "epoch": 0.11092308993992724, "grad_norm": 2.171875, "learning_rate": 0.0009709498434561085, "loss": 1.7818, "step": 1311 }, { "epoch": 0.11100769946695999, "grad_norm": 1.640625, "learning_rate": 0.0009709050903123735, "loss": 1.2549, "step": 1312 }, { "epoch": 0.11109230899399272, "grad_norm": 1.9375, "learning_rate": 0.0009708603037559918, "loss": 1.4361, "step": 1313 }, { "epoch": 0.11117691852102547, "grad_norm": 2.25, "learning_rate": 0.0009708154837901409, "loss": 1.6412, "step": 1314 }, { "epoch": 0.11126152804805821, "grad_norm": 1.9453125, "learning_rate": 0.000970770630418001, "loss": 1.2866, "step": 1315 }, { "epoch": 0.11134613757509096, "grad_norm": 1.421875, "learning_rate": 0.0009707257436427548, "loss": 1.0412, "step": 1316 }, { "epoch": 0.1114307471021237, "grad_norm": 4.3125, "learning_rate": 0.000970680823467587, "loss": 1.2454, "step": 1317 }, { "epoch": 0.11151535662915645, "grad_norm": 2.328125, "learning_rate": 0.0009706358698956851, "loss": 2.0625, "step": 1318 }, { "epoch": 0.11159996615618918, "grad_norm": 1.8828125, "learning_rate": 0.0009705908829302386, "loss": 1.6725, "step": 1319 }, { "epoch": 0.11168457568322193, "grad_norm": 1.984375, "learning_rate": 0.0009705458625744395, "loss": 1.2903, "step": 1320 }, { "epoch": 0.11176918521025467, "grad_norm": 1.9453125, "learning_rate": 0.0009705008088314819, "loss": 1.3689, "step": 1321 }, { "epoch": 0.11185379473728742, "grad_norm": 2.796875, "learning_rate": 0.0009704557217045632, "loss": 1.4749, "step": 1322 }, { "epoch": 0.11193840426432017, "grad_norm": 2.65625, "learning_rate": 0.0009704106011968821, "loss": 1.4468, "step": 1323 }, { "epoch": 0.1120230137913529, "grad_norm": 3.65625, "learning_rate": 0.0009703654473116401, "loss": 1.4558, "step": 1324 }, { "epoch": 0.11210762331838565, "grad_norm": 2.1875, "learning_rate": 0.0009703202600520411, "loss": 1.3395, "step": 1325 }, { "epoch": 0.11219223284541839, "grad_norm": 2.03125, "learning_rate": 0.0009702750394212913, "loss": 1.2188, "step": 1326 }, { "epoch": 0.11227684237245114, "grad_norm": 2.34375, "learning_rate": 0.0009702297854225992, "loss": 1.7909, "step": 1327 }, { "epoch": 0.11236145189948388, "grad_norm": 2.109375, "learning_rate": 0.0009701844980591761, "loss": 1.3553, "step": 1328 }, { "epoch": 0.11244606142651663, "grad_norm": 1.8515625, "learning_rate": 0.0009701391773342349, "loss": 1.2229, "step": 1329 }, { "epoch": 0.11253067095354936, "grad_norm": 1.8515625, "learning_rate": 0.0009700938232509916, "loss": 1.7963, "step": 1330 }, { "epoch": 0.11261528048058211, "grad_norm": 1.59375, "learning_rate": 0.000970048435812664, "loss": 1.1655, "step": 1331 }, { "epoch": 0.11269989000761486, "grad_norm": 1.640625, "learning_rate": 0.0009700030150224728, "loss": 1.1911, "step": 1332 }, { "epoch": 0.1127844995346476, "grad_norm": 7.03125, "learning_rate": 0.0009699575608836407, "loss": 1.716, "step": 1333 }, { "epoch": 0.11286910906168035, "grad_norm": 5.5625, "learning_rate": 0.0009699120733993926, "loss": 1.4594, "step": 1334 }, { "epoch": 0.11295371858871309, "grad_norm": 2.34375, "learning_rate": 0.0009698665525729565, "loss": 1.6004, "step": 1335 }, { "epoch": 0.11303832811574584, "grad_norm": 3.90625, "learning_rate": 0.0009698209984075618, "loss": 1.4957, "step": 1336 }, { "epoch": 0.11312293764277857, "grad_norm": 2.4375, "learning_rate": 0.0009697754109064411, "loss": 1.6579, "step": 1337 }, { "epoch": 0.11320754716981132, "grad_norm": 1.6953125, "learning_rate": 0.0009697297900728288, "loss": 1.1856, "step": 1338 }, { "epoch": 0.11329215669684406, "grad_norm": 2.09375, "learning_rate": 0.0009696841359099622, "loss": 1.5672, "step": 1339 }, { "epoch": 0.11337676622387681, "grad_norm": 2.265625, "learning_rate": 0.0009696384484210803, "loss": 1.6304, "step": 1340 }, { "epoch": 0.11346137575090955, "grad_norm": 1.5, "learning_rate": 0.0009695927276094249, "loss": 1.3056, "step": 1341 }, { "epoch": 0.1135459852779423, "grad_norm": 2.453125, "learning_rate": 0.00096954697347824, "loss": 1.8999, "step": 1342 }, { "epoch": 0.11363059480497505, "grad_norm": 1.953125, "learning_rate": 0.0009695011860307725, "loss": 1.573, "step": 1343 }, { "epoch": 0.11371520433200778, "grad_norm": 1.875, "learning_rate": 0.0009694553652702707, "loss": 1.1649, "step": 1344 }, { "epoch": 0.11379981385904053, "grad_norm": 1.8671875, "learning_rate": 0.0009694095111999858, "loss": 1.1248, "step": 1345 }, { "epoch": 0.11388442338607327, "grad_norm": 1.890625, "learning_rate": 0.0009693636238231718, "loss": 1.5645, "step": 1346 }, { "epoch": 0.11396903291310602, "grad_norm": 5.1875, "learning_rate": 0.0009693177031430842, "loss": 1.5147, "step": 1347 }, { "epoch": 0.11405364244013876, "grad_norm": 3.296875, "learning_rate": 0.0009692717491629812, "loss": 1.2638, "step": 1348 }, { "epoch": 0.1141382519671715, "grad_norm": 8.125, "learning_rate": 0.0009692257618861236, "loss": 1.2806, "step": 1349 }, { "epoch": 0.11422286149420424, "grad_norm": 2.09375, "learning_rate": 0.0009691797413157744, "loss": 1.6189, "step": 1350 }, { "epoch": 0.11430747102123699, "grad_norm": 2.390625, "learning_rate": 0.0009691336874551991, "loss": 1.6729, "step": 1351 }, { "epoch": 0.11439208054826973, "grad_norm": 2.203125, "learning_rate": 0.0009690876003076649, "loss": 1.3271, "step": 1352 }, { "epoch": 0.11447669007530248, "grad_norm": 2.15625, "learning_rate": 0.0009690414798764422, "loss": 1.3906, "step": 1353 }, { "epoch": 0.11456129960233523, "grad_norm": 2.078125, "learning_rate": 0.0009689953261648036, "loss": 1.3052, "step": 1354 }, { "epoch": 0.11464590912936796, "grad_norm": 2.109375, "learning_rate": 0.0009689491391760236, "loss": 1.2964, "step": 1355 }, { "epoch": 0.11473051865640072, "grad_norm": 2.234375, "learning_rate": 0.0009689029189133794, "loss": 2.0084, "step": 1356 }, { "epoch": 0.11481512818343345, "grad_norm": 1.7890625, "learning_rate": 0.0009688566653801508, "loss": 1.3417, "step": 1357 }, { "epoch": 0.1148997377104662, "grad_norm": 1.8203125, "learning_rate": 0.0009688103785796193, "loss": 1.3176, "step": 1358 }, { "epoch": 0.11498434723749894, "grad_norm": 2.03125, "learning_rate": 0.0009687640585150693, "loss": 1.642, "step": 1359 }, { "epoch": 0.11506895676453169, "grad_norm": 1.84375, "learning_rate": 0.0009687177051897875, "loss": 1.4681, "step": 1360 }, { "epoch": 0.11515356629156442, "grad_norm": 1.5703125, "learning_rate": 0.0009686713186070625, "loss": 1.4532, "step": 1361 }, { "epoch": 0.11523817581859717, "grad_norm": 2.90625, "learning_rate": 0.0009686248987701863, "loss": 2.2981, "step": 1362 }, { "epoch": 0.11532278534562992, "grad_norm": 1.859375, "learning_rate": 0.0009685784456824518, "loss": 1.9279, "step": 1363 }, { "epoch": 0.11540739487266266, "grad_norm": 2.359375, "learning_rate": 0.0009685319593471555, "loss": 2.327, "step": 1364 }, { "epoch": 0.11549200439969541, "grad_norm": 1.46875, "learning_rate": 0.0009684854397675958, "loss": 1.3184, "step": 1365 }, { "epoch": 0.11557661392672815, "grad_norm": 1.9453125, "learning_rate": 0.0009684388869470731, "loss": 1.2853, "step": 1366 }, { "epoch": 0.1156612234537609, "grad_norm": 1.6796875, "learning_rate": 0.0009683923008888909, "loss": 1.4037, "step": 1367 }, { "epoch": 0.11574583298079363, "grad_norm": 1.9296875, "learning_rate": 0.0009683456815963545, "loss": 1.4846, "step": 1368 }, { "epoch": 0.11583044250782638, "grad_norm": 1.828125, "learning_rate": 0.0009682990290727718, "loss": 1.6489, "step": 1369 }, { "epoch": 0.11591505203485912, "grad_norm": 2.015625, "learning_rate": 0.0009682523433214529, "loss": 1.8534, "step": 1370 }, { "epoch": 0.11599966156189187, "grad_norm": 1.421875, "learning_rate": 0.0009682056243457104, "loss": 1.2607, "step": 1371 }, { "epoch": 0.1160842710889246, "grad_norm": 1.875, "learning_rate": 0.0009681588721488592, "loss": 1.6378, "step": 1372 }, { "epoch": 0.11616888061595736, "grad_norm": 2.5, "learning_rate": 0.0009681120867342166, "loss": 1.2047, "step": 1373 }, { "epoch": 0.1162534901429901, "grad_norm": 2.265625, "learning_rate": 0.0009680652681051021, "loss": 1.4987, "step": 1374 }, { "epoch": 0.11633809967002284, "grad_norm": 2.3125, "learning_rate": 0.0009680184162648378, "loss": 2.2783, "step": 1375 }, { "epoch": 0.1164227091970556, "grad_norm": 2.15625, "learning_rate": 0.000967971531216748, "loss": 1.6945, "step": 1376 }, { "epoch": 0.11650731872408833, "grad_norm": 1.765625, "learning_rate": 0.0009679246129641592, "loss": 1.4369, "step": 1377 }, { "epoch": 0.11659192825112108, "grad_norm": 1.875, "learning_rate": 0.0009678776615104008, "loss": 1.2187, "step": 1378 }, { "epoch": 0.11667653777815382, "grad_norm": 1.7109375, "learning_rate": 0.0009678306768588039, "loss": 1.3711, "step": 1379 }, { "epoch": 0.11676114730518657, "grad_norm": 1.8359375, "learning_rate": 0.0009677836590127024, "loss": 1.5878, "step": 1380 }, { "epoch": 0.1168457568322193, "grad_norm": 2.25, "learning_rate": 0.0009677366079754323, "loss": 2.3004, "step": 1381 }, { "epoch": 0.11693036635925205, "grad_norm": 2.046875, "learning_rate": 0.0009676895237503322, "loss": 1.9087, "step": 1382 }, { "epoch": 0.11701497588628479, "grad_norm": 1.828125, "learning_rate": 0.0009676424063407427, "loss": 1.231, "step": 1383 }, { "epoch": 0.11709958541331754, "grad_norm": 2.0625, "learning_rate": 0.0009675952557500071, "loss": 1.8616, "step": 1384 }, { "epoch": 0.11718419494035029, "grad_norm": 2.28125, "learning_rate": 0.0009675480719814712, "loss": 1.3295, "step": 1385 }, { "epoch": 0.11726880446738303, "grad_norm": 2.09375, "learning_rate": 0.0009675008550384826, "loss": 1.6486, "step": 1386 }, { "epoch": 0.11735341399441578, "grad_norm": 1.875, "learning_rate": 0.0009674536049243913, "loss": 1.4837, "step": 1387 }, { "epoch": 0.11743802352144851, "grad_norm": 1.40625, "learning_rate": 0.0009674063216425503, "loss": 1.0784, "step": 1388 }, { "epoch": 0.11752263304848126, "grad_norm": 1.8046875, "learning_rate": 0.0009673590051963145, "loss": 1.2668, "step": 1389 }, { "epoch": 0.117607242575514, "grad_norm": 2.15625, "learning_rate": 0.000967311655589041, "loss": 1.7921, "step": 1390 }, { "epoch": 0.11769185210254675, "grad_norm": 2.171875, "learning_rate": 0.0009672642728240896, "loss": 1.5237, "step": 1391 }, { "epoch": 0.11777646162957948, "grad_norm": 1.8515625, "learning_rate": 0.0009672168569048221, "loss": 1.3162, "step": 1392 }, { "epoch": 0.11786107115661223, "grad_norm": 2.578125, "learning_rate": 0.0009671694078346031, "loss": 2.2811, "step": 1393 }, { "epoch": 0.11794568068364499, "grad_norm": 1.796875, "learning_rate": 0.0009671219256167993, "loss": 1.4129, "step": 1394 }, { "epoch": 0.11803029021067772, "grad_norm": 2.140625, "learning_rate": 0.0009670744102547795, "loss": 1.5846, "step": 1395 }, { "epoch": 0.11811489973771047, "grad_norm": 2.21875, "learning_rate": 0.0009670268617519153, "loss": 1.3402, "step": 1396 }, { "epoch": 0.11819950926474321, "grad_norm": 2.3125, "learning_rate": 0.0009669792801115804, "loss": 1.4835, "step": 1397 }, { "epoch": 0.11828411879177596, "grad_norm": 1.5234375, "learning_rate": 0.000966931665337151, "loss": 1.1357, "step": 1398 }, { "epoch": 0.1183687283188087, "grad_norm": 1.703125, "learning_rate": 0.0009668840174320055, "loss": 1.1846, "step": 1399 }, { "epoch": 0.11845333784584144, "grad_norm": 2.25, "learning_rate": 0.0009668363363995248, "loss": 1.4786, "step": 1400 }, { "epoch": 0.11853794737287418, "grad_norm": 2.84375, "learning_rate": 0.0009667886222430919, "loss": 2.3769, "step": 1401 }, { "epoch": 0.11862255689990693, "grad_norm": 1.8203125, "learning_rate": 0.0009667408749660924, "loss": 1.7232, "step": 1402 }, { "epoch": 0.11870716642693967, "grad_norm": 2.265625, "learning_rate": 0.0009666930945719141, "loss": 2.4226, "step": 1403 }, { "epoch": 0.11879177595397242, "grad_norm": 1.828125, "learning_rate": 0.0009666452810639474, "loss": 2.1532, "step": 1404 }, { "epoch": 0.11887638548100517, "grad_norm": 1.8828125, "learning_rate": 0.0009665974344455848, "loss": 1.4863, "step": 1405 }, { "epoch": 0.1189609950080379, "grad_norm": 1.7734375, "learning_rate": 0.000966549554720221, "loss": 1.7287, "step": 1406 }, { "epoch": 0.11904560453507065, "grad_norm": 1.4375, "learning_rate": 0.0009665016418912535, "loss": 1.4212, "step": 1407 }, { "epoch": 0.11913021406210339, "grad_norm": 2.0, "learning_rate": 0.0009664536959620817, "loss": 1.7071, "step": 1408 }, { "epoch": 0.11921482358913614, "grad_norm": 2.140625, "learning_rate": 0.000966405716936108, "loss": 2.0373, "step": 1409 }, { "epoch": 0.11929943311616888, "grad_norm": 2.125, "learning_rate": 0.0009663577048167363, "loss": 1.4342, "step": 1410 }, { "epoch": 0.11938404264320163, "grad_norm": 1.890625, "learning_rate": 0.0009663096596073731, "loss": 1.349, "step": 1411 }, { "epoch": 0.11946865217023436, "grad_norm": 2.078125, "learning_rate": 0.000966261581311428, "loss": 1.5168, "step": 1412 }, { "epoch": 0.11955326169726711, "grad_norm": 1.7890625, "learning_rate": 0.0009662134699323118, "loss": 1.5704, "step": 1413 }, { "epoch": 0.11963787122429985, "grad_norm": 2.21875, "learning_rate": 0.0009661653254734385, "loss": 1.589, "step": 1414 }, { "epoch": 0.1197224807513326, "grad_norm": 1.703125, "learning_rate": 0.000966117147938224, "loss": 1.2434, "step": 1415 }, { "epoch": 0.11980709027836535, "grad_norm": 1.7734375, "learning_rate": 0.0009660689373300869, "loss": 1.7883, "step": 1416 }, { "epoch": 0.11989169980539809, "grad_norm": 1.9140625, "learning_rate": 0.0009660206936524476, "loss": 1.288, "step": 1417 }, { "epoch": 0.11997630933243084, "grad_norm": 1.90625, "learning_rate": 0.0009659724169087295, "loss": 1.8011, "step": 1418 }, { "epoch": 0.12006091885946357, "grad_norm": 1.953125, "learning_rate": 0.0009659241071023579, "loss": 1.3013, "step": 1419 }, { "epoch": 0.12014552838649632, "grad_norm": 2.15625, "learning_rate": 0.0009658757642367606, "loss": 1.1419, "step": 1420 }, { "epoch": 0.12023013791352906, "grad_norm": 2.59375, "learning_rate": 0.0009658273883153678, "loss": 1.8795, "step": 1421 }, { "epoch": 0.12031474744056181, "grad_norm": 2.1875, "learning_rate": 0.0009657789793416117, "loss": 1.4308, "step": 1422 }, { "epoch": 0.12039935696759455, "grad_norm": 2.140625, "learning_rate": 0.0009657305373189275, "loss": 1.6976, "step": 1423 }, { "epoch": 0.1204839664946273, "grad_norm": 2.296875, "learning_rate": 0.000965682062250752, "loss": 1.9072, "step": 1424 }, { "epoch": 0.12056857602166005, "grad_norm": 1.8515625, "learning_rate": 0.000965633554140525, "loss": 1.4905, "step": 1425 }, { "epoch": 0.12065318554869278, "grad_norm": 1.8203125, "learning_rate": 0.000965585012991688, "loss": 1.483, "step": 1426 }, { "epoch": 0.12073779507572553, "grad_norm": 1.8203125, "learning_rate": 0.0009655364388076857, "loss": 1.7728, "step": 1427 }, { "epoch": 0.12082240460275827, "grad_norm": 2.25, "learning_rate": 0.0009654878315919642, "loss": 1.2446, "step": 1428 }, { "epoch": 0.12090701412979102, "grad_norm": 2.0, "learning_rate": 0.0009654391913479726, "loss": 1.2488, "step": 1429 }, { "epoch": 0.12099162365682375, "grad_norm": 1.875, "learning_rate": 0.000965390518079162, "loss": 1.4446, "step": 1430 }, { "epoch": 0.1210762331838565, "grad_norm": 1.8671875, "learning_rate": 0.0009653418117889861, "loss": 1.3892, "step": 1431 }, { "epoch": 0.12116084271088924, "grad_norm": 1.8125, "learning_rate": 0.0009652930724809007, "loss": 1.4618, "step": 1432 }, { "epoch": 0.12124545223792199, "grad_norm": 2.34375, "learning_rate": 0.0009652443001583641, "loss": 1.324, "step": 1433 }, { "epoch": 0.12133006176495473, "grad_norm": 2.078125, "learning_rate": 0.0009651954948248368, "loss": 1.3312, "step": 1434 }, { "epoch": 0.12141467129198748, "grad_norm": 1.859375, "learning_rate": 0.0009651466564837819, "loss": 1.7812, "step": 1435 }, { "epoch": 0.12149928081902023, "grad_norm": 2.0625, "learning_rate": 0.0009650977851386646, "loss": 1.6304, "step": 1436 }, { "epoch": 0.12158389034605296, "grad_norm": 2.0625, "learning_rate": 0.0009650488807929525, "loss": 1.4517, "step": 1437 }, { "epoch": 0.12166849987308571, "grad_norm": 1.9140625, "learning_rate": 0.0009649999434501156, "loss": 1.6517, "step": 1438 }, { "epoch": 0.12175310940011845, "grad_norm": 1.421875, "learning_rate": 0.0009649509731136263, "loss": 1.1628, "step": 1439 }, { "epoch": 0.1218377189271512, "grad_norm": 1.3046875, "learning_rate": 0.000964901969786959, "loss": 1.2173, "step": 1440 }, { "epoch": 0.12192232845418394, "grad_norm": 1.640625, "learning_rate": 0.0009648529334735908, "loss": 1.3575, "step": 1441 }, { "epoch": 0.12200693798121669, "grad_norm": 1.6875, "learning_rate": 0.0009648038641770012, "loss": 1.3882, "step": 1442 }, { "epoch": 0.12209154750824942, "grad_norm": 2.0625, "learning_rate": 0.0009647547619006715, "loss": 1.3644, "step": 1443 }, { "epoch": 0.12217615703528217, "grad_norm": 2.21875, "learning_rate": 0.0009647056266480862, "loss": 1.7855, "step": 1444 }, { "epoch": 0.12226076656231491, "grad_norm": 1.9765625, "learning_rate": 0.0009646564584227313, "loss": 1.7909, "step": 1445 }, { "epoch": 0.12234537608934766, "grad_norm": 2.046875, "learning_rate": 0.0009646072572280955, "loss": 1.7245, "step": 1446 }, { "epoch": 0.12242998561638041, "grad_norm": 1.6953125, "learning_rate": 0.00096455802306767, "loss": 1.3732, "step": 1447 }, { "epoch": 0.12251459514341315, "grad_norm": 1.640625, "learning_rate": 0.0009645087559449479, "loss": 1.6771, "step": 1448 }, { "epoch": 0.1225992046704459, "grad_norm": 1.8046875, "learning_rate": 0.0009644594558634252, "loss": 1.5598, "step": 1449 }, { "epoch": 0.12268381419747863, "grad_norm": 2.46875, "learning_rate": 0.0009644101228265997, "loss": 1.7255, "step": 1450 }, { "epoch": 0.12276842372451138, "grad_norm": 2.078125, "learning_rate": 0.000964360756837972, "loss": 1.4548, "step": 1451 }, { "epoch": 0.12285303325154412, "grad_norm": 1.7421875, "learning_rate": 0.0009643113579010446, "loss": 1.209, "step": 1452 }, { "epoch": 0.12293764277857687, "grad_norm": 1.78125, "learning_rate": 0.0009642619260193226, "loss": 1.5805, "step": 1453 }, { "epoch": 0.1230222523056096, "grad_norm": 2.5, "learning_rate": 0.0009642124611963137, "loss": 1.4141, "step": 1454 }, { "epoch": 0.12310686183264236, "grad_norm": 1.9765625, "learning_rate": 0.0009641629634355271, "loss": 1.4156, "step": 1455 }, { "epoch": 0.1231914713596751, "grad_norm": 1.7421875, "learning_rate": 0.0009641134327404755, "loss": 1.3704, "step": 1456 }, { "epoch": 0.12327608088670784, "grad_norm": 1.6015625, "learning_rate": 0.0009640638691146727, "loss": 1.4825, "step": 1457 }, { "epoch": 0.12336069041374059, "grad_norm": 1.9296875, "learning_rate": 0.0009640142725616357, "loss": 1.9116, "step": 1458 }, { "epoch": 0.12344529994077333, "grad_norm": 1.546875, "learning_rate": 0.0009639646430848838, "loss": 1.2592, "step": 1459 }, { "epoch": 0.12352990946780608, "grad_norm": 1.6484375, "learning_rate": 0.000963914980687938, "loss": 1.325, "step": 1460 }, { "epoch": 0.12361451899483882, "grad_norm": 1.8515625, "learning_rate": 0.0009638652853743224, "loss": 1.3751, "step": 1461 }, { "epoch": 0.12369912852187157, "grad_norm": 2.390625, "learning_rate": 0.0009638155571475628, "loss": 1.9815, "step": 1462 }, { "epoch": 0.1237837380489043, "grad_norm": 2.375, "learning_rate": 0.0009637657960111878, "loss": 1.9995, "step": 1463 }, { "epoch": 0.12386834757593705, "grad_norm": 1.7421875, "learning_rate": 0.0009637160019687282, "loss": 1.1554, "step": 1464 }, { "epoch": 0.12395295710296979, "grad_norm": 1.7421875, "learning_rate": 0.000963666175023717, "loss": 1.5514, "step": 1465 }, { "epoch": 0.12403756663000254, "grad_norm": 1.7734375, "learning_rate": 0.0009636163151796895, "loss": 1.7037, "step": 1466 }, { "epoch": 0.12412217615703529, "grad_norm": 1.625, "learning_rate": 0.0009635664224401838, "loss": 1.0985, "step": 1467 }, { "epoch": 0.12420678568406802, "grad_norm": 1.3125, "learning_rate": 0.0009635164968087396, "loss": 1.164, "step": 1468 }, { "epoch": 0.12429139521110077, "grad_norm": 1.9921875, "learning_rate": 0.0009634665382888996, "loss": 1.614, "step": 1469 }, { "epoch": 0.12437600473813351, "grad_norm": 2.34375, "learning_rate": 0.0009634165468842084, "loss": 2.0477, "step": 1470 }, { "epoch": 0.12446061426516626, "grad_norm": 1.8125, "learning_rate": 0.0009633665225982133, "loss": 1.7466, "step": 1471 }, { "epoch": 0.124545223792199, "grad_norm": 3.03125, "learning_rate": 0.0009633164654344636, "loss": 2.6692, "step": 1472 }, { "epoch": 0.12462983331923175, "grad_norm": 2.140625, "learning_rate": 0.0009632663753965109, "loss": 1.8574, "step": 1473 }, { "epoch": 0.12471444284626448, "grad_norm": 2.8125, "learning_rate": 0.0009632162524879096, "loss": 1.8344, "step": 1474 }, { "epoch": 0.12479905237329723, "grad_norm": 1.2421875, "learning_rate": 0.000963166096712216, "loss": 1.117, "step": 1475 }, { "epoch": 0.12488366190032997, "grad_norm": 1.78125, "learning_rate": 0.0009631159080729887, "loss": 1.7059, "step": 1476 }, { "epoch": 0.12496827142736272, "grad_norm": 1.546875, "learning_rate": 0.0009630656865737892, "loss": 1.3479, "step": 1477 }, { "epoch": 0.12505288095439546, "grad_norm": 1.375, "learning_rate": 0.0009630154322181803, "loss": 1.1603, "step": 1478 }, { "epoch": 0.12513749048142822, "grad_norm": 2.1875, "learning_rate": 0.0009629651450097283, "loss": 2.1208, "step": 1479 }, { "epoch": 0.12522210000846096, "grad_norm": 1.671875, "learning_rate": 0.0009629148249520012, "loss": 1.7463, "step": 1480 }, { "epoch": 0.1253067095354937, "grad_norm": 2.015625, "learning_rate": 0.0009628644720485691, "loss": 1.5732, "step": 1481 }, { "epoch": 0.12539131906252643, "grad_norm": 1.6328125, "learning_rate": 0.0009628140863030051, "loss": 1.307, "step": 1482 }, { "epoch": 0.1254759285895592, "grad_norm": 5.15625, "learning_rate": 0.000962763667718884, "loss": 1.1641, "step": 1483 }, { "epoch": 0.12556053811659193, "grad_norm": 2.703125, "learning_rate": 0.0009627132162997833, "loss": 2.9223, "step": 1484 }, { "epoch": 0.12564514764362467, "grad_norm": 2.234375, "learning_rate": 0.0009626627320492829, "loss": 1.8814, "step": 1485 }, { "epoch": 0.1257297571706574, "grad_norm": 1.6171875, "learning_rate": 0.0009626122149709646, "loss": 1.1738, "step": 1486 }, { "epoch": 0.12581436669769017, "grad_norm": 2.09375, "learning_rate": 0.000962561665068413, "loss": 1.9127, "step": 1487 }, { "epoch": 0.1258989762247229, "grad_norm": 2.171875, "learning_rate": 0.0009625110823452148, "loss": 1.6481, "step": 1488 }, { "epoch": 0.12598358575175564, "grad_norm": 1.71875, "learning_rate": 0.0009624604668049588, "loss": 1.5491, "step": 1489 }, { "epoch": 0.1260681952787884, "grad_norm": 2.359375, "learning_rate": 0.0009624098184512368, "loss": 1.5094, "step": 1490 }, { "epoch": 0.12615280480582114, "grad_norm": 1.4609375, "learning_rate": 0.0009623591372876422, "loss": 1.1754, "step": 1491 }, { "epoch": 0.12623741433285388, "grad_norm": 1.984375, "learning_rate": 0.0009623084233177711, "loss": 1.7325, "step": 1492 }, { "epoch": 0.1263220238598866, "grad_norm": 3.546875, "learning_rate": 0.0009622576765452218, "loss": 2.2729, "step": 1493 }, { "epoch": 0.12640663338691938, "grad_norm": 1.8828125, "learning_rate": 0.0009622068969735951, "loss": 1.4665, "step": 1494 }, { "epoch": 0.1264912429139521, "grad_norm": 1.578125, "learning_rate": 0.000962156084606494, "loss": 1.0787, "step": 1495 }, { "epoch": 0.12657585244098485, "grad_norm": 2.40625, "learning_rate": 0.0009621052394475241, "loss": 1.4844, "step": 1496 }, { "epoch": 0.1266604619680176, "grad_norm": 2.484375, "learning_rate": 0.0009620543615002925, "loss": 1.7645, "step": 1497 }, { "epoch": 0.12674507149505035, "grad_norm": 1.8984375, "learning_rate": 0.0009620034507684097, "loss": 1.3078, "step": 1498 }, { "epoch": 0.12682968102208309, "grad_norm": 2.203125, "learning_rate": 0.0009619525072554878, "loss": 1.6513, "step": 1499 }, { "epoch": 0.12691429054911582, "grad_norm": 1.6796875, "learning_rate": 0.0009619015309651414, "loss": 1.4951, "step": 1500 }, { "epoch": 0.12699890007614859, "grad_norm": 1.984375, "learning_rate": 0.0009618505219009876, "loss": 1.335, "step": 1501 }, { "epoch": 0.12708350960318132, "grad_norm": 1.7734375, "learning_rate": 0.0009617994800666457, "loss": 1.5282, "step": 1502 }, { "epoch": 0.12716811913021406, "grad_norm": 1.984375, "learning_rate": 0.0009617484054657373, "loss": 1.6293, "step": 1503 }, { "epoch": 0.1272527286572468, "grad_norm": 1.671875, "learning_rate": 0.0009616972981018862, "loss": 1.3002, "step": 1504 }, { "epoch": 0.12733733818427956, "grad_norm": 1.546875, "learning_rate": 0.000961646157978719, "loss": 1.2245, "step": 1505 }, { "epoch": 0.1274219477113123, "grad_norm": 1.7109375, "learning_rate": 0.000961594985099864, "loss": 1.5624, "step": 1506 }, { "epoch": 0.12750655723834503, "grad_norm": 1.8125, "learning_rate": 0.0009615437794689523, "loss": 1.5946, "step": 1507 }, { "epoch": 0.1275911667653778, "grad_norm": 1.7265625, "learning_rate": 0.0009614925410896171, "loss": 1.5274, "step": 1508 }, { "epoch": 0.12767577629241053, "grad_norm": 1.6328125, "learning_rate": 0.0009614412699654941, "loss": 1.3838, "step": 1509 }, { "epoch": 0.12776038581944327, "grad_norm": 3.203125, "learning_rate": 0.000961389966100221, "loss": 1.8053, "step": 1510 }, { "epoch": 0.127844995346476, "grad_norm": 1.671875, "learning_rate": 0.000961338629497438, "loss": 1.2179, "step": 1511 }, { "epoch": 0.12792960487350877, "grad_norm": 2.140625, "learning_rate": 0.0009612872601607878, "loss": 2.118, "step": 1512 }, { "epoch": 0.1280142144005415, "grad_norm": 2.65625, "learning_rate": 0.0009612358580939151, "loss": 2.1665, "step": 1513 }, { "epoch": 0.12809882392757424, "grad_norm": 1.578125, "learning_rate": 0.0009611844233004674, "loss": 1.1916, "step": 1514 }, { "epoch": 0.12818343345460698, "grad_norm": 2.15625, "learning_rate": 0.0009611329557840938, "loss": 1.7597, "step": 1515 }, { "epoch": 0.12826804298163974, "grad_norm": 2.21875, "learning_rate": 0.0009610814555484464, "loss": 2.5467, "step": 1516 }, { "epoch": 0.12835265250867248, "grad_norm": 1.8203125, "learning_rate": 0.0009610299225971793, "loss": 1.7181, "step": 1517 }, { "epoch": 0.1284372620357052, "grad_norm": 1.734375, "learning_rate": 0.000960978356933949, "loss": 1.5758, "step": 1518 }, { "epoch": 0.12852187156273798, "grad_norm": 1.8828125, "learning_rate": 0.0009609267585624141, "loss": 1.3388, "step": 1519 }, { "epoch": 0.1286064810897707, "grad_norm": 1.65625, "learning_rate": 0.000960875127486236, "loss": 1.2266, "step": 1520 }, { "epoch": 0.12869109061680345, "grad_norm": 1.90625, "learning_rate": 0.0009608234637090778, "loss": 1.3682, "step": 1521 }, { "epoch": 0.12877570014383619, "grad_norm": 2.1875, "learning_rate": 0.0009607717672346057, "loss": 1.4821, "step": 1522 }, { "epoch": 0.12886030967086895, "grad_norm": 2.203125, "learning_rate": 0.0009607200380664873, "loss": 1.7901, "step": 1523 }, { "epoch": 0.1289449191979017, "grad_norm": 2.0625, "learning_rate": 0.0009606682762083933, "loss": 1.5262, "step": 1524 }, { "epoch": 0.12902952872493442, "grad_norm": 1.671875, "learning_rate": 0.0009606164816639966, "loss": 1.2778, "step": 1525 }, { "epoch": 0.12911413825196716, "grad_norm": 1.59375, "learning_rate": 0.0009605646544369718, "loss": 1.2163, "step": 1526 }, { "epoch": 0.12919874777899992, "grad_norm": 1.96875, "learning_rate": 0.0009605127945309964, "loss": 1.3738, "step": 1527 }, { "epoch": 0.12928335730603266, "grad_norm": 1.6875, "learning_rate": 0.0009604609019497501, "loss": 1.2378, "step": 1528 }, { "epoch": 0.1293679668330654, "grad_norm": 1.7109375, "learning_rate": 0.0009604089766969148, "loss": 1.4164, "step": 1529 }, { "epoch": 0.12945257636009816, "grad_norm": 1.8359375, "learning_rate": 0.0009603570187761752, "loss": 1.8496, "step": 1530 }, { "epoch": 0.1295371858871309, "grad_norm": 1.921875, "learning_rate": 0.0009603050281912174, "loss": 1.7854, "step": 1531 }, { "epoch": 0.12962179541416363, "grad_norm": 2.5, "learning_rate": 0.0009602530049457308, "loss": 1.4132, "step": 1532 }, { "epoch": 0.12970640494119637, "grad_norm": 1.6953125, "learning_rate": 0.0009602009490434063, "loss": 1.1467, "step": 1533 }, { "epoch": 0.12979101446822913, "grad_norm": 1.8125, "learning_rate": 0.0009601488604879376, "loss": 1.423, "step": 1534 }, { "epoch": 0.12987562399526187, "grad_norm": 1.96875, "learning_rate": 0.0009600967392830207, "loss": 1.4438, "step": 1535 }, { "epoch": 0.1299602335222946, "grad_norm": 3.015625, "learning_rate": 0.0009600445854323536, "loss": 2.2134, "step": 1536 }, { "epoch": 0.13004484304932734, "grad_norm": 1.953125, "learning_rate": 0.0009599923989396371, "loss": 1.6875, "step": 1537 }, { "epoch": 0.1301294525763601, "grad_norm": 1.5078125, "learning_rate": 0.0009599401798085738, "loss": 1.1595, "step": 1538 }, { "epoch": 0.13021406210339284, "grad_norm": 1.796875, "learning_rate": 0.0009598879280428691, "loss": 1.8692, "step": 1539 }, { "epoch": 0.13029867163042558, "grad_norm": 1.953125, "learning_rate": 0.0009598356436462303, "loss": 1.2502, "step": 1540 }, { "epoch": 0.13038328115745834, "grad_norm": 1.6796875, "learning_rate": 0.0009597833266223673, "loss": 1.1216, "step": 1541 }, { "epoch": 0.13046789068449108, "grad_norm": 1.8203125, "learning_rate": 0.000959730976974992, "loss": 1.565, "step": 1542 }, { "epoch": 0.13055250021152381, "grad_norm": 1.75, "learning_rate": 0.0009596785947078191, "loss": 1.701, "step": 1543 }, { "epoch": 0.13063710973855655, "grad_norm": 1.8515625, "learning_rate": 0.0009596261798245651, "loss": 1.611, "step": 1544 }, { "epoch": 0.13072171926558931, "grad_norm": 1.8203125, "learning_rate": 0.0009595737323289491, "loss": 1.7228, "step": 1545 }, { "epoch": 0.13080632879262205, "grad_norm": 1.75, "learning_rate": 0.0009595212522246927, "loss": 1.1685, "step": 1546 }, { "epoch": 0.1308909383196548, "grad_norm": 2.15625, "learning_rate": 0.0009594687395155192, "loss": 1.9043, "step": 1547 }, { "epoch": 0.13097554784668752, "grad_norm": 2.296875, "learning_rate": 0.000959416194205155, "loss": 1.9119, "step": 1548 }, { "epoch": 0.1310601573737203, "grad_norm": 1.5703125, "learning_rate": 0.0009593636162973281, "loss": 1.1613, "step": 1549 }, { "epoch": 0.13114476690075302, "grad_norm": 1.890625, "learning_rate": 0.0009593110057957693, "loss": 1.3034, "step": 1550 }, { "epoch": 0.13122937642778576, "grad_norm": 1.9296875, "learning_rate": 0.0009592583627042114, "loss": 1.7489, "step": 1551 }, { "epoch": 0.13131398595481852, "grad_norm": 1.546875, "learning_rate": 0.0009592056870263896, "loss": 1.3276, "step": 1552 }, { "epoch": 0.13139859548185126, "grad_norm": 1.6796875, "learning_rate": 0.0009591529787660416, "loss": 1.3964, "step": 1553 }, { "epoch": 0.131483205008884, "grad_norm": 1.921875, "learning_rate": 0.0009591002379269074, "loss": 1.9128, "step": 1554 }, { "epoch": 0.13156781453591673, "grad_norm": 1.78125, "learning_rate": 0.0009590474645127287, "loss": 1.1463, "step": 1555 }, { "epoch": 0.1316524240629495, "grad_norm": 2.046875, "learning_rate": 0.0009589946585272504, "loss": 1.4311, "step": 1556 }, { "epoch": 0.13173703358998223, "grad_norm": 2.078125, "learning_rate": 0.0009589418199742192, "loss": 1.566, "step": 1557 }, { "epoch": 0.13182164311701497, "grad_norm": 2.40625, "learning_rate": 0.0009588889488573842, "loss": 2.4008, "step": 1558 }, { "epoch": 0.13190625264404773, "grad_norm": 2.015625, "learning_rate": 0.000958836045180497, "loss": 1.7791, "step": 1559 }, { "epoch": 0.13199086217108047, "grad_norm": 1.796875, "learning_rate": 0.0009587831089473108, "loss": 1.6562, "step": 1560 }, { "epoch": 0.1320754716981132, "grad_norm": 2.28125, "learning_rate": 0.0009587301401615822, "loss": 2.122, "step": 1561 }, { "epoch": 0.13216008122514594, "grad_norm": 2.265625, "learning_rate": 0.0009586771388270692, "loss": 1.5242, "step": 1562 }, { "epoch": 0.1322446907521787, "grad_norm": 2.59375, "learning_rate": 0.0009586241049475328, "loss": 2.136, "step": 1563 }, { "epoch": 0.13232930027921144, "grad_norm": 2.296875, "learning_rate": 0.0009585710385267357, "loss": 1.8985, "step": 1564 }, { "epoch": 0.13241390980624418, "grad_norm": 5.625, "learning_rate": 0.0009585179395684432, "loss": 2.2641, "step": 1565 }, { "epoch": 0.13249851933327692, "grad_norm": 1.703125, "learning_rate": 0.000958464808076423, "loss": 1.0959, "step": 1566 }, { "epoch": 0.13258312886030968, "grad_norm": 2.03125, "learning_rate": 0.0009584116440544449, "loss": 1.2457, "step": 1567 }, { "epoch": 0.13266773838734242, "grad_norm": 2.5625, "learning_rate": 0.0009583584475062812, "loss": 1.6552, "step": 1568 }, { "epoch": 0.13275234791437515, "grad_norm": 2.59375, "learning_rate": 0.0009583052184357064, "loss": 1.7083, "step": 1569 }, { "epoch": 0.13283695744140792, "grad_norm": 60.25, "learning_rate": 0.0009582519568464971, "loss": 1.7787, "step": 1570 }, { "epoch": 0.13292156696844065, "grad_norm": 2.5625, "learning_rate": 0.0009581986627424328, "loss": 1.76, "step": 1571 }, { "epoch": 0.1330061764954734, "grad_norm": 1.78125, "learning_rate": 0.0009581453361272947, "loss": 1.2456, "step": 1572 }, { "epoch": 0.13309078602250612, "grad_norm": 1.96875, "learning_rate": 0.0009580919770048667, "loss": 1.2971, "step": 1573 }, { "epoch": 0.1331753955495389, "grad_norm": 1.6640625, "learning_rate": 0.0009580385853789346, "loss": 1.0194, "step": 1574 }, { "epoch": 0.13326000507657162, "grad_norm": 1.9375, "learning_rate": 0.0009579851612532868, "loss": 1.6361, "step": 1575 }, { "epoch": 0.13334461460360436, "grad_norm": 1.9140625, "learning_rate": 0.0009579317046317142, "loss": 1.405, "step": 1576 }, { "epoch": 0.1334292241306371, "grad_norm": 1.78125, "learning_rate": 0.0009578782155180097, "loss": 1.3932, "step": 1577 }, { "epoch": 0.13351383365766986, "grad_norm": 1.9765625, "learning_rate": 0.0009578246939159682, "loss": 1.9802, "step": 1578 }, { "epoch": 0.1335984431847026, "grad_norm": 1.4921875, "learning_rate": 0.0009577711398293878, "loss": 1.1689, "step": 1579 }, { "epoch": 0.13368305271173533, "grad_norm": 2.09375, "learning_rate": 0.0009577175532620681, "loss": 1.6548, "step": 1580 }, { "epoch": 0.1337676622387681, "grad_norm": 1.703125, "learning_rate": 0.0009576639342178114, "loss": 1.2942, "step": 1581 }, { "epoch": 0.13385227176580083, "grad_norm": 1.8125, "learning_rate": 0.000957610282700422, "loss": 1.3007, "step": 1582 }, { "epoch": 0.13393688129283357, "grad_norm": 1.7578125, "learning_rate": 0.0009575565987137069, "loss": 1.3788, "step": 1583 }, { "epoch": 0.1340214908198663, "grad_norm": 1.4921875, "learning_rate": 0.0009575028822614751, "loss": 1.2827, "step": 1584 }, { "epoch": 0.13410610034689907, "grad_norm": 1.6015625, "learning_rate": 0.0009574491333475382, "loss": 1.4072, "step": 1585 }, { "epoch": 0.1341907098739318, "grad_norm": 2.078125, "learning_rate": 0.0009573953519757095, "loss": 1.4955, "step": 1586 }, { "epoch": 0.13427531940096454, "grad_norm": 2.0625, "learning_rate": 0.0009573415381498052, "loss": 1.5971, "step": 1587 }, { "epoch": 0.13435992892799728, "grad_norm": 1.4765625, "learning_rate": 0.0009572876918736437, "loss": 1.1167, "step": 1588 }, { "epoch": 0.13444453845503004, "grad_norm": 1.90625, "learning_rate": 0.0009572338131510458, "loss": 1.9549, "step": 1589 }, { "epoch": 0.13452914798206278, "grad_norm": 2.609375, "learning_rate": 0.000957179901985834, "loss": 1.6955, "step": 1590 }, { "epoch": 0.13461375750909552, "grad_norm": 1.8671875, "learning_rate": 0.0009571259583818337, "loss": 1.7026, "step": 1591 }, { "epoch": 0.13469836703612828, "grad_norm": 1.921875, "learning_rate": 0.0009570719823428724, "loss": 1.6718, "step": 1592 }, { "epoch": 0.13478297656316102, "grad_norm": 1.6953125, "learning_rate": 0.00095701797387278, "loss": 1.2349, "step": 1593 }, { "epoch": 0.13486758609019375, "grad_norm": 1.9921875, "learning_rate": 0.0009569639329753885, "loss": 1.7962, "step": 1594 }, { "epoch": 0.1349521956172265, "grad_norm": 1.5546875, "learning_rate": 0.0009569098596545325, "loss": 1.2815, "step": 1595 }, { "epoch": 0.13503680514425925, "grad_norm": 1.5, "learning_rate": 0.0009568557539140484, "loss": 1.3097, "step": 1596 }, { "epoch": 0.135121414671292, "grad_norm": 1.6015625, "learning_rate": 0.0009568016157577757, "loss": 1.3788, "step": 1597 }, { "epoch": 0.13520602419832473, "grad_norm": 1.515625, "learning_rate": 0.0009567474451895552, "loss": 1.3699, "step": 1598 }, { "epoch": 0.13529063372535746, "grad_norm": 1.9453125, "learning_rate": 0.0009566932422132309, "loss": 1.6911, "step": 1599 }, { "epoch": 0.13537524325239023, "grad_norm": 1.90625, "learning_rate": 0.0009566390068326487, "loss": 1.5648, "step": 1600 }, { "epoch": 0.13545985277942296, "grad_norm": 2.15625, "learning_rate": 0.0009565847390516567, "loss": 1.5729, "step": 1601 }, { "epoch": 0.1355444623064557, "grad_norm": 2.0, "learning_rate": 0.0009565304388741053, "loss": 1.349, "step": 1602 }, { "epoch": 0.13562907183348846, "grad_norm": 2.5, "learning_rate": 0.0009564761063038476, "loss": 1.7593, "step": 1603 }, { "epoch": 0.1357136813605212, "grad_norm": 2.71875, "learning_rate": 0.0009564217413447385, "loss": 1.7255, "step": 1604 }, { "epoch": 0.13579829088755394, "grad_norm": 2.375, "learning_rate": 0.0009563673440006357, "loss": 1.4864, "step": 1605 }, { "epoch": 0.13588290041458667, "grad_norm": 1.59375, "learning_rate": 0.0009563129142753985, "loss": 1.335, "step": 1606 }, { "epoch": 0.13596750994161944, "grad_norm": 1.6875, "learning_rate": 0.0009562584521728892, "loss": 1.8015, "step": 1607 }, { "epoch": 0.13605211946865217, "grad_norm": 1.34375, "learning_rate": 0.000956203957696972, "loss": 1.3246, "step": 1608 }, { "epoch": 0.1361367289956849, "grad_norm": 1.875, "learning_rate": 0.0009561494308515138, "loss": 1.8571, "step": 1609 }, { "epoch": 0.13622133852271764, "grad_norm": 1.65625, "learning_rate": 0.000956094871640383, "loss": 1.5245, "step": 1610 }, { "epoch": 0.1363059480497504, "grad_norm": 1.546875, "learning_rate": 0.000956040280067451, "loss": 1.1897, "step": 1611 }, { "epoch": 0.13639055757678314, "grad_norm": 1.671875, "learning_rate": 0.0009559856561365915, "loss": 1.3043, "step": 1612 }, { "epoch": 0.13647516710381588, "grad_norm": 2.0625, "learning_rate": 0.0009559309998516801, "loss": 1.5654, "step": 1613 }, { "epoch": 0.13655977663084864, "grad_norm": 1.8203125, "learning_rate": 0.000955876311216595, "loss": 1.6837, "step": 1614 }, { "epoch": 0.13664438615788138, "grad_norm": 1.71875, "learning_rate": 0.0009558215902352163, "loss": 1.8868, "step": 1615 }, { "epoch": 0.13672899568491412, "grad_norm": 2.515625, "learning_rate": 0.000955766836911427, "loss": 1.9476, "step": 1616 }, { "epoch": 0.13681360521194685, "grad_norm": 1.4453125, "learning_rate": 0.0009557120512491121, "loss": 1.2054, "step": 1617 }, { "epoch": 0.13689821473897962, "grad_norm": 2.171875, "learning_rate": 0.0009556572332521586, "loss": 1.9723, "step": 1618 }, { "epoch": 0.13698282426601235, "grad_norm": 1.8046875, "learning_rate": 0.0009556023829244562, "loss": 1.2909, "step": 1619 }, { "epoch": 0.1370674337930451, "grad_norm": 3.921875, "learning_rate": 0.0009555475002698967, "loss": 1.2966, "step": 1620 }, { "epoch": 0.13715204332007785, "grad_norm": 1.8046875, "learning_rate": 0.0009554925852923744, "loss": 1.2749, "step": 1621 }, { "epoch": 0.1372366528471106, "grad_norm": 1.4765625, "learning_rate": 0.0009554376379957855, "loss": 1.2405, "step": 1622 }, { "epoch": 0.13732126237414333, "grad_norm": 1.9296875, "learning_rate": 0.000955382658384029, "loss": 1.5182, "step": 1623 }, { "epoch": 0.13740587190117606, "grad_norm": 1.8125, "learning_rate": 0.0009553276464610058, "loss": 1.3612, "step": 1624 }, { "epoch": 0.13749048142820883, "grad_norm": 1.5078125, "learning_rate": 0.0009552726022306192, "loss": 1.174, "step": 1625 }, { "epoch": 0.13757509095524156, "grad_norm": 1.6875, "learning_rate": 0.0009552175256967748, "loss": 1.3092, "step": 1626 }, { "epoch": 0.1376597004822743, "grad_norm": 2.1875, "learning_rate": 0.0009551624168633806, "loss": 1.7415, "step": 1627 }, { "epoch": 0.13774431000930704, "grad_norm": 2.65625, "learning_rate": 0.0009551072757343467, "loss": 1.7089, "step": 1628 }, { "epoch": 0.1378289195363398, "grad_norm": 1.8515625, "learning_rate": 0.0009550521023135856, "loss": 1.3249, "step": 1629 }, { "epoch": 0.13791352906337254, "grad_norm": 2.53125, "learning_rate": 0.0009549968966050122, "loss": 1.3095, "step": 1630 }, { "epoch": 0.13799813859040527, "grad_norm": 1.9453125, "learning_rate": 0.0009549416586125434, "loss": 1.8808, "step": 1631 }, { "epoch": 0.13808274811743804, "grad_norm": 2.296875, "learning_rate": 0.0009548863883400986, "loss": 1.3694, "step": 1632 }, { "epoch": 0.13816735764447077, "grad_norm": 1.8984375, "learning_rate": 0.0009548310857915996, "loss": 1.3032, "step": 1633 }, { "epoch": 0.1382519671715035, "grad_norm": 1.3671875, "learning_rate": 0.0009547757509709702, "loss": 1.1035, "step": 1634 }, { "epoch": 0.13833657669853625, "grad_norm": 2.09375, "learning_rate": 0.0009547203838821367, "loss": 1.1446, "step": 1635 }, { "epoch": 0.138421186225569, "grad_norm": 1.53125, "learning_rate": 0.0009546649845290276, "loss": 1.1907, "step": 1636 }, { "epoch": 0.13850579575260175, "grad_norm": 1.8984375, "learning_rate": 0.0009546095529155736, "loss": 1.6539, "step": 1637 }, { "epoch": 0.13859040527963448, "grad_norm": 1.390625, "learning_rate": 0.000954554089045708, "loss": 1.0665, "step": 1638 }, { "epoch": 0.13867501480666722, "grad_norm": 1.96875, "learning_rate": 0.000954498592923366, "loss": 1.5894, "step": 1639 }, { "epoch": 0.13875962433369998, "grad_norm": 1.984375, "learning_rate": 0.0009544430645524856, "loss": 1.6517, "step": 1640 }, { "epoch": 0.13884423386073272, "grad_norm": 1.78125, "learning_rate": 0.0009543875039370065, "loss": 1.572, "step": 1641 }, { "epoch": 0.13892884338776545, "grad_norm": 2.296875, "learning_rate": 0.0009543319110808708, "loss": 2.5625, "step": 1642 }, { "epoch": 0.13901345291479822, "grad_norm": 1.65625, "learning_rate": 0.0009542762859880232, "loss": 1.1857, "step": 1643 }, { "epoch": 0.13909806244183096, "grad_norm": 1.71875, "learning_rate": 0.0009542206286624106, "loss": 1.14, "step": 1644 }, { "epoch": 0.1391826719688637, "grad_norm": 1.3984375, "learning_rate": 0.0009541649391079823, "loss": 1.2491, "step": 1645 }, { "epoch": 0.13926728149589643, "grad_norm": 2.046875, "learning_rate": 0.0009541092173286894, "loss": 1.6809, "step": 1646 }, { "epoch": 0.1393518910229292, "grad_norm": 1.703125, "learning_rate": 0.0009540534633284854, "loss": 1.5627, "step": 1647 }, { "epoch": 0.13943650054996193, "grad_norm": 1.5625, "learning_rate": 0.0009539976771113268, "loss": 1.2508, "step": 1648 }, { "epoch": 0.13952111007699466, "grad_norm": 1.6796875, "learning_rate": 0.0009539418586811716, "loss": 1.5665, "step": 1649 }, { "epoch": 0.1396057196040274, "grad_norm": 2.109375, "learning_rate": 0.0009538860080419802, "loss": 1.8145, "step": 1650 }, { "epoch": 0.13969032913106016, "grad_norm": 2.109375, "learning_rate": 0.0009538301251977157, "loss": 2.0336, "step": 1651 }, { "epoch": 0.1397749386580929, "grad_norm": 1.5, "learning_rate": 0.0009537742101523431, "loss": 1.2069, "step": 1652 }, { "epoch": 0.13985954818512564, "grad_norm": 2.125, "learning_rate": 0.0009537182629098297, "loss": 1.66, "step": 1653 }, { "epoch": 0.1399441577121584, "grad_norm": 1.734375, "learning_rate": 0.0009536622834741454, "loss": 1.6608, "step": 1654 }, { "epoch": 0.14002876723919114, "grad_norm": 1.640625, "learning_rate": 0.0009536062718492619, "loss": 1.3784, "step": 1655 }, { "epoch": 0.14011337676622387, "grad_norm": 1.7109375, "learning_rate": 0.0009535502280391538, "loss": 1.2864, "step": 1656 }, { "epoch": 0.1401979862932566, "grad_norm": 2.046875, "learning_rate": 0.0009534941520477974, "loss": 1.3351, "step": 1657 }, { "epoch": 0.14028259582028937, "grad_norm": 1.671875, "learning_rate": 0.0009534380438791716, "loss": 1.1172, "step": 1658 }, { "epoch": 0.1403672053473221, "grad_norm": 1.484375, "learning_rate": 0.0009533819035372573, "loss": 1.3158, "step": 1659 }, { "epoch": 0.14045181487435485, "grad_norm": 1.3984375, "learning_rate": 0.0009533257310260382, "loss": 1.3607, "step": 1660 }, { "epoch": 0.14053642440138758, "grad_norm": 1.4296875, "learning_rate": 0.0009532695263494998, "loss": 1.337, "step": 1661 }, { "epoch": 0.14062103392842035, "grad_norm": 3.625, "learning_rate": 0.0009532132895116301, "loss": 1.711, "step": 1662 }, { "epoch": 0.14070564345545308, "grad_norm": 1.546875, "learning_rate": 0.0009531570205164192, "loss": 1.1428, "step": 1663 }, { "epoch": 0.14079025298248582, "grad_norm": 1.9921875, "learning_rate": 0.0009531007193678599, "loss": 1.7447, "step": 1664 }, { "epoch": 0.14087486250951858, "grad_norm": 1.671875, "learning_rate": 0.0009530443860699468, "loss": 1.5005, "step": 1665 }, { "epoch": 0.14095947203655132, "grad_norm": 2.578125, "learning_rate": 0.000952988020626677, "loss": 1.8935, "step": 1666 }, { "epoch": 0.14104408156358406, "grad_norm": 1.7265625, "learning_rate": 0.0009529316230420499, "loss": 1.5742, "step": 1667 }, { "epoch": 0.1411286910906168, "grad_norm": 2.421875, "learning_rate": 0.000952875193320067, "loss": 1.5488, "step": 1668 }, { "epoch": 0.14121330061764956, "grad_norm": 3.34375, "learning_rate": 0.0009528187314647324, "loss": 1.9287, "step": 1669 }, { "epoch": 0.1412979101446823, "grad_norm": 1.8984375, "learning_rate": 0.0009527622374800522, "loss": 1.3773, "step": 1670 }, { "epoch": 0.14138251967171503, "grad_norm": 2.265625, "learning_rate": 0.0009527057113700349, "loss": 1.9271, "step": 1671 }, { "epoch": 0.14146712919874777, "grad_norm": 1.7890625, "learning_rate": 0.0009526491531386914, "loss": 1.8381, "step": 1672 }, { "epoch": 0.14155173872578053, "grad_norm": 1.6953125, "learning_rate": 0.0009525925627900345, "loss": 1.3332, "step": 1673 }, { "epoch": 0.14163634825281327, "grad_norm": 2.40625, "learning_rate": 0.0009525359403280795, "loss": 1.4558, "step": 1674 }, { "epoch": 0.141720957779846, "grad_norm": 2.140625, "learning_rate": 0.0009524792857568443, "loss": 1.738, "step": 1675 }, { "epoch": 0.14180556730687877, "grad_norm": 2.0625, "learning_rate": 0.0009524225990803486, "loss": 1.4631, "step": 1676 }, { "epoch": 0.1418901768339115, "grad_norm": 1.515625, "learning_rate": 0.0009523658803026145, "loss": 1.3987, "step": 1677 }, { "epoch": 0.14197478636094424, "grad_norm": 2.84375, "learning_rate": 0.0009523091294276664, "loss": 1.3746, "step": 1678 }, { "epoch": 0.14205939588797697, "grad_norm": 2.359375, "learning_rate": 0.0009522523464595312, "loss": 1.5523, "step": 1679 }, { "epoch": 0.14214400541500974, "grad_norm": 2.109375, "learning_rate": 0.0009521955314022377, "loss": 1.9243, "step": 1680 }, { "epoch": 0.14222861494204248, "grad_norm": 1.6015625, "learning_rate": 0.0009521386842598172, "loss": 1.4206, "step": 1681 }, { "epoch": 0.1423132244690752, "grad_norm": 1.7578125, "learning_rate": 0.0009520818050363034, "loss": 1.2447, "step": 1682 }, { "epoch": 0.14239783399610798, "grad_norm": 1.8515625, "learning_rate": 0.0009520248937357319, "loss": 1.6907, "step": 1683 }, { "epoch": 0.1424824435231407, "grad_norm": 1.8515625, "learning_rate": 0.0009519679503621408, "loss": 1.4386, "step": 1684 }, { "epoch": 0.14256705305017345, "grad_norm": 2.1875, "learning_rate": 0.0009519109749195707, "loss": 1.481, "step": 1685 }, { "epoch": 0.14265166257720618, "grad_norm": 1.9375, "learning_rate": 0.0009518539674120639, "loss": 1.5913, "step": 1686 }, { "epoch": 0.14273627210423895, "grad_norm": 1.9375, "learning_rate": 0.0009517969278436656, "loss": 2.0174, "step": 1687 }, { "epoch": 0.14282088163127168, "grad_norm": 1.5234375, "learning_rate": 0.0009517398562184228, "loss": 1.2917, "step": 1688 }, { "epoch": 0.14290549115830442, "grad_norm": 2.203125, "learning_rate": 0.0009516827525403852, "loss": 1.8857, "step": 1689 }, { "epoch": 0.14299010068533716, "grad_norm": 1.515625, "learning_rate": 0.0009516256168136043, "loss": 1.0799, "step": 1690 }, { "epoch": 0.14307471021236992, "grad_norm": 2.15625, "learning_rate": 0.0009515684490421342, "loss": 1.7626, "step": 1691 }, { "epoch": 0.14315931973940266, "grad_norm": 1.7734375, "learning_rate": 0.0009515112492300313, "loss": 1.9574, "step": 1692 }, { "epoch": 0.1432439292664354, "grad_norm": 1.8125, "learning_rate": 0.000951454017381354, "loss": 1.2316, "step": 1693 }, { "epoch": 0.14332853879346816, "grad_norm": 1.8515625, "learning_rate": 0.0009513967535001631, "loss": 1.6454, "step": 1694 }, { "epoch": 0.1434131483205009, "grad_norm": 1.75, "learning_rate": 0.0009513394575905217, "loss": 1.6778, "step": 1695 }, { "epoch": 0.14349775784753363, "grad_norm": 1.515625, "learning_rate": 0.0009512821296564955, "loss": 1.2004, "step": 1696 }, { "epoch": 0.14358236737456637, "grad_norm": 1.375, "learning_rate": 0.0009512247697021519, "loss": 1.0097, "step": 1697 }, { "epoch": 0.14366697690159913, "grad_norm": 1.9296875, "learning_rate": 0.0009511673777315608, "loss": 1.8296, "step": 1698 }, { "epoch": 0.14375158642863187, "grad_norm": 1.40625, "learning_rate": 0.0009511099537487944, "loss": 1.3419, "step": 1699 }, { "epoch": 0.1438361959556646, "grad_norm": 1.515625, "learning_rate": 0.0009510524977579273, "loss": 1.1729, "step": 1700 }, { "epoch": 0.14392080548269734, "grad_norm": 1.5390625, "learning_rate": 0.0009509950097630361, "loss": 1.3179, "step": 1701 }, { "epoch": 0.1440054150097301, "grad_norm": 2.109375, "learning_rate": 0.0009509374897682, "loss": 1.7018, "step": 1702 }, { "epoch": 0.14409002453676284, "grad_norm": 1.828125, "learning_rate": 0.0009508799377774999, "loss": 1.2689, "step": 1703 }, { "epoch": 0.14417463406379558, "grad_norm": 1.421875, "learning_rate": 0.0009508223537950198, "loss": 1.2131, "step": 1704 }, { "epoch": 0.14425924359082834, "grad_norm": 1.5703125, "learning_rate": 0.0009507647378248452, "loss": 1.2194, "step": 1705 }, { "epoch": 0.14434385311786108, "grad_norm": 1.625, "learning_rate": 0.0009507070898710642, "loss": 1.3081, "step": 1706 }, { "epoch": 0.1444284626448938, "grad_norm": 1.9453125, "learning_rate": 0.0009506494099377674, "loss": 1.4638, "step": 1707 }, { "epoch": 0.14451307217192655, "grad_norm": 2.046875, "learning_rate": 0.0009505916980290473, "loss": 2.0841, "step": 1708 }, { "epoch": 0.1445976816989593, "grad_norm": 3.015625, "learning_rate": 0.0009505339541489988, "loss": 2.1747, "step": 1709 }, { "epoch": 0.14468229122599205, "grad_norm": 6.0625, "learning_rate": 0.000950476178301719, "loss": 2.0709, "step": 1710 }, { "epoch": 0.14476690075302479, "grad_norm": 1.984375, "learning_rate": 0.0009504183704913074, "loss": 1.538, "step": 1711 }, { "epoch": 0.14485151028005752, "grad_norm": 1.71875, "learning_rate": 0.0009503605307218657, "loss": 1.3979, "step": 1712 }, { "epoch": 0.14493611980709029, "grad_norm": 2.078125, "learning_rate": 0.0009503026589974977, "loss": 2.109, "step": 1713 }, { "epoch": 0.14502072933412302, "grad_norm": 1.6640625, "learning_rate": 0.00095024475532231, "loss": 1.3838, "step": 1714 }, { "epoch": 0.14510533886115576, "grad_norm": 2.140625, "learning_rate": 0.0009501868197004106, "loss": 1.769, "step": 1715 }, { "epoch": 0.14518994838818852, "grad_norm": 2.15625, "learning_rate": 0.0009501288521359108, "loss": 2.2404, "step": 1716 }, { "epoch": 0.14527455791522126, "grad_norm": 1.6484375, "learning_rate": 0.0009500708526329234, "loss": 1.6782, "step": 1717 }, { "epoch": 0.145359167442254, "grad_norm": 1.75, "learning_rate": 0.0009500128211955637, "loss": 1.6564, "step": 1718 }, { "epoch": 0.14544377696928673, "grad_norm": 1.4921875, "learning_rate": 0.0009499547578279493, "loss": 1.2327, "step": 1719 }, { "epoch": 0.1455283864963195, "grad_norm": 2.71875, "learning_rate": 0.0009498966625341998, "loss": 1.0971, "step": 1720 }, { "epoch": 0.14561299602335223, "grad_norm": 1.6953125, "learning_rate": 0.0009498385353184375, "loss": 1.5503, "step": 1721 }, { "epoch": 0.14569760555038497, "grad_norm": 2.328125, "learning_rate": 0.000949780376184787, "loss": 1.5708, "step": 1722 }, { "epoch": 0.1457822150774177, "grad_norm": 2.015625, "learning_rate": 0.0009497221851373745, "loss": 1.6788, "step": 1723 }, { "epoch": 0.14586682460445047, "grad_norm": 1.6796875, "learning_rate": 0.0009496639621803292, "loss": 1.2563, "step": 1724 }, { "epoch": 0.1459514341314832, "grad_norm": 1.5703125, "learning_rate": 0.0009496057073177822, "loss": 1.1107, "step": 1725 }, { "epoch": 0.14603604365851594, "grad_norm": 1.9921875, "learning_rate": 0.0009495474205538668, "loss": 1.5067, "step": 1726 }, { "epoch": 0.1461206531855487, "grad_norm": 1.984375, "learning_rate": 0.0009494891018927186, "loss": 1.4078, "step": 1727 }, { "epoch": 0.14620526271258144, "grad_norm": 3.328125, "learning_rate": 0.0009494307513384759, "loss": 1.2332, "step": 1728 }, { "epoch": 0.14628987223961418, "grad_norm": 1.8046875, "learning_rate": 0.0009493723688952787, "loss": 1.1935, "step": 1729 }, { "epoch": 0.1463744817666469, "grad_norm": 1.796875, "learning_rate": 0.0009493139545672693, "loss": 1.6738, "step": 1730 }, { "epoch": 0.14645909129367968, "grad_norm": 1.484375, "learning_rate": 0.0009492555083585927, "loss": 1.7654, "step": 1731 }, { "epoch": 0.1465437008207124, "grad_norm": 2.15625, "learning_rate": 0.000949197030273396, "loss": 2.1815, "step": 1732 }, { "epoch": 0.14662831034774515, "grad_norm": 1.984375, "learning_rate": 0.000949138520315828, "loss": 1.4195, "step": 1733 }, { "epoch": 0.1467129198747779, "grad_norm": 1.65625, "learning_rate": 0.0009490799784900408, "loss": 1.4145, "step": 1734 }, { "epoch": 0.14679752940181065, "grad_norm": 1.390625, "learning_rate": 0.0009490214048001877, "loss": 1.3291, "step": 1735 }, { "epoch": 0.1468821389288434, "grad_norm": 1.7578125, "learning_rate": 0.0009489627992504249, "loss": 1.8131, "step": 1736 }, { "epoch": 0.14696674845587612, "grad_norm": 1.7734375, "learning_rate": 0.0009489041618449108, "loss": 1.4758, "step": 1737 }, { "epoch": 0.1470513579829089, "grad_norm": 1.6328125, "learning_rate": 0.0009488454925878059, "loss": 1.1145, "step": 1738 }, { "epoch": 0.14713596750994162, "grad_norm": 1.8359375, "learning_rate": 0.0009487867914832729, "loss": 1.4255, "step": 1739 }, { "epoch": 0.14722057703697436, "grad_norm": 1.734375, "learning_rate": 0.0009487280585354772, "loss": 1.2368, "step": 1740 }, { "epoch": 0.1473051865640071, "grad_norm": 25.375, "learning_rate": 0.0009486692937485858, "loss": 1.6871, "step": 1741 }, { "epoch": 0.14738979609103986, "grad_norm": 1.3515625, "learning_rate": 0.0009486104971267684, "loss": 1.1487, "step": 1742 }, { "epoch": 0.1474744056180726, "grad_norm": 2.421875, "learning_rate": 0.000948551668674197, "loss": 1.804, "step": 1743 }, { "epoch": 0.14755901514510533, "grad_norm": 1.8046875, "learning_rate": 0.0009484928083950457, "loss": 1.5622, "step": 1744 }, { "epoch": 0.1476436246721381, "grad_norm": 1.78125, "learning_rate": 0.0009484339162934908, "loss": 1.5271, "step": 1745 }, { "epoch": 0.14772823419917083, "grad_norm": 1.8515625, "learning_rate": 0.0009483749923737109, "loss": 1.5016, "step": 1746 }, { "epoch": 0.14781284372620357, "grad_norm": 1.7109375, "learning_rate": 0.0009483160366398868, "loss": 1.2014, "step": 1747 }, { "epoch": 0.1478974532532363, "grad_norm": 1.7265625, "learning_rate": 0.0009482570490962019, "loss": 1.4114, "step": 1748 }, { "epoch": 0.14798206278026907, "grad_norm": 1.59375, "learning_rate": 0.0009481980297468415, "loss": 1.3073, "step": 1749 }, { "epoch": 0.1480666723073018, "grad_norm": 1.5703125, "learning_rate": 0.0009481389785959933, "loss": 1.4749, "step": 1750 }, { "epoch": 0.14815128183433454, "grad_norm": 1.6171875, "learning_rate": 0.0009480798956478471, "loss": 1.3195, "step": 1751 }, { "epoch": 0.14823589136136728, "grad_norm": 2.28125, "learning_rate": 0.0009480207809065953, "loss": 1.7721, "step": 1752 }, { "epoch": 0.14832050088840004, "grad_norm": 1.2265625, "learning_rate": 0.000947961634376432, "loss": 1.0431, "step": 1753 }, { "epoch": 0.14840511041543278, "grad_norm": 1.4375, "learning_rate": 0.0009479024560615541, "loss": 1.2831, "step": 1754 }, { "epoch": 0.14848971994246551, "grad_norm": 1.8984375, "learning_rate": 0.0009478432459661607, "loss": 1.6614, "step": 1755 }, { "epoch": 0.14857432946949828, "grad_norm": 1.65625, "learning_rate": 0.0009477840040944528, "loss": 1.2942, "step": 1756 }, { "epoch": 0.14865893899653101, "grad_norm": 1.6015625, "learning_rate": 0.0009477247304506338, "loss": 1.3369, "step": 1757 }, { "epoch": 0.14874354852356375, "grad_norm": 2.140625, "learning_rate": 0.0009476654250389094, "loss": 1.2856, "step": 1758 }, { "epoch": 0.1488281580505965, "grad_norm": 2.515625, "learning_rate": 0.0009476060878634878, "loss": 1.6847, "step": 1759 }, { "epoch": 0.14891276757762925, "grad_norm": 1.4921875, "learning_rate": 0.0009475467189285791, "loss": 1.2582, "step": 1760 }, { "epoch": 0.148997377104662, "grad_norm": 1.59375, "learning_rate": 0.0009474873182383956, "loss": 1.2678, "step": 1761 }, { "epoch": 0.14908198663169472, "grad_norm": 1.5546875, "learning_rate": 0.0009474278857971521, "loss": 1.1557, "step": 1762 }, { "epoch": 0.14916659615872746, "grad_norm": 1.9375, "learning_rate": 0.0009473684216090659, "loss": 1.4825, "step": 1763 }, { "epoch": 0.14925120568576022, "grad_norm": 1.546875, "learning_rate": 0.0009473089256783557, "loss": 1.1629, "step": 1764 }, { "epoch": 0.14933581521279296, "grad_norm": 1.828125, "learning_rate": 0.0009472493980092433, "loss": 1.6079, "step": 1765 }, { "epoch": 0.1494204247398257, "grad_norm": 1.8125, "learning_rate": 0.0009471898386059526, "loss": 1.6401, "step": 1766 }, { "epoch": 0.14950503426685846, "grad_norm": 1.5703125, "learning_rate": 0.000947130247472709, "loss": 1.7783, "step": 1767 }, { "epoch": 0.1495896437938912, "grad_norm": 1.7109375, "learning_rate": 0.0009470706246137413, "loss": 1.4931, "step": 1768 }, { "epoch": 0.14967425332092393, "grad_norm": 1.5078125, "learning_rate": 0.0009470109700332797, "loss": 1.5906, "step": 1769 }, { "epoch": 0.14975886284795667, "grad_norm": 1.7734375, "learning_rate": 0.0009469512837355572, "loss": 1.448, "step": 1770 }, { "epoch": 0.14984347237498943, "grad_norm": 1.5390625, "learning_rate": 0.0009468915657248083, "loss": 1.877, "step": 1771 }, { "epoch": 0.14992808190202217, "grad_norm": 1.4375, "learning_rate": 0.0009468318160052707, "loss": 1.1507, "step": 1772 }, { "epoch": 0.1500126914290549, "grad_norm": 1.7421875, "learning_rate": 0.0009467720345811837, "loss": 1.3734, "step": 1773 }, { "epoch": 0.15009730095608764, "grad_norm": 1.8359375, "learning_rate": 0.0009467122214567892, "loss": 1.4645, "step": 1774 }, { "epoch": 0.1501819104831204, "grad_norm": 1.609375, "learning_rate": 0.0009466523766363308, "loss": 1.6265, "step": 1775 }, { "epoch": 0.15026652001015314, "grad_norm": 1.3203125, "learning_rate": 0.0009465925001240551, "loss": 1.0501, "step": 1776 }, { "epoch": 0.15035112953718588, "grad_norm": 1.5625, "learning_rate": 0.0009465325919242107, "loss": 1.1689, "step": 1777 }, { "epoch": 0.15043573906421864, "grad_norm": 1.515625, "learning_rate": 0.0009464726520410479, "loss": 1.404, "step": 1778 }, { "epoch": 0.15052034859125138, "grad_norm": 1.546875, "learning_rate": 0.0009464126804788198, "loss": 1.0053, "step": 1779 }, { "epoch": 0.15060495811828412, "grad_norm": 1.8046875, "learning_rate": 0.0009463526772417819, "loss": 1.7042, "step": 1780 }, { "epoch": 0.15068956764531685, "grad_norm": 1.484375, "learning_rate": 0.0009462926423341913, "loss": 1.2346, "step": 1781 }, { "epoch": 0.15077417717234962, "grad_norm": 1.4765625, "learning_rate": 0.0009462325757603081, "loss": 1.1178, "step": 1782 }, { "epoch": 0.15085878669938235, "grad_norm": 1.828125, "learning_rate": 0.0009461724775243938, "loss": 1.5126, "step": 1783 }, { "epoch": 0.1509433962264151, "grad_norm": 2.796875, "learning_rate": 0.0009461123476307132, "loss": 1.6105, "step": 1784 }, { "epoch": 0.15102800575344782, "grad_norm": 1.703125, "learning_rate": 0.0009460521860835322, "loss": 1.4037, "step": 1785 }, { "epoch": 0.1511126152804806, "grad_norm": 2.25, "learning_rate": 0.00094599199288712, "loss": 1.8476, "step": 1786 }, { "epoch": 0.15119722480751333, "grad_norm": 1.8828125, "learning_rate": 0.0009459317680457472, "loss": 1.6835, "step": 1787 }, { "epoch": 0.15128183433454606, "grad_norm": 1.65625, "learning_rate": 0.0009458715115636871, "loss": 1.0112, "step": 1788 }, { "epoch": 0.15136644386157883, "grad_norm": 1.890625, "learning_rate": 0.0009458112234452153, "loss": 1.4347, "step": 1789 }, { "epoch": 0.15145105338861156, "grad_norm": 2.0625, "learning_rate": 0.0009457509036946092, "loss": 1.6033, "step": 1790 }, { "epoch": 0.1515356629156443, "grad_norm": 2.109375, "learning_rate": 0.000945690552316149, "loss": 2.2909, "step": 1791 }, { "epoch": 0.15162027244267703, "grad_norm": 1.7890625, "learning_rate": 0.0009456301693141167, "loss": 1.2648, "step": 1792 }, { "epoch": 0.1517048819697098, "grad_norm": 1.546875, "learning_rate": 0.0009455697546927968, "loss": 1.6219, "step": 1793 }, { "epoch": 0.15178949149674253, "grad_norm": 2.125, "learning_rate": 0.0009455093084564759, "loss": 1.9848, "step": 1794 }, { "epoch": 0.15187410102377527, "grad_norm": 1.8984375, "learning_rate": 0.0009454488306094431, "loss": 2.2436, "step": 1795 }, { "epoch": 0.151958710550808, "grad_norm": 1.7421875, "learning_rate": 0.0009453883211559893, "loss": 1.5176, "step": 1796 }, { "epoch": 0.15204332007784077, "grad_norm": 1.8125, "learning_rate": 0.0009453277801004081, "loss": 1.2729, "step": 1797 }, { "epoch": 0.1521279296048735, "grad_norm": 2.328125, "learning_rate": 0.000945267207446995, "loss": 1.7422, "step": 1798 }, { "epoch": 0.15221253913190624, "grad_norm": 1.8984375, "learning_rate": 0.0009452066032000479, "loss": 1.463, "step": 1799 }, { "epoch": 0.152297148658939, "grad_norm": 1.4140625, "learning_rate": 0.000945145967363867, "loss": 1.1533, "step": 1800 }, { "epoch": 0.15238175818597174, "grad_norm": 1.84375, "learning_rate": 0.0009450852999427544, "loss": 1.1459, "step": 1801 }, { "epoch": 0.15246636771300448, "grad_norm": 1.7421875, "learning_rate": 0.0009450246009410151, "loss": 1.5402, "step": 1802 }, { "epoch": 0.15255097724003722, "grad_norm": 1.9375, "learning_rate": 0.0009449638703629557, "loss": 1.6568, "step": 1803 }, { "epoch": 0.15263558676706998, "grad_norm": 1.828125, "learning_rate": 0.0009449031082128854, "loss": 1.4453, "step": 1804 }, { "epoch": 0.15272019629410272, "grad_norm": 2.109375, "learning_rate": 0.0009448423144951153, "loss": 1.4113, "step": 1805 }, { "epoch": 0.15280480582113545, "grad_norm": 1.7734375, "learning_rate": 0.0009447814892139593, "loss": 1.3675, "step": 1806 }, { "epoch": 0.15288941534816822, "grad_norm": 2.953125, "learning_rate": 0.0009447206323737327, "loss": 1.1828, "step": 1807 }, { "epoch": 0.15297402487520095, "grad_norm": 1.5703125, "learning_rate": 0.0009446597439787542, "loss": 1.2599, "step": 1808 }, { "epoch": 0.1530586344022337, "grad_norm": 2.140625, "learning_rate": 0.0009445988240333437, "loss": 1.6488, "step": 1809 }, { "epoch": 0.15314324392926643, "grad_norm": 1.6328125, "learning_rate": 0.0009445378725418236, "loss": 1.5707, "step": 1810 }, { "epoch": 0.1532278534562992, "grad_norm": 2.203125, "learning_rate": 0.000944476889508519, "loss": 1.8129, "step": 1811 }, { "epoch": 0.15331246298333193, "grad_norm": 1.609375, "learning_rate": 0.0009444158749377565, "loss": 1.3808, "step": 1812 }, { "epoch": 0.15339707251036466, "grad_norm": 1.6328125, "learning_rate": 0.0009443548288338657, "loss": 1.2372, "step": 1813 }, { "epoch": 0.1534816820373974, "grad_norm": 5.875, "learning_rate": 0.0009442937512011779, "loss": 1.1614, "step": 1814 }, { "epoch": 0.15356629156443016, "grad_norm": 1.53125, "learning_rate": 0.0009442326420440268, "loss": 1.2138, "step": 1815 }, { "epoch": 0.1536509010914629, "grad_norm": 2.1875, "learning_rate": 0.0009441715013667484, "loss": 1.5862, "step": 1816 }, { "epoch": 0.15373551061849564, "grad_norm": 2.203125, "learning_rate": 0.0009441103291736809, "loss": 1.1423, "step": 1817 }, { "epoch": 0.1538201201455284, "grad_norm": 1.859375, "learning_rate": 0.0009440491254691647, "loss": 1.1956, "step": 1818 }, { "epoch": 0.15390472967256114, "grad_norm": 1.8671875, "learning_rate": 0.0009439878902575422, "loss": 1.6562, "step": 1819 }, { "epoch": 0.15398933919959387, "grad_norm": 2.65625, "learning_rate": 0.0009439266235431588, "loss": 1.8061, "step": 1820 }, { "epoch": 0.1540739487266266, "grad_norm": 2.015625, "learning_rate": 0.0009438653253303613, "loss": 1.232, "step": 1821 }, { "epoch": 0.15415855825365937, "grad_norm": 3.140625, "learning_rate": 0.0009438039956234989, "loss": 2.5967, "step": 1822 }, { "epoch": 0.1542431677806921, "grad_norm": 1.578125, "learning_rate": 0.0009437426344269237, "loss": 1.1262, "step": 1823 }, { "epoch": 0.15432777730772484, "grad_norm": 2.328125, "learning_rate": 0.0009436812417449891, "loss": 1.4386, "step": 1824 }, { "epoch": 0.15441238683475758, "grad_norm": 2.484375, "learning_rate": 0.0009436198175820511, "loss": 1.4278, "step": 1825 }, { "epoch": 0.15449699636179035, "grad_norm": 1.6875, "learning_rate": 0.0009435583619424683, "loss": 1.1076, "step": 1826 }, { "epoch": 0.15458160588882308, "grad_norm": 2.078125, "learning_rate": 0.0009434968748306013, "loss": 1.9931, "step": 1827 }, { "epoch": 0.15466621541585582, "grad_norm": 1.828125, "learning_rate": 0.0009434353562508126, "loss": 1.5573, "step": 1828 }, { "epoch": 0.15475082494288858, "grad_norm": 2.015625, "learning_rate": 0.0009433738062074672, "loss": 1.5508, "step": 1829 }, { "epoch": 0.15483543446992132, "grad_norm": 1.5078125, "learning_rate": 0.0009433122247049324, "loss": 1.1859, "step": 1830 }, { "epoch": 0.15492004399695405, "grad_norm": 2.0, "learning_rate": 0.0009432506117475777, "loss": 1.5653, "step": 1831 }, { "epoch": 0.1550046535239868, "grad_norm": 1.875, "learning_rate": 0.0009431889673397747, "loss": 1.1332, "step": 1832 }, { "epoch": 0.15508926305101955, "grad_norm": 1.8828125, "learning_rate": 0.0009431272914858974, "loss": 1.6936, "step": 1833 }, { "epoch": 0.1551738725780523, "grad_norm": 1.5546875, "learning_rate": 0.000943065584190322, "loss": 1.3503, "step": 1834 }, { "epoch": 0.15525848210508503, "grad_norm": 1.6796875, "learning_rate": 0.0009430038454574267, "loss": 1.1929, "step": 1835 }, { "epoch": 0.15534309163211776, "grad_norm": 1.625, "learning_rate": 0.0009429420752915923, "loss": 1.2787, "step": 1836 }, { "epoch": 0.15542770115915053, "grad_norm": 2.09375, "learning_rate": 0.0009428802736972015, "loss": 1.5332, "step": 1837 }, { "epoch": 0.15551231068618326, "grad_norm": 2.203125, "learning_rate": 0.0009428184406786395, "loss": 1.7973, "step": 1838 }, { "epoch": 0.155596920213216, "grad_norm": 1.7265625, "learning_rate": 0.0009427565762402937, "loss": 1.2278, "step": 1839 }, { "epoch": 0.15568152974024876, "grad_norm": 1.8984375, "learning_rate": 0.0009426946803865533, "loss": 1.5101, "step": 1840 }, { "epoch": 0.1557661392672815, "grad_norm": 2.28125, "learning_rate": 0.0009426327531218104, "loss": 1.8662, "step": 1841 }, { "epoch": 0.15585074879431424, "grad_norm": 1.3828125, "learning_rate": 0.0009425707944504588, "loss": 1.0973, "step": 1842 }, { "epoch": 0.15593535832134697, "grad_norm": 1.3984375, "learning_rate": 0.0009425088043768948, "loss": 0.9126, "step": 1843 }, { "epoch": 0.15601996784837974, "grad_norm": 1.9140625, "learning_rate": 0.0009424467829055168, "loss": 1.6611, "step": 1844 }, { "epoch": 0.15610457737541247, "grad_norm": 2.1875, "learning_rate": 0.0009423847300407255, "loss": 1.6741, "step": 1845 }, { "epoch": 0.1561891869024452, "grad_norm": 2.078125, "learning_rate": 0.0009423226457869238, "loss": 1.2816, "step": 1846 }, { "epoch": 0.15627379642947795, "grad_norm": 2.421875, "learning_rate": 0.0009422605301485169, "loss": 2.3079, "step": 1847 }, { "epoch": 0.1563584059565107, "grad_norm": 1.78125, "learning_rate": 0.000942198383129912, "loss": 1.2403, "step": 1848 }, { "epoch": 0.15644301548354345, "grad_norm": 1.5703125, "learning_rate": 0.0009421362047355189, "loss": 1.3749, "step": 1849 }, { "epoch": 0.15652762501057618, "grad_norm": 2.09375, "learning_rate": 0.0009420739949697493, "loss": 1.2933, "step": 1850 }, { "epoch": 0.15661223453760895, "grad_norm": 1.703125, "learning_rate": 0.0009420117538370172, "loss": 1.2073, "step": 1851 }, { "epoch": 0.15669684406464168, "grad_norm": 2.140625, "learning_rate": 0.0009419494813417389, "loss": 1.4225, "step": 1852 }, { "epoch": 0.15678145359167442, "grad_norm": 1.6015625, "learning_rate": 0.0009418871774883328, "loss": 1.4623, "step": 1853 }, { "epoch": 0.15686606311870716, "grad_norm": 1.6640625, "learning_rate": 0.0009418248422812198, "loss": 1.7863, "step": 1854 }, { "epoch": 0.15695067264573992, "grad_norm": 1.7734375, "learning_rate": 0.0009417624757248228, "loss": 1.6152, "step": 1855 }, { "epoch": 0.15703528217277266, "grad_norm": 1.734375, "learning_rate": 0.0009417000778235668, "loss": 1.3535, "step": 1856 }, { "epoch": 0.1571198916998054, "grad_norm": 1.796875, "learning_rate": 0.0009416376485818792, "loss": 1.8126, "step": 1857 }, { "epoch": 0.15720450122683813, "grad_norm": 1.640625, "learning_rate": 0.00094157518800419, "loss": 1.3791, "step": 1858 }, { "epoch": 0.1572891107538709, "grad_norm": 2.375, "learning_rate": 0.0009415126960949305, "loss": 1.8918, "step": 1859 }, { "epoch": 0.15737372028090363, "grad_norm": 2.65625, "learning_rate": 0.000941450172858535, "loss": 1.2439, "step": 1860 }, { "epoch": 0.15745832980793636, "grad_norm": 2.125, "learning_rate": 0.0009413876182994399, "loss": 1.3569, "step": 1861 }, { "epoch": 0.15754293933496913, "grad_norm": 1.953125, "learning_rate": 0.0009413250324220835, "loss": 1.9869, "step": 1862 }, { "epoch": 0.15762754886200186, "grad_norm": 1.6640625, "learning_rate": 0.0009412624152309066, "loss": 1.2524, "step": 1863 }, { "epoch": 0.1577121583890346, "grad_norm": 2.40625, "learning_rate": 0.0009411997667303523, "loss": 1.0561, "step": 1864 }, { "epoch": 0.15779676791606734, "grad_norm": 2.21875, "learning_rate": 0.0009411370869248654, "loss": 1.6306, "step": 1865 }, { "epoch": 0.1578813774431001, "grad_norm": 1.375, "learning_rate": 0.0009410743758188936, "loss": 1.1667, "step": 1866 }, { "epoch": 0.15796598697013284, "grad_norm": 1.8359375, "learning_rate": 0.0009410116334168864, "loss": 1.6263, "step": 1867 }, { "epoch": 0.15805059649716557, "grad_norm": 2.1875, "learning_rate": 0.0009409488597232955, "loss": 1.8814, "step": 1868 }, { "epoch": 0.15813520602419834, "grad_norm": 1.4296875, "learning_rate": 0.0009408860547425754, "loss": 1.4415, "step": 1869 }, { "epoch": 0.15821981555123107, "grad_norm": 2.09375, "learning_rate": 0.0009408232184791818, "loss": 1.8425, "step": 1870 }, { "epoch": 0.1583044250782638, "grad_norm": 3.625, "learning_rate": 0.0009407603509375737, "loss": 2.0668, "step": 1871 }, { "epoch": 0.15838903460529655, "grad_norm": 2.5, "learning_rate": 0.0009406974521222114, "loss": 1.2793, "step": 1872 }, { "epoch": 0.1584736441323293, "grad_norm": 1.390625, "learning_rate": 0.000940634522037558, "loss": 1.2061, "step": 1873 }, { "epoch": 0.15855825365936205, "grad_norm": 1.96875, "learning_rate": 0.0009405715606880787, "loss": 1.673, "step": 1874 }, { "epoch": 0.15864286318639478, "grad_norm": 1.703125, "learning_rate": 0.0009405085680782409, "loss": 1.4777, "step": 1875 }, { "epoch": 0.15872747271342752, "grad_norm": 1.8984375, "learning_rate": 0.000940445544212514, "loss": 1.7374, "step": 1876 }, { "epoch": 0.15881208224046028, "grad_norm": 9.6875, "learning_rate": 0.0009403824890953697, "loss": 1.0062, "step": 1877 }, { "epoch": 0.15889669176749302, "grad_norm": 1.9296875, "learning_rate": 0.0009403194027312825, "loss": 1.4799, "step": 1878 }, { "epoch": 0.15898130129452576, "grad_norm": 1.5546875, "learning_rate": 0.0009402562851247283, "loss": 1.492, "step": 1879 }, { "epoch": 0.15906591082155852, "grad_norm": 1.6953125, "learning_rate": 0.0009401931362801855, "loss": 1.1135, "step": 1880 }, { "epoch": 0.15915052034859126, "grad_norm": 2.046875, "learning_rate": 0.0009401299562021348, "loss": 1.6576, "step": 1881 }, { "epoch": 0.159235129875624, "grad_norm": 2.046875, "learning_rate": 0.0009400667448950593, "loss": 1.3681, "step": 1882 }, { "epoch": 0.15931973940265673, "grad_norm": 2.0, "learning_rate": 0.0009400035023634439, "loss": 1.3583, "step": 1883 }, { "epoch": 0.1594043489296895, "grad_norm": 1.8359375, "learning_rate": 0.0009399402286117761, "loss": 1.2424, "step": 1884 }, { "epoch": 0.15948895845672223, "grad_norm": 1.90625, "learning_rate": 0.0009398769236445453, "loss": 1.7771, "step": 1885 }, { "epoch": 0.15957356798375497, "grad_norm": 2.046875, "learning_rate": 0.000939813587466243, "loss": 1.9298, "step": 1886 }, { "epoch": 0.1596581775107877, "grad_norm": 1.625, "learning_rate": 0.0009397502200813637, "loss": 1.2473, "step": 1887 }, { "epoch": 0.15974278703782047, "grad_norm": 1.5625, "learning_rate": 0.0009396868214944032, "loss": 1.4653, "step": 1888 }, { "epoch": 0.1598273965648532, "grad_norm": 1.453125, "learning_rate": 0.00093962339170986, "loss": 1.3443, "step": 1889 }, { "epoch": 0.15991200609188594, "grad_norm": 1.78125, "learning_rate": 0.0009395599307322346, "loss": 1.6403, "step": 1890 }, { "epoch": 0.1599966156189187, "grad_norm": 1.421875, "learning_rate": 0.00093949643856603, "loss": 1.2417, "step": 1891 }, { "epoch": 0.16008122514595144, "grad_norm": 1.3984375, "learning_rate": 0.000939432915215751, "loss": 1.165, "step": 1892 }, { "epoch": 0.16016583467298418, "grad_norm": 1.7421875, "learning_rate": 0.0009393693606859052, "loss": 1.9597, "step": 1893 }, { "epoch": 0.1602504442000169, "grad_norm": 1.6953125, "learning_rate": 0.0009393057749810017, "loss": 1.3675, "step": 1894 }, { "epoch": 0.16033505372704968, "grad_norm": 1.40625, "learning_rate": 0.0009392421581055524, "loss": 1.3526, "step": 1895 }, { "epoch": 0.1604196632540824, "grad_norm": 1.578125, "learning_rate": 0.0009391785100640709, "loss": 1.5649, "step": 1896 }, { "epoch": 0.16050427278111515, "grad_norm": 2.109375, "learning_rate": 0.0009391148308610735, "loss": 1.2963, "step": 1897 }, { "epoch": 0.16058888230814788, "grad_norm": 1.5625, "learning_rate": 0.0009390511205010785, "loss": 1.4875, "step": 1898 }, { "epoch": 0.16067349183518065, "grad_norm": 2.09375, "learning_rate": 0.0009389873789886063, "loss": 1.464, "step": 1899 }, { "epoch": 0.16075810136221338, "grad_norm": 1.3984375, "learning_rate": 0.0009389236063281799, "loss": 1.1415, "step": 1900 }, { "epoch": 0.16084271088924612, "grad_norm": 2.015625, "learning_rate": 0.0009388598025243238, "loss": 1.5802, "step": 1901 }, { "epoch": 0.16092732041627889, "grad_norm": 2.109375, "learning_rate": 0.0009387959675815655, "loss": 1.4158, "step": 1902 }, { "epoch": 0.16101192994331162, "grad_norm": 1.890625, "learning_rate": 0.0009387321015044343, "loss": 1.8283, "step": 1903 }, { "epoch": 0.16109653947034436, "grad_norm": 2.4375, "learning_rate": 0.0009386682042974615, "loss": 1.4637, "step": 1904 }, { "epoch": 0.1611811489973771, "grad_norm": 1.640625, "learning_rate": 0.0009386042759651812, "loss": 1.3931, "step": 1905 }, { "epoch": 0.16126575852440986, "grad_norm": 1.5234375, "learning_rate": 0.0009385403165121291, "loss": 1.2017, "step": 1906 }, { "epoch": 0.1613503680514426, "grad_norm": 1.6328125, "learning_rate": 0.0009384763259428435, "loss": 1.7418, "step": 1907 }, { "epoch": 0.16143497757847533, "grad_norm": 1.8359375, "learning_rate": 0.0009384123042618648, "loss": 1.6229, "step": 1908 }, { "epoch": 0.16151958710550807, "grad_norm": 1.3671875, "learning_rate": 0.0009383482514737358, "loss": 1.055, "step": 1909 }, { "epoch": 0.16160419663254083, "grad_norm": 2.328125, "learning_rate": 0.0009382841675830009, "loss": 2.5533, "step": 1910 }, { "epoch": 0.16168880615957357, "grad_norm": 1.65625, "learning_rate": 0.0009382200525942076, "loss": 1.4937, "step": 1911 }, { "epoch": 0.1617734156866063, "grad_norm": 1.6953125, "learning_rate": 0.0009381559065119045, "loss": 1.4919, "step": 1912 }, { "epoch": 0.16185802521363907, "grad_norm": 2.078125, "learning_rate": 0.0009380917293406436, "loss": 1.7114, "step": 1913 }, { "epoch": 0.1619426347406718, "grad_norm": 1.875, "learning_rate": 0.0009380275210849782, "loss": 1.356, "step": 1914 }, { "epoch": 0.16202724426770454, "grad_norm": 1.96875, "learning_rate": 0.0009379632817494644, "loss": 1.6278, "step": 1915 }, { "epoch": 0.16211185379473728, "grad_norm": 1.625, "learning_rate": 0.00093789901133866, "loss": 1.3118, "step": 1916 }, { "epoch": 0.16219646332177004, "grad_norm": 1.953125, "learning_rate": 0.0009378347098571254, "loss": 1.8237, "step": 1917 }, { "epoch": 0.16228107284880278, "grad_norm": 1.421875, "learning_rate": 0.0009377703773094228, "loss": 1.3333, "step": 1918 }, { "epoch": 0.1623656823758355, "grad_norm": 1.640625, "learning_rate": 0.0009377060137001173, "loss": 1.8687, "step": 1919 }, { "epoch": 0.16245029190286828, "grad_norm": 1.53125, "learning_rate": 0.0009376416190337754, "loss": 1.7222, "step": 1920 }, { "epoch": 0.162534901429901, "grad_norm": 1.6875, "learning_rate": 0.0009375771933149663, "loss": 1.6239, "step": 1921 }, { "epoch": 0.16261951095693375, "grad_norm": 1.2578125, "learning_rate": 0.0009375127365482613, "loss": 1.0085, "step": 1922 }, { "epoch": 0.16270412048396649, "grad_norm": 1.734375, "learning_rate": 0.0009374482487382338, "loss": 1.468, "step": 1923 }, { "epoch": 0.16278873001099925, "grad_norm": 1.4375, "learning_rate": 0.0009373837298894594, "loss": 1.255, "step": 1924 }, { "epoch": 0.16287333953803199, "grad_norm": 1.8359375, "learning_rate": 0.0009373191800065161, "loss": 1.6439, "step": 1925 }, { "epoch": 0.16295794906506472, "grad_norm": 1.859375, "learning_rate": 0.000937254599093984, "loss": 1.7379, "step": 1926 }, { "epoch": 0.16304255859209746, "grad_norm": 1.90625, "learning_rate": 0.0009371899871564454, "loss": 1.2702, "step": 1927 }, { "epoch": 0.16312716811913022, "grad_norm": 1.4296875, "learning_rate": 0.0009371253441984847, "loss": 1.0493, "step": 1928 }, { "epoch": 0.16321177764616296, "grad_norm": 1.8359375, "learning_rate": 0.0009370606702246887, "loss": 1.3762, "step": 1929 }, { "epoch": 0.1632963871731957, "grad_norm": 1.8359375, "learning_rate": 0.000936995965239646, "loss": 1.2363, "step": 1930 }, { "epoch": 0.16338099670022846, "grad_norm": 1.546875, "learning_rate": 0.0009369312292479478, "loss": 1.4387, "step": 1931 }, { "epoch": 0.1634656062272612, "grad_norm": 1.578125, "learning_rate": 0.0009368664622541876, "loss": 1.2653, "step": 1932 }, { "epoch": 0.16355021575429393, "grad_norm": 1.734375, "learning_rate": 0.0009368016642629606, "loss": 2.1522, "step": 1933 }, { "epoch": 0.16363482528132667, "grad_norm": 1.78125, "learning_rate": 0.0009367368352788648, "loss": 2.1567, "step": 1934 }, { "epoch": 0.16371943480835943, "grad_norm": 1.703125, "learning_rate": 0.0009366719753064999, "loss": 1.3402, "step": 1935 }, { "epoch": 0.16380404433539217, "grad_norm": 1.4296875, "learning_rate": 0.0009366070843504678, "loss": 1.223, "step": 1936 }, { "epoch": 0.1638886538624249, "grad_norm": 1.75, "learning_rate": 0.0009365421624153732, "loss": 1.5243, "step": 1937 }, { "epoch": 0.16397326338945764, "grad_norm": 1.796875, "learning_rate": 0.0009364772095058221, "loss": 1.7981, "step": 1938 }, { "epoch": 0.1640578729164904, "grad_norm": 1.921875, "learning_rate": 0.0009364122256264235, "loss": 2.1738, "step": 1939 }, { "epoch": 0.16414248244352314, "grad_norm": 1.5859375, "learning_rate": 0.0009363472107817882, "loss": 1.8093, "step": 1940 }, { "epoch": 0.16422709197055588, "grad_norm": 1.2578125, "learning_rate": 0.0009362821649765292, "loss": 1.1975, "step": 1941 }, { "epoch": 0.16431170149758864, "grad_norm": 1.546875, "learning_rate": 0.0009362170882152618, "loss": 1.6203, "step": 1942 }, { "epoch": 0.16439631102462138, "grad_norm": 1.625, "learning_rate": 0.0009361519805026038, "loss": 1.3249, "step": 1943 }, { "epoch": 0.16448092055165411, "grad_norm": 1.8046875, "learning_rate": 0.0009360868418431743, "loss": 1.5663, "step": 1944 }, { "epoch": 0.16456553007868685, "grad_norm": 1.6875, "learning_rate": 0.0009360216722415953, "loss": 1.1818, "step": 1945 }, { "epoch": 0.16465013960571961, "grad_norm": 1.6875, "learning_rate": 0.0009359564717024911, "loss": 1.611, "step": 1946 }, { "epoch": 0.16473474913275235, "grad_norm": 1.859375, "learning_rate": 0.0009358912402304877, "loss": 1.769, "step": 1947 }, { "epoch": 0.1648193586597851, "grad_norm": 1.5234375, "learning_rate": 0.0009358259778302137, "loss": 1.1229, "step": 1948 }, { "epoch": 0.16490396818681782, "grad_norm": 1.6171875, "learning_rate": 0.0009357606845062996, "loss": 1.4662, "step": 1949 }, { "epoch": 0.1649885777138506, "grad_norm": 1.5, "learning_rate": 0.0009356953602633784, "loss": 1.4891, "step": 1950 }, { "epoch": 0.16507318724088332, "grad_norm": 2.015625, "learning_rate": 0.000935630005106085, "loss": 2.1075, "step": 1951 }, { "epoch": 0.16515779676791606, "grad_norm": 1.8046875, "learning_rate": 0.0009355646190390565, "loss": 1.5036, "step": 1952 }, { "epoch": 0.16524240629494882, "grad_norm": 2.0625, "learning_rate": 0.0009354992020669326, "loss": 1.2627, "step": 1953 }, { "epoch": 0.16532701582198156, "grad_norm": 1.4765625, "learning_rate": 0.0009354337541943547, "loss": 1.2213, "step": 1954 }, { "epoch": 0.1654116253490143, "grad_norm": 1.4921875, "learning_rate": 0.0009353682754259667, "loss": 1.4819, "step": 1955 }, { "epoch": 0.16549623487604703, "grad_norm": 2.015625, "learning_rate": 0.0009353027657664146, "loss": 1.3307, "step": 1956 }, { "epoch": 0.1655808444030798, "grad_norm": 1.8046875, "learning_rate": 0.0009352372252203464, "loss": 1.4658, "step": 1957 }, { "epoch": 0.16566545393011253, "grad_norm": 1.6640625, "learning_rate": 0.0009351716537924126, "loss": 1.1424, "step": 1958 }, { "epoch": 0.16575006345714527, "grad_norm": 1.765625, "learning_rate": 0.0009351060514872657, "loss": 1.2496, "step": 1959 }, { "epoch": 0.165834672984178, "grad_norm": 1.7421875, "learning_rate": 0.0009350404183095605, "loss": 1.4414, "step": 1960 }, { "epoch": 0.16591928251121077, "grad_norm": 1.671875, "learning_rate": 0.0009349747542639539, "loss": 1.5162, "step": 1961 }, { "epoch": 0.1660038920382435, "grad_norm": 1.3203125, "learning_rate": 0.0009349090593551053, "loss": 1.1895, "step": 1962 }, { "epoch": 0.16608850156527624, "grad_norm": 1.65625, "learning_rate": 0.0009348433335876756, "loss": 1.4449, "step": 1963 }, { "epoch": 0.166173111092309, "grad_norm": 1.640625, "learning_rate": 0.0009347775769663286, "loss": 1.6672, "step": 1964 }, { "epoch": 0.16625772061934174, "grad_norm": 1.84375, "learning_rate": 0.0009347117894957298, "loss": 1.6937, "step": 1965 }, { "epoch": 0.16634233014637448, "grad_norm": 1.8125, "learning_rate": 0.0009346459711805472, "loss": 1.658, "step": 1966 }, { "epoch": 0.16642693967340721, "grad_norm": 1.703125, "learning_rate": 0.000934580122025451, "loss": 1.461, "step": 1967 }, { "epoch": 0.16651154920043998, "grad_norm": 3.46875, "learning_rate": 0.0009345142420351134, "loss": 1.5498, "step": 1968 }, { "epoch": 0.16659615872747272, "grad_norm": 2.015625, "learning_rate": 0.0009344483312142087, "loss": 2.1397, "step": 1969 }, { "epoch": 0.16668076825450545, "grad_norm": 1.7578125, "learning_rate": 0.0009343823895674135, "loss": 1.5917, "step": 1970 }, { "epoch": 0.1667653777815382, "grad_norm": 1.9609375, "learning_rate": 0.0009343164170994069, "loss": 1.6527, "step": 1971 }, { "epoch": 0.16684998730857095, "grad_norm": 1.609375, "learning_rate": 0.0009342504138148699, "loss": 1.1609, "step": 1972 }, { "epoch": 0.1669345968356037, "grad_norm": 2.46875, "learning_rate": 0.0009341843797184855, "loss": 1.6447, "step": 1973 }, { "epoch": 0.16701920636263642, "grad_norm": 2.859375, "learning_rate": 0.0009341183148149393, "loss": 1.4625, "step": 1974 }, { "epoch": 0.1671038158896692, "grad_norm": 1.796875, "learning_rate": 0.0009340522191089186, "loss": 1.754, "step": 1975 }, { "epoch": 0.16718842541670192, "grad_norm": 1.875, "learning_rate": 0.0009339860926051134, "loss": 1.6788, "step": 1976 }, { "epoch": 0.16727303494373466, "grad_norm": 2.203125, "learning_rate": 0.0009339199353082155, "loss": 1.6167, "step": 1977 }, { "epoch": 0.1673576444707674, "grad_norm": 1.515625, "learning_rate": 0.0009338537472229192, "loss": 1.2873, "step": 1978 }, { "epoch": 0.16744225399780016, "grad_norm": 1.640625, "learning_rate": 0.0009337875283539208, "loss": 1.3842, "step": 1979 }, { "epoch": 0.1675268635248329, "grad_norm": 2.1875, "learning_rate": 0.0009337212787059185, "loss": 1.5902, "step": 1980 }, { "epoch": 0.16761147305186563, "grad_norm": 2.125, "learning_rate": 0.0009336549982836133, "loss": 1.3081, "step": 1981 }, { "epoch": 0.1676960825788984, "grad_norm": 1.671875, "learning_rate": 0.0009335886870917079, "loss": 1.4388, "step": 1982 }, { "epoch": 0.16778069210593113, "grad_norm": 1.421875, "learning_rate": 0.0009335223451349075, "loss": 1.1311, "step": 1983 }, { "epoch": 0.16786530163296387, "grad_norm": 1.3828125, "learning_rate": 0.0009334559724179192, "loss": 1.2062, "step": 1984 }, { "epoch": 0.1679499111599966, "grad_norm": 2.015625, "learning_rate": 0.0009333895689454526, "loss": 1.9901, "step": 1985 }, { "epoch": 0.16803452068702937, "grad_norm": 1.8125, "learning_rate": 0.0009333231347222191, "loss": 1.6975, "step": 1986 }, { "epoch": 0.1681191302140621, "grad_norm": 2.03125, "learning_rate": 0.0009332566697529325, "loss": 1.8019, "step": 1987 }, { "epoch": 0.16820373974109484, "grad_norm": 1.5859375, "learning_rate": 0.000933190174042309, "loss": 1.2028, "step": 1988 }, { "epoch": 0.16828834926812758, "grad_norm": 1.5234375, "learning_rate": 0.0009331236475950664, "loss": 1.6539, "step": 1989 }, { "epoch": 0.16837295879516034, "grad_norm": 1.875, "learning_rate": 0.0009330570904159251, "loss": 1.6933, "step": 1990 }, { "epoch": 0.16845756832219308, "grad_norm": 1.3359375, "learning_rate": 0.0009329905025096078, "loss": 1.1401, "step": 1991 }, { "epoch": 0.16854217784922582, "grad_norm": 1.5, "learning_rate": 0.0009329238838808392, "loss": 1.3752, "step": 1992 }, { "epoch": 0.16862678737625858, "grad_norm": 1.8203125, "learning_rate": 0.0009328572345343458, "loss": 1.4965, "step": 1993 }, { "epoch": 0.16871139690329132, "grad_norm": 1.9609375, "learning_rate": 0.0009327905544748569, "loss": 2.2287, "step": 1994 }, { "epoch": 0.16879600643032405, "grad_norm": 1.3671875, "learning_rate": 0.0009327238437071038, "loss": 1.4141, "step": 1995 }, { "epoch": 0.1688806159573568, "grad_norm": 1.3828125, "learning_rate": 0.0009326571022358198, "loss": 1.1512, "step": 1996 }, { "epoch": 0.16896522548438955, "grad_norm": 1.4921875, "learning_rate": 0.0009325903300657404, "loss": 1.5299, "step": 1997 }, { "epoch": 0.1690498350114223, "grad_norm": 1.6640625, "learning_rate": 0.0009325235272016035, "loss": 1.4144, "step": 1998 }, { "epoch": 0.16913444453845503, "grad_norm": 1.7421875, "learning_rate": 0.0009324566936481491, "loss": 1.5284, "step": 1999 }, { "epoch": 0.16921905406548776, "grad_norm": 1.46875, "learning_rate": 0.0009323898294101191, "loss": 1.5575, "step": 2000 }, { "epoch": 0.16930366359252053, "grad_norm": 5.75, "learning_rate": 0.0009323229344922578, "loss": 1.1132, "step": 2001 }, { "epoch": 0.16938827311955326, "grad_norm": 2.75, "learning_rate": 0.000932256008899312, "loss": 2.0506, "step": 2002 }, { "epoch": 0.169472882646586, "grad_norm": 1.6484375, "learning_rate": 0.0009321890526360298, "loss": 1.7991, "step": 2003 }, { "epoch": 0.16955749217361876, "grad_norm": 1.2890625, "learning_rate": 0.0009321220657071625, "loss": 1.358, "step": 2004 }, { "epoch": 0.1696421017006515, "grad_norm": 1.9296875, "learning_rate": 0.0009320550481174628, "loss": 2.42, "step": 2005 }, { "epoch": 0.16972671122768423, "grad_norm": 1.671875, "learning_rate": 0.0009319879998716861, "loss": 1.2698, "step": 2006 }, { "epoch": 0.16981132075471697, "grad_norm": 1.9140625, "learning_rate": 0.0009319209209745897, "loss": 1.8028, "step": 2007 }, { "epoch": 0.16989593028174974, "grad_norm": 1.859375, "learning_rate": 0.0009318538114309329, "loss": 1.6793, "step": 2008 }, { "epoch": 0.16998053980878247, "grad_norm": 1.9921875, "learning_rate": 0.0009317866712454777, "loss": 1.9235, "step": 2009 }, { "epoch": 0.1700651493358152, "grad_norm": 1.4375, "learning_rate": 0.0009317195004229879, "loss": 1.1186, "step": 2010 }, { "epoch": 0.17014975886284794, "grad_norm": 1.7890625, "learning_rate": 0.0009316522989682292, "loss": 1.3719, "step": 2011 }, { "epoch": 0.1702343683898807, "grad_norm": 1.5, "learning_rate": 0.0009315850668859704, "loss": 1.1396, "step": 2012 }, { "epoch": 0.17031897791691344, "grad_norm": 1.59375, "learning_rate": 0.0009315178041809814, "loss": 1.1984, "step": 2013 }, { "epoch": 0.17040358744394618, "grad_norm": 1.7109375, "learning_rate": 0.0009314505108580351, "loss": 1.3112, "step": 2014 }, { "epoch": 0.17048819697097894, "grad_norm": 2.140625, "learning_rate": 0.0009313831869219059, "loss": 2.1961, "step": 2015 }, { "epoch": 0.17057280649801168, "grad_norm": 1.46875, "learning_rate": 0.0009313158323773711, "loss": 1.1527, "step": 2016 }, { "epoch": 0.17065741602504442, "grad_norm": 1.3671875, "learning_rate": 0.0009312484472292095, "loss": 1.1945, "step": 2017 }, { "epoch": 0.17074202555207715, "grad_norm": 1.6953125, "learning_rate": 0.0009311810314822024, "loss": 1.1694, "step": 2018 }, { "epoch": 0.17082663507910992, "grad_norm": 1.8125, "learning_rate": 0.0009311135851411332, "loss": 1.7358, "step": 2019 }, { "epoch": 0.17091124460614265, "grad_norm": 1.15625, "learning_rate": 0.0009310461082107877, "loss": 1.0897, "step": 2020 }, { "epoch": 0.1709958541331754, "grad_norm": 1.3984375, "learning_rate": 0.0009309786006959535, "loss": 1.1786, "step": 2021 }, { "epoch": 0.17108046366020813, "grad_norm": 1.6328125, "learning_rate": 0.0009309110626014204, "loss": 1.2865, "step": 2022 }, { "epoch": 0.1711650731872409, "grad_norm": 2.109375, "learning_rate": 0.0009308434939319807, "loss": 2.0812, "step": 2023 }, { "epoch": 0.17124968271427363, "grad_norm": 1.9375, "learning_rate": 0.0009307758946924289, "loss": 1.6138, "step": 2024 }, { "epoch": 0.17133429224130636, "grad_norm": 1.421875, "learning_rate": 0.0009307082648875609, "loss": 1.3349, "step": 2025 }, { "epoch": 0.17141890176833913, "grad_norm": 1.4921875, "learning_rate": 0.0009306406045221755, "loss": 1.4204, "step": 2026 }, { "epoch": 0.17150351129537186, "grad_norm": 1.734375, "learning_rate": 0.0009305729136010739, "loss": 1.3412, "step": 2027 }, { "epoch": 0.1715881208224046, "grad_norm": 1.5, "learning_rate": 0.0009305051921290586, "loss": 1.1893, "step": 2028 }, { "epoch": 0.17167273034943734, "grad_norm": 1.7734375, "learning_rate": 0.0009304374401109348, "loss": 1.89, "step": 2029 }, { "epoch": 0.1717573398764701, "grad_norm": 1.5625, "learning_rate": 0.0009303696575515097, "loss": 1.5132, "step": 2030 }, { "epoch": 0.17184194940350284, "grad_norm": 2.046875, "learning_rate": 0.0009303018444555929, "loss": 1.7629, "step": 2031 }, { "epoch": 0.17192655893053557, "grad_norm": 2.09375, "learning_rate": 0.0009302340008279961, "loss": 1.8209, "step": 2032 }, { "epoch": 0.1720111684575683, "grad_norm": 1.6171875, "learning_rate": 0.0009301661266735328, "loss": 1.3558, "step": 2033 }, { "epoch": 0.17209577798460107, "grad_norm": 1.7265625, "learning_rate": 0.0009300982219970194, "loss": 1.3058, "step": 2034 }, { "epoch": 0.1721803875116338, "grad_norm": 1.6015625, "learning_rate": 0.0009300302868032735, "loss": 1.128, "step": 2035 }, { "epoch": 0.17226499703866655, "grad_norm": 1.65625, "learning_rate": 0.0009299623210971157, "loss": 1.1239, "step": 2036 }, { "epoch": 0.1723496065656993, "grad_norm": 1.5859375, "learning_rate": 0.0009298943248833683, "loss": 1.298, "step": 2037 }, { "epoch": 0.17243421609273205, "grad_norm": 1.6484375, "learning_rate": 0.0009298262981668559, "loss": 1.1881, "step": 2038 }, { "epoch": 0.17251882561976478, "grad_norm": 1.5703125, "learning_rate": 0.0009297582409524054, "loss": 1.358, "step": 2039 }, { "epoch": 0.17260343514679752, "grad_norm": 2.140625, "learning_rate": 0.0009296901532448459, "loss": 1.5932, "step": 2040 }, { "epoch": 0.17268804467383028, "grad_norm": 1.90625, "learning_rate": 0.000929622035049008, "loss": 1.4426, "step": 2041 }, { "epoch": 0.17277265420086302, "grad_norm": 2.15625, "learning_rate": 0.0009295538863697254, "loss": 1.5209, "step": 2042 }, { "epoch": 0.17285726372789575, "grad_norm": 1.5859375, "learning_rate": 0.0009294857072118332, "loss": 1.35, "step": 2043 }, { "epoch": 0.17294187325492852, "grad_norm": 1.8515625, "learning_rate": 0.0009294174975801693, "loss": 1.5191, "step": 2044 }, { "epoch": 0.17302648278196125, "grad_norm": 1.2890625, "learning_rate": 0.0009293492574795734, "loss": 0.9924, "step": 2045 }, { "epoch": 0.173111092308994, "grad_norm": 1.40625, "learning_rate": 0.0009292809869148873, "loss": 1.2354, "step": 2046 }, { "epoch": 0.17319570183602673, "grad_norm": 1.4375, "learning_rate": 0.0009292126858909548, "loss": 1.0762, "step": 2047 }, { "epoch": 0.1732803113630595, "grad_norm": 2.265625, "learning_rate": 0.0009291443544126228, "loss": 2.0232, "step": 2048 }, { "epoch": 0.17336492089009223, "grad_norm": 1.625, "learning_rate": 0.0009290759924847393, "loss": 1.1327, "step": 2049 }, { "epoch": 0.17344953041712496, "grad_norm": 1.6953125, "learning_rate": 0.0009290076001121548, "loss": 1.0436, "step": 2050 }, { "epoch": 0.1735341399441577, "grad_norm": 1.640625, "learning_rate": 0.0009289391772997223, "loss": 1.5653, "step": 2051 }, { "epoch": 0.17361874947119046, "grad_norm": 1.671875, "learning_rate": 0.0009288707240522962, "loss": 1.4876, "step": 2052 }, { "epoch": 0.1737033589982232, "grad_norm": 2.1875, "learning_rate": 0.0009288022403747342, "loss": 1.2523, "step": 2053 }, { "epoch": 0.17378796852525594, "grad_norm": 1.6796875, "learning_rate": 0.0009287337262718949, "loss": 1.4522, "step": 2054 }, { "epoch": 0.1738725780522887, "grad_norm": 2.0625, "learning_rate": 0.00092866518174864, "loss": 1.5914, "step": 2055 }, { "epoch": 0.17395718757932144, "grad_norm": 1.9375, "learning_rate": 0.0009285966068098328, "loss": 1.7319, "step": 2056 }, { "epoch": 0.17404179710635417, "grad_norm": 2.03125, "learning_rate": 0.0009285280014603392, "loss": 1.4253, "step": 2057 }, { "epoch": 0.1741264066333869, "grad_norm": 1.609375, "learning_rate": 0.0009284593657050269, "loss": 1.9715, "step": 2058 }, { "epoch": 0.17421101616041967, "grad_norm": 2.203125, "learning_rate": 0.0009283906995487659, "loss": 1.4717, "step": 2059 }, { "epoch": 0.1742956256874524, "grad_norm": 1.484375, "learning_rate": 0.0009283220029964283, "loss": 1.5017, "step": 2060 }, { "epoch": 0.17438023521448515, "grad_norm": 2.078125, "learning_rate": 0.0009282532760528884, "loss": 1.4316, "step": 2061 }, { "epoch": 0.17446484474151788, "grad_norm": 1.8359375, "learning_rate": 0.0009281845187230228, "loss": 1.3302, "step": 2062 }, { "epoch": 0.17454945426855065, "grad_norm": 1.5, "learning_rate": 0.0009281157310117101, "loss": 1.507, "step": 2063 }, { "epoch": 0.17463406379558338, "grad_norm": 1.7421875, "learning_rate": 0.0009280469129238309, "loss": 1.5604, "step": 2064 }, { "epoch": 0.17471867332261612, "grad_norm": 1.78125, "learning_rate": 0.0009279780644642682, "loss": 1.864, "step": 2065 }, { "epoch": 0.17480328284964888, "grad_norm": 2.421875, "learning_rate": 0.0009279091856379072, "loss": 1.5318, "step": 2066 }, { "epoch": 0.17488789237668162, "grad_norm": 2.90625, "learning_rate": 0.000927840276449635, "loss": 1.8271, "step": 2067 }, { "epoch": 0.17497250190371436, "grad_norm": 1.5703125, "learning_rate": 0.000927771336904341, "loss": 1.1934, "step": 2068 }, { "epoch": 0.1750571114307471, "grad_norm": 1.84375, "learning_rate": 0.0009277023670069167, "loss": 1.2772, "step": 2069 }, { "epoch": 0.17514172095777986, "grad_norm": 1.9140625, "learning_rate": 0.0009276333667622561, "loss": 1.5809, "step": 2070 }, { "epoch": 0.1752263304848126, "grad_norm": 1.53125, "learning_rate": 0.0009275643361752546, "loss": 1.2883, "step": 2071 }, { "epoch": 0.17531094001184533, "grad_norm": 1.96875, "learning_rate": 0.0009274952752508104, "loss": 1.2511, "step": 2072 }, { "epoch": 0.17539554953887806, "grad_norm": 1.8125, "learning_rate": 0.0009274261839938237, "loss": 1.3826, "step": 2073 }, { "epoch": 0.17548015906591083, "grad_norm": 1.875, "learning_rate": 0.0009273570624091969, "loss": 1.6178, "step": 2074 }, { "epoch": 0.17556476859294357, "grad_norm": 1.5703125, "learning_rate": 0.0009272879105018342, "loss": 1.1386, "step": 2075 }, { "epoch": 0.1756493781199763, "grad_norm": 2.578125, "learning_rate": 0.0009272187282766425, "loss": 1.3303, "step": 2076 }, { "epoch": 0.17573398764700907, "grad_norm": 2.609375, "learning_rate": 0.0009271495157385304, "loss": 1.7576, "step": 2077 }, { "epoch": 0.1758185971740418, "grad_norm": 2.296875, "learning_rate": 0.0009270802728924087, "loss": 1.4598, "step": 2078 }, { "epoch": 0.17590320670107454, "grad_norm": 1.6171875, "learning_rate": 0.0009270109997431906, "loss": 1.4973, "step": 2079 }, { "epoch": 0.17598781622810727, "grad_norm": 1.796875, "learning_rate": 0.0009269416962957913, "loss": 1.6244, "step": 2080 }, { "epoch": 0.17607242575514004, "grad_norm": 1.4375, "learning_rate": 0.0009268723625551283, "loss": 0.9554, "step": 2081 }, { "epoch": 0.17615703528217277, "grad_norm": 1.6640625, "learning_rate": 0.0009268029985261211, "loss": 1.1307, "step": 2082 }, { "epoch": 0.1762416448092055, "grad_norm": 1.84375, "learning_rate": 0.0009267336042136909, "loss": 1.5077, "step": 2083 }, { "epoch": 0.17632625433623825, "grad_norm": 1.9375, "learning_rate": 0.0009266641796227622, "loss": 1.4737, "step": 2084 }, { "epoch": 0.176410863863271, "grad_norm": 1.8515625, "learning_rate": 0.0009265947247582603, "loss": 1.5799, "step": 2085 }, { "epoch": 0.17649547339030375, "grad_norm": 1.734375, "learning_rate": 0.0009265252396251138, "loss": 1.4372, "step": 2086 }, { "epoch": 0.17658008291733648, "grad_norm": 4.375, "learning_rate": 0.0009264557242282527, "loss": 1.6497, "step": 2087 }, { "epoch": 0.17666469244436925, "grad_norm": 2.125, "learning_rate": 0.0009263861785726097, "loss": 1.6416, "step": 2088 }, { "epoch": 0.17674930197140198, "grad_norm": 1.703125, "learning_rate": 0.0009263166026631189, "loss": 1.392, "step": 2089 }, { "epoch": 0.17683391149843472, "grad_norm": 1.7578125, "learning_rate": 0.0009262469965047173, "loss": 1.3979, "step": 2090 }, { "epoch": 0.17691852102546746, "grad_norm": 2.671875, "learning_rate": 0.0009261773601023438, "loss": 1.7699, "step": 2091 }, { "epoch": 0.17700313055250022, "grad_norm": 1.9296875, "learning_rate": 0.0009261076934609391, "loss": 2.2941, "step": 2092 }, { "epoch": 0.17708774007953296, "grad_norm": 1.703125, "learning_rate": 0.0009260379965854464, "loss": 1.786, "step": 2093 }, { "epoch": 0.1771723496065657, "grad_norm": 2.953125, "learning_rate": 0.0009259682694808113, "loss": 1.4906, "step": 2094 }, { "epoch": 0.17725695913359843, "grad_norm": 1.546875, "learning_rate": 0.0009258985121519808, "loss": 1.3156, "step": 2095 }, { "epoch": 0.1773415686606312, "grad_norm": 2.0, "learning_rate": 0.0009258287246039047, "loss": 1.6285, "step": 2096 }, { "epoch": 0.17742617818766393, "grad_norm": 2.0625, "learning_rate": 0.0009257589068415349, "loss": 1.8433, "step": 2097 }, { "epoch": 0.17751078771469667, "grad_norm": 1.8125, "learning_rate": 0.0009256890588698248, "loss": 1.4639, "step": 2098 }, { "epoch": 0.17759539724172943, "grad_norm": 1.390625, "learning_rate": 0.0009256191806937306, "loss": 1.2467, "step": 2099 }, { "epoch": 0.17768000676876217, "grad_norm": 2.40625, "learning_rate": 0.0009255492723182106, "loss": 1.7057, "step": 2100 }, { "epoch": 0.1777646162957949, "grad_norm": 2.234375, "learning_rate": 0.000925479333748225, "loss": 1.6582, "step": 2101 }, { "epoch": 0.17784922582282764, "grad_norm": 1.796875, "learning_rate": 0.000925409364988736, "loss": 1.4644, "step": 2102 }, { "epoch": 0.1779338353498604, "grad_norm": 1.5390625, "learning_rate": 0.0009253393660447086, "loss": 1.4866, "step": 2103 }, { "epoch": 0.17801844487689314, "grad_norm": 1.7734375, "learning_rate": 0.0009252693369211091, "loss": 1.2202, "step": 2104 }, { "epoch": 0.17810305440392588, "grad_norm": 2.0625, "learning_rate": 0.0009251992776229066, "loss": 1.8845, "step": 2105 }, { "epoch": 0.17818766393095864, "grad_norm": 1.3359375, "learning_rate": 0.000925129188155072, "loss": 1.1619, "step": 2106 }, { "epoch": 0.17827227345799138, "grad_norm": 1.5234375, "learning_rate": 0.0009250590685225783, "loss": 1.3261, "step": 2107 }, { "epoch": 0.1783568829850241, "grad_norm": 1.4453125, "learning_rate": 0.0009249889187304013, "loss": 1.1612, "step": 2108 }, { "epoch": 0.17844149251205685, "grad_norm": 1.9296875, "learning_rate": 0.0009249187387835178, "loss": 1.1502, "step": 2109 }, { "epoch": 0.1785261020390896, "grad_norm": 2.0, "learning_rate": 0.0009248485286869077, "loss": 1.2041, "step": 2110 }, { "epoch": 0.17861071156612235, "grad_norm": 3.28125, "learning_rate": 0.0009247782884455524, "loss": 2.0468, "step": 2111 }, { "epoch": 0.17869532109315509, "grad_norm": 1.8046875, "learning_rate": 0.000924708018064436, "loss": 1.8602, "step": 2112 }, { "epoch": 0.17877993062018782, "grad_norm": 1.5703125, "learning_rate": 0.0009246377175485445, "loss": 1.3335, "step": 2113 }, { "epoch": 0.17886454014722059, "grad_norm": 1.7109375, "learning_rate": 0.0009245673869028659, "loss": 1.5982, "step": 2114 }, { "epoch": 0.17894914967425332, "grad_norm": 1.953125, "learning_rate": 0.0009244970261323903, "loss": 1.4255, "step": 2115 }, { "epoch": 0.17903375920128606, "grad_norm": 1.7109375, "learning_rate": 0.0009244266352421105, "loss": 1.392, "step": 2116 }, { "epoch": 0.17911836872831882, "grad_norm": 3.5, "learning_rate": 0.0009243562142370208, "loss": 1.4956, "step": 2117 }, { "epoch": 0.17920297825535156, "grad_norm": 2.5, "learning_rate": 0.0009242857631221175, "loss": 1.671, "step": 2118 }, { "epoch": 0.1792875877823843, "grad_norm": 1.953125, "learning_rate": 0.0009242152819023999, "loss": 1.8055, "step": 2119 }, { "epoch": 0.17937219730941703, "grad_norm": 1.59375, "learning_rate": 0.0009241447705828687, "loss": 1.1456, "step": 2120 }, { "epoch": 0.1794568068364498, "grad_norm": 2.46875, "learning_rate": 0.0009240742291685271, "loss": 2.1941, "step": 2121 }, { "epoch": 0.17954141636348253, "grad_norm": 1.7265625, "learning_rate": 0.0009240036576643803, "loss": 1.3052, "step": 2122 }, { "epoch": 0.17962602589051527, "grad_norm": 1.9609375, "learning_rate": 0.0009239330560754354, "loss": 1.6161, "step": 2123 }, { "epoch": 0.179710635417548, "grad_norm": 2.78125, "learning_rate": 0.000923862424406702, "loss": 1.4737, "step": 2124 }, { "epoch": 0.17979524494458077, "grad_norm": 5.0, "learning_rate": 0.0009237917626631918, "loss": 1.5241, "step": 2125 }, { "epoch": 0.1798798544716135, "grad_norm": 2.4375, "learning_rate": 0.0009237210708499184, "loss": 2.414, "step": 2126 }, { "epoch": 0.17996446399864624, "grad_norm": 2.125, "learning_rate": 0.000923650348971898, "loss": 1.9227, "step": 2127 }, { "epoch": 0.180049073525679, "grad_norm": 2.03125, "learning_rate": 0.000923579597034148, "loss": 1.7176, "step": 2128 }, { "epoch": 0.18013368305271174, "grad_norm": 1.9375, "learning_rate": 0.0009235088150416892, "loss": 1.8486, "step": 2129 }, { "epoch": 0.18021829257974448, "grad_norm": 1.8203125, "learning_rate": 0.0009234380029995435, "loss": 1.5925, "step": 2130 }, { "epoch": 0.1803029021067772, "grad_norm": 1.6640625, "learning_rate": 0.0009233671609127352, "loss": 1.3302, "step": 2131 }, { "epoch": 0.18038751163380998, "grad_norm": 2.75, "learning_rate": 0.0009232962887862912, "loss": 1.8536, "step": 2132 }, { "epoch": 0.1804721211608427, "grad_norm": 1.4609375, "learning_rate": 0.0009232253866252399, "loss": 1.1746, "step": 2133 }, { "epoch": 0.18055673068787545, "grad_norm": 2.03125, "learning_rate": 0.0009231544544346123, "loss": 1.2852, "step": 2134 }, { "epoch": 0.18064134021490819, "grad_norm": 2.34375, "learning_rate": 0.0009230834922194411, "loss": 2.2466, "step": 2135 }, { "epoch": 0.18072594974194095, "grad_norm": 1.703125, "learning_rate": 0.0009230124999847615, "loss": 1.1943, "step": 2136 }, { "epoch": 0.1808105592689737, "grad_norm": 1.8828125, "learning_rate": 0.0009229414777356106, "loss": 1.3942, "step": 2137 }, { "epoch": 0.18089516879600642, "grad_norm": 1.9140625, "learning_rate": 0.0009228704254770279, "loss": 1.5476, "step": 2138 }, { "epoch": 0.1809797783230392, "grad_norm": 2.15625, "learning_rate": 0.0009227993432140546, "loss": 1.5338, "step": 2139 }, { "epoch": 0.18106438785007192, "grad_norm": 2.140625, "learning_rate": 0.0009227282309517345, "loss": 1.5624, "step": 2140 }, { "epoch": 0.18114899737710466, "grad_norm": 1.78125, "learning_rate": 0.0009226570886951131, "loss": 1.4732, "step": 2141 }, { "epoch": 0.1812336069041374, "grad_norm": 2.046875, "learning_rate": 0.0009225859164492384, "loss": 1.6608, "step": 2142 }, { "epoch": 0.18131821643117016, "grad_norm": 1.484375, "learning_rate": 0.0009225147142191603, "loss": 1.3427, "step": 2143 }, { "epoch": 0.1814028259582029, "grad_norm": 1.8515625, "learning_rate": 0.000922443482009931, "loss": 1.2796, "step": 2144 }, { "epoch": 0.18148743548523563, "grad_norm": 1.5546875, "learning_rate": 0.0009223722198266046, "loss": 1.7606, "step": 2145 }, { "epoch": 0.18157204501226837, "grad_norm": 1.8046875, "learning_rate": 0.0009223009276742374, "loss": 1.4295, "step": 2146 }, { "epoch": 0.18165665453930113, "grad_norm": 1.84375, "learning_rate": 0.0009222296055578879, "loss": 1.2771, "step": 2147 }, { "epoch": 0.18174126406633387, "grad_norm": 1.4140625, "learning_rate": 0.0009221582534826167, "loss": 1.0887, "step": 2148 }, { "epoch": 0.1818258735933666, "grad_norm": 2.203125, "learning_rate": 0.0009220868714534867, "loss": 1.5068, "step": 2149 }, { "epoch": 0.18191048312039937, "grad_norm": 1.6484375, "learning_rate": 0.0009220154594755625, "loss": 1.5045, "step": 2150 }, { "epoch": 0.1819950926474321, "grad_norm": 1.9765625, "learning_rate": 0.0009219440175539112, "loss": 1.8064, "step": 2151 }, { "epoch": 0.18207970217446484, "grad_norm": 1.046875, "learning_rate": 0.0009218725456936019, "loss": 0.9804, "step": 2152 }, { "epoch": 0.18216431170149758, "grad_norm": 1.578125, "learning_rate": 0.0009218010438997058, "loss": 1.4257, "step": 2153 }, { "epoch": 0.18224892122853034, "grad_norm": 1.734375, "learning_rate": 0.0009217295121772962, "loss": 1.5343, "step": 2154 }, { "epoch": 0.18233353075556308, "grad_norm": 1.5703125, "learning_rate": 0.0009216579505314487, "loss": 1.1877, "step": 2155 }, { "epoch": 0.18241814028259581, "grad_norm": 1.578125, "learning_rate": 0.0009215863589672407, "loss": 1.1004, "step": 2156 }, { "epoch": 0.18250274980962855, "grad_norm": 1.6640625, "learning_rate": 0.0009215147374897519, "loss": 1.4609, "step": 2157 }, { "epoch": 0.18258735933666131, "grad_norm": 1.71875, "learning_rate": 0.0009214430861040644, "loss": 1.1618, "step": 2158 }, { "epoch": 0.18267196886369405, "grad_norm": 1.640625, "learning_rate": 0.000921371404815262, "loss": 1.584, "step": 2159 }, { "epoch": 0.1827565783907268, "grad_norm": 1.8828125, "learning_rate": 0.0009212996936284306, "loss": 1.4214, "step": 2160 }, { "epoch": 0.18284118791775955, "grad_norm": 1.6484375, "learning_rate": 0.0009212279525486589, "loss": 1.6114, "step": 2161 }, { "epoch": 0.1829257974447923, "grad_norm": 1.53125, "learning_rate": 0.0009211561815810368, "loss": 1.1864, "step": 2162 }, { "epoch": 0.18301040697182502, "grad_norm": 1.6875, "learning_rate": 0.0009210843807306567, "loss": 1.6037, "step": 2163 }, { "epoch": 0.18309501649885776, "grad_norm": 1.875, "learning_rate": 0.0009210125500026135, "loss": 1.5924, "step": 2164 }, { "epoch": 0.18317962602589052, "grad_norm": 1.7734375, "learning_rate": 0.0009209406894020036, "loss": 1.7134, "step": 2165 }, { "epoch": 0.18326423555292326, "grad_norm": 1.5390625, "learning_rate": 0.0009208687989339261, "loss": 1.0128, "step": 2166 }, { "epoch": 0.183348845079956, "grad_norm": 1.6328125, "learning_rate": 0.0009207968786034815, "loss": 1.1552, "step": 2167 }, { "epoch": 0.18343345460698876, "grad_norm": 1.8046875, "learning_rate": 0.0009207249284157732, "loss": 1.3322, "step": 2168 }, { "epoch": 0.1835180641340215, "grad_norm": 1.4140625, "learning_rate": 0.0009206529483759063, "loss": 1.1785, "step": 2169 }, { "epoch": 0.18360267366105423, "grad_norm": 2.1875, "learning_rate": 0.000920580938488988, "loss": 1.2554, "step": 2170 }, { "epoch": 0.18368728318808697, "grad_norm": 7.59375, "learning_rate": 0.0009205088987601275, "loss": 1.3092, "step": 2171 }, { "epoch": 0.18377189271511973, "grad_norm": 2.171875, "learning_rate": 0.0009204368291944368, "loss": 1.4937, "step": 2172 }, { "epoch": 0.18385650224215247, "grad_norm": 1.9453125, "learning_rate": 0.0009203647297970292, "loss": 1.39, "step": 2173 }, { "epoch": 0.1839411117691852, "grad_norm": 1.375, "learning_rate": 0.0009202926005730205, "loss": 1.0359, "step": 2174 }, { "epoch": 0.18402572129621794, "grad_norm": 1.6484375, "learning_rate": 0.0009202204415275286, "loss": 1.3274, "step": 2175 }, { "epoch": 0.1841103308232507, "grad_norm": 1.515625, "learning_rate": 0.0009201482526656734, "loss": 1.4137, "step": 2176 }, { "epoch": 0.18419494035028344, "grad_norm": 2.046875, "learning_rate": 0.0009200760339925771, "loss": 1.5318, "step": 2177 }, { "epoch": 0.18427954987731618, "grad_norm": 1.5703125, "learning_rate": 0.0009200037855133638, "loss": 1.3045, "step": 2178 }, { "epoch": 0.18436415940434894, "grad_norm": 2.3125, "learning_rate": 0.0009199315072331599, "loss": 1.282, "step": 2179 }, { "epoch": 0.18444876893138168, "grad_norm": 1.75, "learning_rate": 0.0009198591991570938, "loss": 1.7231, "step": 2180 }, { "epoch": 0.18453337845841442, "grad_norm": 1.859375, "learning_rate": 0.0009197868612902962, "loss": 1.4484, "step": 2181 }, { "epoch": 0.18461798798544715, "grad_norm": 1.8046875, "learning_rate": 0.0009197144936378996, "loss": 1.5176, "step": 2182 }, { "epoch": 0.18470259751247992, "grad_norm": 1.5390625, "learning_rate": 0.0009196420962050386, "loss": 1.2198, "step": 2183 }, { "epoch": 0.18478720703951265, "grad_norm": 2.0625, "learning_rate": 0.0009195696689968507, "loss": 1.5066, "step": 2184 }, { "epoch": 0.1848718165665454, "grad_norm": 1.84375, "learning_rate": 0.0009194972120184744, "loss": 1.7683, "step": 2185 }, { "epoch": 0.18495642609357812, "grad_norm": 1.7109375, "learning_rate": 0.0009194247252750509, "loss": 1.5985, "step": 2186 }, { "epoch": 0.1850410356206109, "grad_norm": 1.6015625, "learning_rate": 0.0009193522087717234, "loss": 1.2666, "step": 2187 }, { "epoch": 0.18512564514764362, "grad_norm": 1.859375, "learning_rate": 0.0009192796625136374, "loss": 1.7045, "step": 2188 }, { "epoch": 0.18521025467467636, "grad_norm": 1.921875, "learning_rate": 0.0009192070865059406, "loss": 1.3567, "step": 2189 }, { "epoch": 0.18529486420170913, "grad_norm": 1.734375, "learning_rate": 0.0009191344807537818, "loss": 1.3978, "step": 2190 }, { "epoch": 0.18537947372874186, "grad_norm": 1.34375, "learning_rate": 0.0009190618452623134, "loss": 1.3272, "step": 2191 }, { "epoch": 0.1854640832557746, "grad_norm": 1.4765625, "learning_rate": 0.0009189891800366887, "loss": 1.1084, "step": 2192 }, { "epoch": 0.18554869278280733, "grad_norm": 1.859375, "learning_rate": 0.0009189164850820639, "loss": 1.3813, "step": 2193 }, { "epoch": 0.1856333023098401, "grad_norm": 2.0, "learning_rate": 0.000918843760403597, "loss": 1.2002, "step": 2194 }, { "epoch": 0.18571791183687283, "grad_norm": 1.6953125, "learning_rate": 0.0009187710060064478, "loss": 1.3292, "step": 2195 }, { "epoch": 0.18580252136390557, "grad_norm": 1.7578125, "learning_rate": 0.0009186982218957789, "loss": 1.5023, "step": 2196 }, { "epoch": 0.1858871308909383, "grad_norm": 1.5390625, "learning_rate": 0.0009186254080767546, "loss": 1.1386, "step": 2197 }, { "epoch": 0.18597174041797107, "grad_norm": 1.7265625, "learning_rate": 0.000918552564554541, "loss": 1.3189, "step": 2198 }, { "epoch": 0.1860563499450038, "grad_norm": 1.6953125, "learning_rate": 0.000918479691334307, "loss": 1.4116, "step": 2199 }, { "epoch": 0.18614095947203654, "grad_norm": 1.4453125, "learning_rate": 0.000918406788421223, "loss": 1.254, "step": 2200 }, { "epoch": 0.1862255689990693, "grad_norm": 1.375, "learning_rate": 0.000918333855820462, "loss": 1.2948, "step": 2201 }, { "epoch": 0.18631017852610204, "grad_norm": 1.296875, "learning_rate": 0.0009182608935371987, "loss": 1.1104, "step": 2202 }, { "epoch": 0.18639478805313478, "grad_norm": 1.7890625, "learning_rate": 0.0009181879015766101, "loss": 1.6329, "step": 2203 }, { "epoch": 0.18647939758016752, "grad_norm": 1.734375, "learning_rate": 0.0009181148799438753, "loss": 1.7026, "step": 2204 }, { "epoch": 0.18656400710720028, "grad_norm": 1.7421875, "learning_rate": 0.0009180418286441756, "loss": 1.207, "step": 2205 }, { "epoch": 0.18664861663423302, "grad_norm": 1.875, "learning_rate": 0.000917968747682694, "loss": 1.4727, "step": 2206 }, { "epoch": 0.18673322616126575, "grad_norm": 1.5703125, "learning_rate": 0.0009178956370646161, "loss": 1.1654, "step": 2207 }, { "epoch": 0.1868178356882985, "grad_norm": 1.875, "learning_rate": 0.0009178224967951294, "loss": 1.755, "step": 2208 }, { "epoch": 0.18690244521533125, "grad_norm": 1.578125, "learning_rate": 0.0009177493268794235, "loss": 1.4767, "step": 2209 }, { "epoch": 0.186987054742364, "grad_norm": 1.4140625, "learning_rate": 0.0009176761273226901, "loss": 1.3751, "step": 2210 }, { "epoch": 0.18707166426939673, "grad_norm": 1.9375, "learning_rate": 0.0009176028981301229, "loss": 1.1781, "step": 2211 }, { "epoch": 0.1871562737964295, "grad_norm": 2.0625, "learning_rate": 0.0009175296393069179, "loss": 1.3688, "step": 2212 }, { "epoch": 0.18724088332346223, "grad_norm": 1.4921875, "learning_rate": 0.0009174563508582731, "loss": 1.3899, "step": 2213 }, { "epoch": 0.18732549285049496, "grad_norm": 1.484375, "learning_rate": 0.0009173830327893886, "loss": 1.1216, "step": 2214 }, { "epoch": 0.1874101023775277, "grad_norm": 1.734375, "learning_rate": 0.0009173096851054667, "loss": 1.6513, "step": 2215 }, { "epoch": 0.18749471190456046, "grad_norm": 1.890625, "learning_rate": 0.0009172363078117117, "loss": 1.3796, "step": 2216 }, { "epoch": 0.1875793214315932, "grad_norm": 2.0625, "learning_rate": 0.0009171629009133297, "loss": 1.5884, "step": 2217 }, { "epoch": 0.18766393095862594, "grad_norm": 2.15625, "learning_rate": 0.0009170894644155299, "loss": 1.5429, "step": 2218 }, { "epoch": 0.18774854048565867, "grad_norm": 1.6875, "learning_rate": 0.0009170159983235222, "loss": 1.3887, "step": 2219 }, { "epoch": 0.18783315001269144, "grad_norm": 1.5625, "learning_rate": 0.0009169425026425197, "loss": 1.2938, "step": 2220 }, { "epoch": 0.18791775953972417, "grad_norm": 1.7109375, "learning_rate": 0.0009168689773777373, "loss": 1.3356, "step": 2221 }, { "epoch": 0.1880023690667569, "grad_norm": 1.609375, "learning_rate": 0.0009167954225343918, "loss": 1.2224, "step": 2222 }, { "epoch": 0.18808697859378967, "grad_norm": 1.5625, "learning_rate": 0.000916721838117702, "loss": 1.167, "step": 2223 }, { "epoch": 0.1881715881208224, "grad_norm": 1.765625, "learning_rate": 0.0009166482241328893, "loss": 1.3221, "step": 2224 }, { "epoch": 0.18825619764785514, "grad_norm": 1.46875, "learning_rate": 0.0009165745805851768, "loss": 1.2626, "step": 2225 }, { "epoch": 0.18834080717488788, "grad_norm": 1.5625, "learning_rate": 0.0009165009074797899, "loss": 1.1417, "step": 2226 }, { "epoch": 0.18842541670192064, "grad_norm": 1.9375, "learning_rate": 0.0009164272048219558, "loss": 1.8275, "step": 2227 }, { "epoch": 0.18851002622895338, "grad_norm": 1.5625, "learning_rate": 0.0009163534726169042, "loss": 1.1833, "step": 2228 }, { "epoch": 0.18859463575598612, "grad_norm": 1.1875, "learning_rate": 0.0009162797108698669, "loss": 0.9852, "step": 2229 }, { "epoch": 0.18867924528301888, "grad_norm": 1.96875, "learning_rate": 0.000916205919586077, "loss": 1.7494, "step": 2230 }, { "epoch": 0.18876385481005162, "grad_norm": 1.484375, "learning_rate": 0.0009161320987707708, "loss": 1.1872, "step": 2231 }, { "epoch": 0.18884846433708435, "grad_norm": 1.8203125, "learning_rate": 0.0009160582484291862, "loss": 1.4756, "step": 2232 }, { "epoch": 0.1889330738641171, "grad_norm": 1.4453125, "learning_rate": 0.0009159843685665628, "loss": 1.1538, "step": 2233 }, { "epoch": 0.18901768339114985, "grad_norm": 2.171875, "learning_rate": 0.0009159104591881428, "loss": 1.3384, "step": 2234 }, { "epoch": 0.1891022929181826, "grad_norm": 1.4921875, "learning_rate": 0.0009158365202991708, "loss": 1.2659, "step": 2235 }, { "epoch": 0.18918690244521533, "grad_norm": 1.828125, "learning_rate": 0.0009157625519048925, "loss": 1.2394, "step": 2236 }, { "epoch": 0.18927151197224806, "grad_norm": 1.75, "learning_rate": 0.0009156885540105566, "loss": 1.9432, "step": 2237 }, { "epoch": 0.18935612149928083, "grad_norm": 1.7109375, "learning_rate": 0.0009156145266214135, "loss": 1.1162, "step": 2238 }, { "epoch": 0.18944073102631356, "grad_norm": 2.28125, "learning_rate": 0.0009155404697427157, "loss": 1.5853, "step": 2239 }, { "epoch": 0.1895253405533463, "grad_norm": 1.7734375, "learning_rate": 0.0009154663833797177, "loss": 1.2452, "step": 2240 }, { "epoch": 0.18960995008037906, "grad_norm": 1.40625, "learning_rate": 0.0009153922675376765, "loss": 1.4376, "step": 2241 }, { "epoch": 0.1896945596074118, "grad_norm": 1.2734375, "learning_rate": 0.0009153181222218507, "loss": 0.9616, "step": 2242 }, { "epoch": 0.18977916913444454, "grad_norm": 1.828125, "learning_rate": 0.0009152439474375014, "loss": 1.3223, "step": 2243 }, { "epoch": 0.18986377866147727, "grad_norm": 1.3359375, "learning_rate": 0.0009151697431898913, "loss": 1.4978, "step": 2244 }, { "epoch": 0.18994838818851004, "grad_norm": 1.3203125, "learning_rate": 0.000915095509484286, "loss": 1.107, "step": 2245 }, { "epoch": 0.19003299771554277, "grad_norm": 1.75, "learning_rate": 0.0009150212463259522, "loss": 1.6537, "step": 2246 }, { "epoch": 0.1901176072425755, "grad_norm": 1.453125, "learning_rate": 0.0009149469537201595, "loss": 1.5124, "step": 2247 }, { "epoch": 0.19020221676960825, "grad_norm": 1.984375, "learning_rate": 0.0009148726316721791, "loss": 1.3403, "step": 2248 }, { "epoch": 0.190286826296641, "grad_norm": 1.1796875, "learning_rate": 0.0009147982801872845, "loss": 1.0746, "step": 2249 }, { "epoch": 0.19037143582367375, "grad_norm": 3.21875, "learning_rate": 0.0009147238992707512, "loss": 1.4484, "step": 2250 }, { "epoch": 0.19045604535070648, "grad_norm": 2.15625, "learning_rate": 0.0009146494889278568, "loss": 1.9725, "step": 2251 }, { "epoch": 0.19054065487773925, "grad_norm": 1.75, "learning_rate": 0.0009145750491638811, "loss": 1.4135, "step": 2252 }, { "epoch": 0.19062526440477198, "grad_norm": 1.5703125, "learning_rate": 0.0009145005799841059, "loss": 1.7837, "step": 2253 }, { "epoch": 0.19070987393180472, "grad_norm": 1.890625, "learning_rate": 0.0009144260813938153, "loss": 1.3222, "step": 2254 }, { "epoch": 0.19079448345883745, "grad_norm": 1.8046875, "learning_rate": 0.000914351553398295, "loss": 1.2527, "step": 2255 }, { "epoch": 0.19087909298587022, "grad_norm": 1.3203125, "learning_rate": 0.0009142769960028327, "loss": 1.0412, "step": 2256 }, { "epoch": 0.19096370251290296, "grad_norm": 1.703125, "learning_rate": 0.0009142024092127195, "loss": 1.4463, "step": 2257 }, { "epoch": 0.1910483120399357, "grad_norm": 1.5546875, "learning_rate": 0.0009141277930332469, "loss": 1.0304, "step": 2258 }, { "epoch": 0.19113292156696843, "grad_norm": 1.6171875, "learning_rate": 0.0009140531474697094, "loss": 1.1613, "step": 2259 }, { "epoch": 0.1912175310940012, "grad_norm": 1.484375, "learning_rate": 0.0009139784725274037, "loss": 1.5207, "step": 2260 }, { "epoch": 0.19130214062103393, "grad_norm": 1.671875, "learning_rate": 0.0009139037682116278, "loss": 1.1336, "step": 2261 }, { "epoch": 0.19138675014806666, "grad_norm": 2.265625, "learning_rate": 0.0009138290345276825, "loss": 1.6996, "step": 2262 }, { "epoch": 0.19147135967509943, "grad_norm": 1.7890625, "learning_rate": 0.0009137542714808707, "loss": 1.3926, "step": 2263 }, { "epoch": 0.19155596920213216, "grad_norm": 1.671875, "learning_rate": 0.0009136794790764967, "loss": 1.2748, "step": 2264 }, { "epoch": 0.1916405787291649, "grad_norm": 1.8046875, "learning_rate": 0.0009136046573198677, "loss": 1.5361, "step": 2265 }, { "epoch": 0.19172518825619764, "grad_norm": 1.8203125, "learning_rate": 0.0009135298062162925, "loss": 1.8665, "step": 2266 }, { "epoch": 0.1918097977832304, "grad_norm": 1.859375, "learning_rate": 0.0009134549257710819, "loss": 1.736, "step": 2267 }, { "epoch": 0.19189440731026314, "grad_norm": 3.328125, "learning_rate": 0.0009133800159895493, "loss": 1.7561, "step": 2268 }, { "epoch": 0.19197901683729587, "grad_norm": 2.390625, "learning_rate": 0.0009133050768770096, "loss": 1.887, "step": 2269 }, { "epoch": 0.1920636263643286, "grad_norm": 1.671875, "learning_rate": 0.0009132301084387802, "loss": 1.4751, "step": 2270 }, { "epoch": 0.19214823589136137, "grad_norm": 2.171875, "learning_rate": 0.0009131551106801803, "loss": 2.0235, "step": 2271 }, { "epoch": 0.1922328454183941, "grad_norm": 1.2890625, "learning_rate": 0.0009130800836065313, "loss": 1.135, "step": 2272 }, { "epoch": 0.19231745494542685, "grad_norm": 1.7890625, "learning_rate": 0.0009130050272231568, "loss": 1.3973, "step": 2273 }, { "epoch": 0.1924020644724596, "grad_norm": 1.359375, "learning_rate": 0.0009129299415353823, "loss": 0.998, "step": 2274 }, { "epoch": 0.19248667399949235, "grad_norm": 1.671875, "learning_rate": 0.0009128548265485355, "loss": 1.2169, "step": 2275 }, { "epoch": 0.19257128352652508, "grad_norm": 1.453125, "learning_rate": 0.0009127796822679459, "loss": 1.475, "step": 2276 }, { "epoch": 0.19265589305355782, "grad_norm": 1.8359375, "learning_rate": 0.0009127045086989453, "loss": 1.6614, "step": 2277 }, { "epoch": 0.19274050258059058, "grad_norm": 1.7265625, "learning_rate": 0.000912629305846868, "loss": 1.6123, "step": 2278 }, { "epoch": 0.19282511210762332, "grad_norm": 1.625, "learning_rate": 0.0009125540737170495, "loss": 1.4699, "step": 2279 }, { "epoch": 0.19290972163465606, "grad_norm": 1.671875, "learning_rate": 0.000912478812314828, "loss": 1.2954, "step": 2280 }, { "epoch": 0.1929943311616888, "grad_norm": 1.65625, "learning_rate": 0.0009124035216455435, "loss": 1.7954, "step": 2281 }, { "epoch": 0.19307894068872156, "grad_norm": 1.3984375, "learning_rate": 0.0009123282017145384, "loss": 1.0851, "step": 2282 }, { "epoch": 0.1931635502157543, "grad_norm": 1.484375, "learning_rate": 0.0009122528525271567, "loss": 1.1904, "step": 2283 }, { "epoch": 0.19324815974278703, "grad_norm": 1.734375, "learning_rate": 0.000912177474088745, "loss": 1.602, "step": 2284 }, { "epoch": 0.1933327692698198, "grad_norm": 1.7421875, "learning_rate": 0.0009121020664046516, "loss": 1.3934, "step": 2285 }, { "epoch": 0.19341737879685253, "grad_norm": 1.7265625, "learning_rate": 0.0009120266294802269, "loss": 1.5267, "step": 2286 }, { "epoch": 0.19350198832388527, "grad_norm": 2.71875, "learning_rate": 0.0009119511633208234, "loss": 2.046, "step": 2287 }, { "epoch": 0.193586597850918, "grad_norm": 1.8828125, "learning_rate": 0.0009118756679317958, "loss": 1.4614, "step": 2288 }, { "epoch": 0.19367120737795077, "grad_norm": 2.328125, "learning_rate": 0.0009118001433185011, "loss": 2.3825, "step": 2289 }, { "epoch": 0.1937558169049835, "grad_norm": 1.4609375, "learning_rate": 0.0009117245894862978, "loss": 1.0952, "step": 2290 }, { "epoch": 0.19384042643201624, "grad_norm": 2.015625, "learning_rate": 0.0009116490064405467, "loss": 1.3006, "step": 2291 }, { "epoch": 0.193925035959049, "grad_norm": 1.9140625, "learning_rate": 0.0009115733941866108, "loss": 1.9692, "step": 2292 }, { "epoch": 0.19400964548608174, "grad_norm": 1.2109375, "learning_rate": 0.0009114977527298552, "loss": 1.1803, "step": 2293 }, { "epoch": 0.19409425501311448, "grad_norm": 1.53125, "learning_rate": 0.0009114220820756468, "loss": 1.2199, "step": 2294 }, { "epoch": 0.1941788645401472, "grad_norm": 2.671875, "learning_rate": 0.0009113463822293549, "loss": 1.6762, "step": 2295 }, { "epoch": 0.19426347406717998, "grad_norm": 1.59375, "learning_rate": 0.0009112706531963506, "loss": 1.5845, "step": 2296 }, { "epoch": 0.1943480835942127, "grad_norm": 1.5859375, "learning_rate": 0.0009111948949820073, "loss": 1.7383, "step": 2297 }, { "epoch": 0.19443269312124545, "grad_norm": 1.9609375, "learning_rate": 0.0009111191075917003, "loss": 1.8204, "step": 2298 }, { "epoch": 0.19451730264827818, "grad_norm": 1.71875, "learning_rate": 0.0009110432910308068, "loss": 1.1609, "step": 2299 }, { "epoch": 0.19460191217531095, "grad_norm": 1.453125, "learning_rate": 0.0009109674453047068, "loss": 1.1249, "step": 2300 }, { "epoch": 0.19468652170234368, "grad_norm": 1.5, "learning_rate": 0.0009108915704187814, "loss": 1.2179, "step": 2301 }, { "epoch": 0.19477113122937642, "grad_norm": 1.6015625, "learning_rate": 0.0009108156663784144, "loss": 1.7145, "step": 2302 }, { "epoch": 0.19485574075640918, "grad_norm": 1.3671875, "learning_rate": 0.0009107397331889915, "loss": 1.0961, "step": 2303 }, { "epoch": 0.19494035028344192, "grad_norm": 1.59375, "learning_rate": 0.0009106637708559006, "loss": 1.1866, "step": 2304 }, { "epoch": 0.19502495981047466, "grad_norm": 1.296875, "learning_rate": 0.0009105877793845314, "loss": 1.2544, "step": 2305 }, { "epoch": 0.1951095693375074, "grad_norm": 1.8671875, "learning_rate": 0.0009105117587802758, "loss": 1.1294, "step": 2306 }, { "epoch": 0.19519417886454016, "grad_norm": 7.40625, "learning_rate": 0.0009104357090485278, "loss": 1.5854, "step": 2307 }, { "epoch": 0.1952787883915729, "grad_norm": 1.6171875, "learning_rate": 0.0009103596301946833, "loss": 1.2407, "step": 2308 }, { "epoch": 0.19536339791860563, "grad_norm": 2.953125, "learning_rate": 0.0009102835222241408, "loss": 1.5886, "step": 2309 }, { "epoch": 0.19544800744563837, "grad_norm": 1.640625, "learning_rate": 0.0009102073851423001, "loss": 1.5726, "step": 2310 }, { "epoch": 0.19553261697267113, "grad_norm": 2.21875, "learning_rate": 0.0009101312189545635, "loss": 1.0301, "step": 2311 }, { "epoch": 0.19561722649970387, "grad_norm": 2.15625, "learning_rate": 0.0009100550236663354, "loss": 1.3398, "step": 2312 }, { "epoch": 0.1957018360267366, "grad_norm": 1.609375, "learning_rate": 0.0009099787992830222, "loss": 1.185, "step": 2313 }, { "epoch": 0.19578644555376937, "grad_norm": 1.9921875, "learning_rate": 0.0009099025458100321, "loss": 1.3896, "step": 2314 }, { "epoch": 0.1958710550808021, "grad_norm": 2.28125, "learning_rate": 0.000909826263252776, "loss": 1.402, "step": 2315 }, { "epoch": 0.19595566460783484, "grad_norm": 1.671875, "learning_rate": 0.0009097499516166661, "loss": 1.1302, "step": 2316 }, { "epoch": 0.19604027413486758, "grad_norm": 1.4921875, "learning_rate": 0.000909673610907117, "loss": 1.0956, "step": 2317 }, { "epoch": 0.19612488366190034, "grad_norm": 1.921875, "learning_rate": 0.0009095972411295457, "loss": 1.7005, "step": 2318 }, { "epoch": 0.19620949318893308, "grad_norm": 1.4453125, "learning_rate": 0.0009095208422893708, "loss": 1.1225, "step": 2319 }, { "epoch": 0.1962941027159658, "grad_norm": 1.5546875, "learning_rate": 0.0009094444143920131, "loss": 1.3376, "step": 2320 }, { "epoch": 0.19637871224299855, "grad_norm": 1.328125, "learning_rate": 0.0009093679574428954, "loss": 1.1678, "step": 2321 }, { "epoch": 0.1964633217700313, "grad_norm": 1.9140625, "learning_rate": 0.0009092914714474427, "loss": 1.9783, "step": 2322 }, { "epoch": 0.19654793129706405, "grad_norm": 2.203125, "learning_rate": 0.0009092149564110818, "loss": 1.3787, "step": 2323 }, { "epoch": 0.19663254082409679, "grad_norm": 1.359375, "learning_rate": 0.0009091384123392424, "loss": 1.1309, "step": 2324 }, { "epoch": 0.19671715035112955, "grad_norm": 1.7109375, "learning_rate": 0.0009090618392373548, "loss": 1.3368, "step": 2325 }, { "epoch": 0.19680175987816229, "grad_norm": 1.421875, "learning_rate": 0.0009089852371108527, "loss": 1.0897, "step": 2326 }, { "epoch": 0.19688636940519502, "grad_norm": 1.8203125, "learning_rate": 0.000908908605965171, "loss": 1.3994, "step": 2327 }, { "epoch": 0.19697097893222776, "grad_norm": 1.3828125, "learning_rate": 0.0009088319458057473, "loss": 1.257, "step": 2328 }, { "epoch": 0.19705558845926052, "grad_norm": 1.2265625, "learning_rate": 0.0009087552566380209, "loss": 1.1568, "step": 2329 }, { "epoch": 0.19714019798629326, "grad_norm": 1.484375, "learning_rate": 0.000908678538467433, "loss": 1.1905, "step": 2330 }, { "epoch": 0.197224807513326, "grad_norm": 1.390625, "learning_rate": 0.0009086017912994271, "loss": 1.2623, "step": 2331 }, { "epoch": 0.19730941704035873, "grad_norm": 1.1640625, "learning_rate": 0.000908525015139449, "loss": 1.0003, "step": 2332 }, { "epoch": 0.1973940265673915, "grad_norm": 1.796875, "learning_rate": 0.0009084482099929461, "loss": 1.1562, "step": 2333 }, { "epoch": 0.19747863609442423, "grad_norm": 2.25, "learning_rate": 0.000908371375865368, "loss": 1.5529, "step": 2334 }, { "epoch": 0.19756324562145697, "grad_norm": 1.46875, "learning_rate": 0.0009082945127621664, "loss": 1.1999, "step": 2335 }, { "epoch": 0.19764785514848973, "grad_norm": 1.6796875, "learning_rate": 0.0009082176206887951, "loss": 1.5201, "step": 2336 }, { "epoch": 0.19773246467552247, "grad_norm": 1.421875, "learning_rate": 0.00090814069965071, "loss": 1.4584, "step": 2337 }, { "epoch": 0.1978170742025552, "grad_norm": 1.375, "learning_rate": 0.0009080637496533688, "loss": 1.1343, "step": 2338 }, { "epoch": 0.19790168372958794, "grad_norm": 1.8203125, "learning_rate": 0.0009079867707022316, "loss": 1.6996, "step": 2339 }, { "epoch": 0.1979862932566207, "grad_norm": 1.8203125, "learning_rate": 0.0009079097628027601, "loss": 1.6344, "step": 2340 }, { "epoch": 0.19807090278365344, "grad_norm": 1.4140625, "learning_rate": 0.0009078327259604186, "loss": 1.3483, "step": 2341 }, { "epoch": 0.19815551231068618, "grad_norm": 2.0, "learning_rate": 0.000907755660180673, "loss": 1.6691, "step": 2342 }, { "epoch": 0.1982401218377189, "grad_norm": 1.6953125, "learning_rate": 0.0009076785654689915, "loss": 1.2387, "step": 2343 }, { "epoch": 0.19832473136475168, "grad_norm": 2.015625, "learning_rate": 0.0009076014418308442, "loss": 1.8789, "step": 2344 }, { "epoch": 0.1984093408917844, "grad_norm": 1.7109375, "learning_rate": 0.0009075242892717038, "loss": 1.1345, "step": 2345 }, { "epoch": 0.19849395041881715, "grad_norm": 1.2890625, "learning_rate": 0.0009074471077970438, "loss": 1.3652, "step": 2346 }, { "epoch": 0.19857855994584991, "grad_norm": 1.8125, "learning_rate": 0.000907369897412341, "loss": 1.1288, "step": 2347 }, { "epoch": 0.19866316947288265, "grad_norm": 1.625, "learning_rate": 0.0009072926581230738, "loss": 1.4516, "step": 2348 }, { "epoch": 0.1987477789999154, "grad_norm": 1.5859375, "learning_rate": 0.0009072153899347227, "loss": 1.4141, "step": 2349 }, { "epoch": 0.19883238852694812, "grad_norm": 1.84375, "learning_rate": 0.00090713809285277, "loss": 1.2366, "step": 2350 }, { "epoch": 0.1989169980539809, "grad_norm": 1.84375, "learning_rate": 0.0009070607668827004, "loss": 1.2034, "step": 2351 }, { "epoch": 0.19900160758101362, "grad_norm": 1.375, "learning_rate": 0.0009069834120300004, "loss": 1.0135, "step": 2352 }, { "epoch": 0.19908621710804636, "grad_norm": 1.625, "learning_rate": 0.0009069060283001586, "loss": 1.5767, "step": 2353 }, { "epoch": 0.19917082663507912, "grad_norm": 1.3125, "learning_rate": 0.0009068286156986658, "loss": 1.2334, "step": 2354 }, { "epoch": 0.19925543616211186, "grad_norm": 1.6640625, "learning_rate": 0.0009067511742310146, "loss": 1.4696, "step": 2355 }, { "epoch": 0.1993400456891446, "grad_norm": 1.671875, "learning_rate": 0.0009066737039027001, "loss": 1.364, "step": 2356 }, { "epoch": 0.19942465521617733, "grad_norm": 1.53125, "learning_rate": 0.0009065962047192189, "loss": 1.2737, "step": 2357 }, { "epoch": 0.1995092647432101, "grad_norm": 1.6171875, "learning_rate": 0.0009065186766860698, "loss": 1.9808, "step": 2358 }, { "epoch": 0.19959387427024283, "grad_norm": 1.6328125, "learning_rate": 0.0009064411198087539, "loss": 1.4105, "step": 2359 }, { "epoch": 0.19967848379727557, "grad_norm": 1.078125, "learning_rate": 0.000906363534092774, "loss": 0.9741, "step": 2360 }, { "epoch": 0.1997630933243083, "grad_norm": 1.6015625, "learning_rate": 0.0009062859195436355, "loss": 1.4164, "step": 2361 }, { "epoch": 0.19984770285134107, "grad_norm": 5.625, "learning_rate": 0.0009062082761668451, "loss": 1.327, "step": 2362 }, { "epoch": 0.1999323123783738, "grad_norm": 1.8359375, "learning_rate": 0.000906130603967912, "loss": 1.5411, "step": 2363 }, { "epoch": 0.20001692190540654, "grad_norm": 1.7421875, "learning_rate": 0.0009060529029523474, "loss": 1.4187, "step": 2364 }, { "epoch": 0.2001015314324393, "grad_norm": 2.0, "learning_rate": 0.0009059751731256646, "loss": 1.5101, "step": 2365 }, { "epoch": 0.20018614095947204, "grad_norm": 2.1875, "learning_rate": 0.0009058974144933787, "loss": 1.767, "step": 2366 }, { "epoch": 0.20027075048650478, "grad_norm": 2.0625, "learning_rate": 0.000905819627061007, "loss": 1.4327, "step": 2367 }, { "epoch": 0.20035536001353751, "grad_norm": 1.953125, "learning_rate": 0.0009057418108340689, "loss": 1.5096, "step": 2368 }, { "epoch": 0.20043996954057028, "grad_norm": 2.203125, "learning_rate": 0.0009056639658180859, "loss": 1.2079, "step": 2369 }, { "epoch": 0.20052457906760301, "grad_norm": 1.21875, "learning_rate": 0.0009055860920185811, "loss": 0.9657, "step": 2370 }, { "epoch": 0.20060918859463575, "grad_norm": 1.6171875, "learning_rate": 0.0009055081894410802, "loss": 1.193, "step": 2371 }, { "epoch": 0.2006937981216685, "grad_norm": 1.7578125, "learning_rate": 0.0009054302580911107, "loss": 1.249, "step": 2372 }, { "epoch": 0.20077840764870125, "grad_norm": 1.3359375, "learning_rate": 0.0009053522979742022, "loss": 0.9413, "step": 2373 }, { "epoch": 0.200863017175734, "grad_norm": 1.671875, "learning_rate": 0.000905274309095886, "loss": 1.3488, "step": 2374 }, { "epoch": 0.20094762670276672, "grad_norm": 1.5, "learning_rate": 0.0009051962914616961, "loss": 1.1056, "step": 2375 }, { "epoch": 0.2010322362297995, "grad_norm": 1.4375, "learning_rate": 0.0009051182450771679, "loss": 1.1472, "step": 2376 }, { "epoch": 0.20111684575683222, "grad_norm": 1.7890625, "learning_rate": 0.0009050401699478392, "loss": 1.6678, "step": 2377 }, { "epoch": 0.20120145528386496, "grad_norm": 1.7109375, "learning_rate": 0.0009049620660792499, "loss": 1.3897, "step": 2378 }, { "epoch": 0.2012860648108977, "grad_norm": 2.34375, "learning_rate": 0.0009048839334769415, "loss": 1.8433, "step": 2379 }, { "epoch": 0.20137067433793046, "grad_norm": 2.234375, "learning_rate": 0.0009048057721464581, "loss": 2.0182, "step": 2380 }, { "epoch": 0.2014552838649632, "grad_norm": 1.8515625, "learning_rate": 0.0009047275820933453, "loss": 1.1403, "step": 2381 }, { "epoch": 0.20153989339199593, "grad_norm": 1.5078125, "learning_rate": 0.0009046493633231514, "loss": 1.2296, "step": 2382 }, { "epoch": 0.20162450291902867, "grad_norm": 1.46875, "learning_rate": 0.0009045711158414258, "loss": 1.1582, "step": 2383 }, { "epoch": 0.20170911244606143, "grad_norm": 2.046875, "learning_rate": 0.000904492839653721, "loss": 1.3214, "step": 2384 }, { "epoch": 0.20179372197309417, "grad_norm": 1.953125, "learning_rate": 0.0009044145347655907, "loss": 1.2736, "step": 2385 }, { "epoch": 0.2018783315001269, "grad_norm": 2.703125, "learning_rate": 0.000904336201182591, "loss": 1.4438, "step": 2386 }, { "epoch": 0.20196294102715967, "grad_norm": 1.6015625, "learning_rate": 0.0009042578389102801, "loss": 1.2284, "step": 2387 }, { "epoch": 0.2020475505541924, "grad_norm": 1.3125, "learning_rate": 0.0009041794479542181, "loss": 1.0408, "step": 2388 }, { "epoch": 0.20213216008122514, "grad_norm": 2.34375, "learning_rate": 0.0009041010283199671, "loss": 2.1843, "step": 2389 }, { "epoch": 0.20221676960825788, "grad_norm": 1.7890625, "learning_rate": 0.0009040225800130913, "loss": 1.5271, "step": 2390 }, { "epoch": 0.20230137913529064, "grad_norm": 1.9921875, "learning_rate": 0.0009039441030391569, "loss": 1.7187, "step": 2391 }, { "epoch": 0.20238598866232338, "grad_norm": 1.5703125, "learning_rate": 0.0009038655974037322, "loss": 1.4733, "step": 2392 }, { "epoch": 0.20247059818935612, "grad_norm": 1.578125, "learning_rate": 0.0009037870631123875, "loss": 0.9981, "step": 2393 }, { "epoch": 0.20255520771638885, "grad_norm": 2.140625, "learning_rate": 0.000903708500170695, "loss": 1.8024, "step": 2394 }, { "epoch": 0.20263981724342162, "grad_norm": 1.6328125, "learning_rate": 0.0009036299085842294, "loss": 1.3167, "step": 2395 }, { "epoch": 0.20272442677045435, "grad_norm": 1.5546875, "learning_rate": 0.0009035512883585668, "loss": 1.3702, "step": 2396 }, { "epoch": 0.2028090362974871, "grad_norm": 1.6953125, "learning_rate": 0.0009034726394992858, "loss": 1.3869, "step": 2397 }, { "epoch": 0.20289364582451985, "grad_norm": 1.7265625, "learning_rate": 0.0009033939620119668, "loss": 1.4452, "step": 2398 }, { "epoch": 0.2029782553515526, "grad_norm": 1.9609375, "learning_rate": 0.0009033152559021923, "loss": 1.3606, "step": 2399 }, { "epoch": 0.20306286487858533, "grad_norm": 2.0625, "learning_rate": 0.0009032365211755465, "loss": 1.7065, "step": 2400 }, { "epoch": 0.20314747440561806, "grad_norm": 1.71875, "learning_rate": 0.0009031577578376165, "loss": 1.2851, "step": 2401 }, { "epoch": 0.20323208393265083, "grad_norm": 5.96875, "learning_rate": 0.0009030789658939904, "loss": 1.1013, "step": 2402 }, { "epoch": 0.20331669345968356, "grad_norm": 1.828125, "learning_rate": 0.0009030001453502592, "loss": 1.013, "step": 2403 }, { "epoch": 0.2034013029867163, "grad_norm": 1.7265625, "learning_rate": 0.0009029212962120154, "loss": 1.3218, "step": 2404 }, { "epoch": 0.20348591251374903, "grad_norm": 1.8828125, "learning_rate": 0.0009028424184848536, "loss": 1.5475, "step": 2405 }, { "epoch": 0.2035705220407818, "grad_norm": 1.265625, "learning_rate": 0.0009027635121743705, "loss": 0.9247, "step": 2406 }, { "epoch": 0.20365513156781453, "grad_norm": 1.8203125, "learning_rate": 0.0009026845772861649, "loss": 1.6982, "step": 2407 }, { "epoch": 0.20373974109484727, "grad_norm": 1.953125, "learning_rate": 0.0009026056138258375, "loss": 1.606, "step": 2408 }, { "epoch": 0.20382435062188003, "grad_norm": 2.0625, "learning_rate": 0.0009025266217989912, "loss": 1.5454, "step": 2409 }, { "epoch": 0.20390896014891277, "grad_norm": 1.625, "learning_rate": 0.0009024476012112306, "loss": 0.9943, "step": 2410 }, { "epoch": 0.2039935696759455, "grad_norm": 2.9375, "learning_rate": 0.0009023685520681626, "loss": 1.9137, "step": 2411 }, { "epoch": 0.20407817920297824, "grad_norm": 1.5859375, "learning_rate": 0.0009022894743753962, "loss": 1.2983, "step": 2412 }, { "epoch": 0.204162788730011, "grad_norm": 1.6875, "learning_rate": 0.0009022103681385421, "loss": 1.4534, "step": 2413 }, { "epoch": 0.20424739825704374, "grad_norm": 1.9296875, "learning_rate": 0.0009021312333632133, "loss": 1.778, "step": 2414 }, { "epoch": 0.20433200778407648, "grad_norm": 1.953125, "learning_rate": 0.0009020520700550249, "loss": 1.1399, "step": 2415 }, { "epoch": 0.20441661731110924, "grad_norm": 1.5625, "learning_rate": 0.0009019728782195936, "loss": 1.4993, "step": 2416 }, { "epoch": 0.20450122683814198, "grad_norm": 1.3828125, "learning_rate": 0.0009018936578625385, "loss": 1.0006, "step": 2417 }, { "epoch": 0.20458583636517472, "grad_norm": 1.9921875, "learning_rate": 0.0009018144089894807, "loss": 1.7064, "step": 2418 }, { "epoch": 0.20467044589220745, "grad_norm": 1.3515625, "learning_rate": 0.0009017351316060429, "loss": 1.0797, "step": 2419 }, { "epoch": 0.20475505541924022, "grad_norm": 2.09375, "learning_rate": 0.0009016558257178506, "loss": 1.4292, "step": 2420 }, { "epoch": 0.20483966494627295, "grad_norm": 1.5078125, "learning_rate": 0.0009015764913305305, "loss": 1.441, "step": 2421 }, { "epoch": 0.2049242744733057, "grad_norm": 1.8515625, "learning_rate": 0.000901497128449712, "loss": 1.2225, "step": 2422 }, { "epoch": 0.20500888400033843, "grad_norm": 1.8984375, "learning_rate": 0.0009014177370810259, "loss": 1.3433, "step": 2423 }, { "epoch": 0.2050934935273712, "grad_norm": 2.21875, "learning_rate": 0.0009013383172301057, "loss": 1.5261, "step": 2424 }, { "epoch": 0.20517810305440393, "grad_norm": 1.8125, "learning_rate": 0.0009012588689025862, "loss": 1.3596, "step": 2425 }, { "epoch": 0.20526271258143666, "grad_norm": 1.8515625, "learning_rate": 0.0009011793921041049, "loss": 1.9291, "step": 2426 }, { "epoch": 0.20534732210846943, "grad_norm": 1.78125, "learning_rate": 0.0009010998868403011, "loss": 1.3361, "step": 2427 }, { "epoch": 0.20543193163550216, "grad_norm": 1.8671875, "learning_rate": 0.0009010203531168155, "loss": 1.4515, "step": 2428 }, { "epoch": 0.2055165411625349, "grad_norm": 2.328125, "learning_rate": 0.0009009407909392917, "loss": 1.7527, "step": 2429 }, { "epoch": 0.20560115068956764, "grad_norm": 1.59375, "learning_rate": 0.0009008612003133749, "loss": 1.3654, "step": 2430 }, { "epoch": 0.2056857602166004, "grad_norm": 1.8671875, "learning_rate": 0.0009007815812447125, "loss": 1.5151, "step": 2431 }, { "epoch": 0.20577036974363314, "grad_norm": 1.59375, "learning_rate": 0.0009007019337389537, "loss": 1.9208, "step": 2432 }, { "epoch": 0.20585497927066587, "grad_norm": 1.4921875, "learning_rate": 0.0009006222578017498, "loss": 1.4966, "step": 2433 }, { "epoch": 0.2059395887976986, "grad_norm": 1.421875, "learning_rate": 0.000900542553438754, "loss": 1.5673, "step": 2434 }, { "epoch": 0.20602419832473137, "grad_norm": 1.703125, "learning_rate": 0.0009004628206556219, "loss": 1.3673, "step": 2435 }, { "epoch": 0.2061088078517641, "grad_norm": 2.140625, "learning_rate": 0.0009003830594580107, "loss": 1.4469, "step": 2436 }, { "epoch": 0.20619341737879684, "grad_norm": 2.203125, "learning_rate": 0.00090030326985158, "loss": 2.2375, "step": 2437 }, { "epoch": 0.2062780269058296, "grad_norm": 1.6875, "learning_rate": 0.0009002234518419909, "loss": 1.8203, "step": 2438 }, { "epoch": 0.20636263643286235, "grad_norm": 1.421875, "learning_rate": 0.000900143605434907, "loss": 1.0731, "step": 2439 }, { "epoch": 0.20644724595989508, "grad_norm": 1.5546875, "learning_rate": 0.0009000637306359938, "loss": 1.1951, "step": 2440 }, { "epoch": 0.20653185548692782, "grad_norm": 1.2734375, "learning_rate": 0.0008999838274509185, "loss": 1.1173, "step": 2441 }, { "epoch": 0.20661646501396058, "grad_norm": 1.546875, "learning_rate": 0.0008999038958853507, "loss": 1.7688, "step": 2442 }, { "epoch": 0.20670107454099332, "grad_norm": 1.640625, "learning_rate": 0.0008998239359449618, "loss": 1.3929, "step": 2443 }, { "epoch": 0.20678568406802605, "grad_norm": 1.890625, "learning_rate": 0.0008997439476354255, "loss": 1.383, "step": 2444 }, { "epoch": 0.2068702935950588, "grad_norm": 1.453125, "learning_rate": 0.0008996639309624171, "loss": 1.4021, "step": 2445 }, { "epoch": 0.20695490312209155, "grad_norm": 1.8828125, "learning_rate": 0.0008995838859316141, "loss": 1.8299, "step": 2446 }, { "epoch": 0.2070395126491243, "grad_norm": 1.375, "learning_rate": 0.0008995038125486961, "loss": 1.2027, "step": 2447 }, { "epoch": 0.20712412217615703, "grad_norm": 1.6171875, "learning_rate": 0.0008994237108193448, "loss": 1.5898, "step": 2448 }, { "epoch": 0.2072087317031898, "grad_norm": 1.6953125, "learning_rate": 0.0008993435807492433, "loss": 1.8602, "step": 2449 }, { "epoch": 0.20729334123022253, "grad_norm": 1.921875, "learning_rate": 0.0008992634223440777, "loss": 2.0188, "step": 2450 }, { "epoch": 0.20737795075725526, "grad_norm": 1.5390625, "learning_rate": 0.000899183235609535, "loss": 1.2205, "step": 2451 }, { "epoch": 0.207462560284288, "grad_norm": 2.140625, "learning_rate": 0.0008991030205513053, "loss": 2.3748, "step": 2452 }, { "epoch": 0.20754716981132076, "grad_norm": 1.6328125, "learning_rate": 0.0008990227771750799, "loss": 1.396, "step": 2453 }, { "epoch": 0.2076317793383535, "grad_norm": 1.84375, "learning_rate": 0.0008989425054865524, "loss": 1.8429, "step": 2454 }, { "epoch": 0.20771638886538624, "grad_norm": 1.6015625, "learning_rate": 0.0008988622054914184, "loss": 1.21, "step": 2455 }, { "epoch": 0.20780099839241897, "grad_norm": 1.6484375, "learning_rate": 0.0008987818771953758, "loss": 1.128, "step": 2456 }, { "epoch": 0.20788560791945174, "grad_norm": 2.53125, "learning_rate": 0.0008987015206041237, "loss": 1.3079, "step": 2457 }, { "epoch": 0.20797021744648447, "grad_norm": 1.6484375, "learning_rate": 0.0008986211357233642, "loss": 1.2274, "step": 2458 }, { "epoch": 0.2080548269735172, "grad_norm": 2.046875, "learning_rate": 0.0008985407225588007, "loss": 1.5793, "step": 2459 }, { "epoch": 0.20813943650054997, "grad_norm": 1.7421875, "learning_rate": 0.0008984602811161388, "loss": 1.2449, "step": 2460 }, { "epoch": 0.2082240460275827, "grad_norm": 1.65625, "learning_rate": 0.0008983798114010864, "loss": 1.1036, "step": 2461 }, { "epoch": 0.20830865555461545, "grad_norm": 1.578125, "learning_rate": 0.0008982993134193532, "loss": 1.1015, "step": 2462 }, { "epoch": 0.20839326508164818, "grad_norm": 1.890625, "learning_rate": 0.0008982187871766503, "loss": 1.313, "step": 2463 }, { "epoch": 0.20847787460868095, "grad_norm": 2.359375, "learning_rate": 0.000898138232678692, "loss": 1.7213, "step": 2464 }, { "epoch": 0.20856248413571368, "grad_norm": 3.8125, "learning_rate": 0.0008980576499311936, "loss": 1.7602, "step": 2465 }, { "epoch": 0.20864709366274642, "grad_norm": 2.4375, "learning_rate": 0.0008979770389398728, "loss": 1.6117, "step": 2466 }, { "epoch": 0.20873170318977916, "grad_norm": 1.78125, "learning_rate": 0.0008978963997104494, "loss": 1.198, "step": 2467 }, { "epoch": 0.20881631271681192, "grad_norm": 4.03125, "learning_rate": 0.0008978157322486452, "loss": 1.1206, "step": 2468 }, { "epoch": 0.20890092224384466, "grad_norm": 3.984375, "learning_rate": 0.0008977350365601838, "loss": 1.0643, "step": 2469 }, { "epoch": 0.2089855317708774, "grad_norm": 1.8046875, "learning_rate": 0.0008976543126507906, "loss": 1.1602, "step": 2470 }, { "epoch": 0.20907014129791016, "grad_norm": 1.6328125, "learning_rate": 0.0008975735605261935, "loss": 1.3832, "step": 2471 }, { "epoch": 0.2091547508249429, "grad_norm": 1.875, "learning_rate": 0.0008974927801921223, "loss": 1.4801, "step": 2472 }, { "epoch": 0.20923936035197563, "grad_norm": 1.640625, "learning_rate": 0.0008974119716543086, "loss": 1.05, "step": 2473 }, { "epoch": 0.20932396987900836, "grad_norm": 2.25, "learning_rate": 0.0008973311349184862, "loss": 1.4596, "step": 2474 }, { "epoch": 0.20940857940604113, "grad_norm": 1.7265625, "learning_rate": 0.0008972502699903907, "loss": 1.4191, "step": 2475 }, { "epoch": 0.20949318893307387, "grad_norm": 2.0, "learning_rate": 0.0008971693768757597, "loss": 1.1295, "step": 2476 }, { "epoch": 0.2095777984601066, "grad_norm": 1.4375, "learning_rate": 0.000897088455580333, "loss": 1.293, "step": 2477 }, { "epoch": 0.20966240798713937, "grad_norm": 2.125, "learning_rate": 0.0008970075061098524, "loss": 1.6866, "step": 2478 }, { "epoch": 0.2097470175141721, "grad_norm": 1.8828125, "learning_rate": 0.0008969265284700614, "loss": 1.5655, "step": 2479 }, { "epoch": 0.20983162704120484, "grad_norm": 2.0, "learning_rate": 0.0008968455226667059, "loss": 1.8487, "step": 2480 }, { "epoch": 0.20991623656823757, "grad_norm": 1.6796875, "learning_rate": 0.0008967644887055333, "loss": 1.3777, "step": 2481 }, { "epoch": 0.21000084609527034, "grad_norm": 2.640625, "learning_rate": 0.0008966834265922936, "loss": 1.3945, "step": 2482 }, { "epoch": 0.21008545562230307, "grad_norm": 2.140625, "learning_rate": 0.0008966023363327385, "loss": 1.4827, "step": 2483 }, { "epoch": 0.2101700651493358, "grad_norm": 1.578125, "learning_rate": 0.0008965212179326214, "loss": 1.6236, "step": 2484 }, { "epoch": 0.21025467467636855, "grad_norm": 2.171875, "learning_rate": 0.0008964400713976982, "loss": 1.652, "step": 2485 }, { "epoch": 0.2103392842034013, "grad_norm": 1.296875, "learning_rate": 0.0008963588967337264, "loss": 1.0395, "step": 2486 }, { "epoch": 0.21042389373043405, "grad_norm": 2.203125, "learning_rate": 0.0008962776939464658, "loss": 2.1581, "step": 2487 }, { "epoch": 0.21050850325746678, "grad_norm": 2.296875, "learning_rate": 0.0008961964630416782, "loss": 1.9719, "step": 2488 }, { "epoch": 0.21059311278449955, "grad_norm": 1.65625, "learning_rate": 0.000896115204025127, "loss": 1.2211, "step": 2489 }, { "epoch": 0.21067772231153228, "grad_norm": 1.6953125, "learning_rate": 0.000896033916902578, "loss": 1.1039, "step": 2490 }, { "epoch": 0.21076233183856502, "grad_norm": 1.8828125, "learning_rate": 0.000895952601679799, "loss": 1.3436, "step": 2491 }, { "epoch": 0.21084694136559776, "grad_norm": 1.6640625, "learning_rate": 0.0008958712583625592, "loss": 1.2605, "step": 2492 }, { "epoch": 0.21093155089263052, "grad_norm": 1.453125, "learning_rate": 0.0008957898869566308, "loss": 1.039, "step": 2493 }, { "epoch": 0.21101616041966326, "grad_norm": 1.640625, "learning_rate": 0.0008957084874677871, "loss": 1.3151, "step": 2494 }, { "epoch": 0.211100769946696, "grad_norm": 1.7578125, "learning_rate": 0.0008956270599018038, "loss": 1.4182, "step": 2495 }, { "epoch": 0.21118537947372873, "grad_norm": 1.4609375, "learning_rate": 0.0008955456042644585, "loss": 1.0842, "step": 2496 }, { "epoch": 0.2112699890007615, "grad_norm": 3.34375, "learning_rate": 0.0008954641205615308, "loss": 2.4689, "step": 2497 }, { "epoch": 0.21135459852779423, "grad_norm": 1.8515625, "learning_rate": 0.0008953826087988023, "loss": 1.5552, "step": 2498 }, { "epoch": 0.21143920805482697, "grad_norm": 1.65625, "learning_rate": 0.0008953010689820567, "loss": 1.168, "step": 2499 }, { "epoch": 0.21152381758185973, "grad_norm": 1.71875, "learning_rate": 0.0008952195011170794, "loss": 1.6484, "step": 2500 }, { "epoch": 0.21160842710889247, "grad_norm": 1.8828125, "learning_rate": 0.0008951379052096582, "loss": 1.45, "step": 2501 }, { "epoch": 0.2116930366359252, "grad_norm": 2.25, "learning_rate": 0.0008950562812655825, "loss": 1.5187, "step": 2502 }, { "epoch": 0.21177764616295794, "grad_norm": 1.671875, "learning_rate": 0.0008949746292906439, "loss": 1.4828, "step": 2503 }, { "epoch": 0.2118622556899907, "grad_norm": 2.0625, "learning_rate": 0.0008948929492906359, "loss": 1.8354, "step": 2504 }, { "epoch": 0.21194686521702344, "grad_norm": 1.3046875, "learning_rate": 0.0008948112412713542, "loss": 1.0898, "step": 2505 }, { "epoch": 0.21203147474405618, "grad_norm": 1.3125, "learning_rate": 0.000894729505238596, "loss": 1.2541, "step": 2506 }, { "epoch": 0.2121160842710889, "grad_norm": 1.4375, "learning_rate": 0.000894647741198161, "loss": 1.4018, "step": 2507 }, { "epoch": 0.21220069379812168, "grad_norm": 1.328125, "learning_rate": 0.0008945659491558509, "loss": 1.1878, "step": 2508 }, { "epoch": 0.2122853033251544, "grad_norm": 1.2890625, "learning_rate": 0.0008944841291174689, "loss": 1.1348, "step": 2509 }, { "epoch": 0.21236991285218715, "grad_norm": 1.71875, "learning_rate": 0.0008944022810888205, "loss": 1.6959, "step": 2510 }, { "epoch": 0.2124545223792199, "grad_norm": 5.5, "learning_rate": 0.0008943204050757132, "loss": 1.5868, "step": 2511 }, { "epoch": 0.21253913190625265, "grad_norm": 1.5234375, "learning_rate": 0.0008942385010839564, "loss": 1.4721, "step": 2512 }, { "epoch": 0.21262374143328538, "grad_norm": 1.5546875, "learning_rate": 0.0008941565691193616, "loss": 1.5138, "step": 2513 }, { "epoch": 0.21270835096031812, "grad_norm": 14.4375, "learning_rate": 0.000894074609187742, "loss": 1.3912, "step": 2514 }, { "epoch": 0.21279296048735089, "grad_norm": 1.96875, "learning_rate": 0.0008939926212949134, "loss": 1.7743, "step": 2515 }, { "epoch": 0.21287757001438362, "grad_norm": 1.9140625, "learning_rate": 0.000893910605446693, "loss": 1.8886, "step": 2516 }, { "epoch": 0.21296217954141636, "grad_norm": 2.0, "learning_rate": 0.0008938285616489, "loss": 1.0949, "step": 2517 }, { "epoch": 0.2130467890684491, "grad_norm": 1.5625, "learning_rate": 0.0008937464899073556, "loss": 1.2388, "step": 2518 }, { "epoch": 0.21313139859548186, "grad_norm": 1.7109375, "learning_rate": 0.0008936643902278836, "loss": 1.246, "step": 2519 }, { "epoch": 0.2132160081225146, "grad_norm": 1.4765625, "learning_rate": 0.000893582262616309, "loss": 1.7066, "step": 2520 }, { "epoch": 0.21330061764954733, "grad_norm": 1.7421875, "learning_rate": 0.0008935001070784591, "loss": 1.6349, "step": 2521 }, { "epoch": 0.2133852271765801, "grad_norm": 1.3828125, "learning_rate": 0.0008934179236201634, "loss": 1.3974, "step": 2522 }, { "epoch": 0.21346983670361283, "grad_norm": 1.8203125, "learning_rate": 0.0008933357122472529, "loss": 1.752, "step": 2523 }, { "epoch": 0.21355444623064557, "grad_norm": 1.875, "learning_rate": 0.000893253472965561, "loss": 1.3237, "step": 2524 }, { "epoch": 0.2136390557576783, "grad_norm": 1.2890625, "learning_rate": 0.0008931712057809228, "loss": 1.1395, "step": 2525 }, { "epoch": 0.21372366528471107, "grad_norm": 1.4453125, "learning_rate": 0.0008930889106991756, "loss": 1.2093, "step": 2526 }, { "epoch": 0.2138082748117438, "grad_norm": 1.8359375, "learning_rate": 0.0008930065877261584, "loss": 1.1918, "step": 2527 }, { "epoch": 0.21389288433877654, "grad_norm": 1.7890625, "learning_rate": 0.0008929242368677124, "loss": 1.375, "step": 2528 }, { "epoch": 0.2139774938658093, "grad_norm": 1.21875, "learning_rate": 0.0008928418581296809, "loss": 1.137, "step": 2529 }, { "epoch": 0.21406210339284204, "grad_norm": 1.5390625, "learning_rate": 0.0008927594515179089, "loss": 1.5825, "step": 2530 }, { "epoch": 0.21414671291987478, "grad_norm": 1.21875, "learning_rate": 0.0008926770170382435, "loss": 1.0552, "step": 2531 }, { "epoch": 0.2142313224469075, "grad_norm": 1.984375, "learning_rate": 0.0008925945546965337, "loss": 1.6677, "step": 2532 }, { "epoch": 0.21431593197394028, "grad_norm": 1.84375, "learning_rate": 0.0008925120644986306, "loss": 1.2906, "step": 2533 }, { "epoch": 0.214400541500973, "grad_norm": 1.40625, "learning_rate": 0.0008924295464503872, "loss": 1.2665, "step": 2534 }, { "epoch": 0.21448515102800575, "grad_norm": 1.75, "learning_rate": 0.0008923470005576586, "loss": 1.1124, "step": 2535 }, { "epoch": 0.21456976055503849, "grad_norm": 1.65625, "learning_rate": 0.0008922644268263016, "loss": 1.3918, "step": 2536 }, { "epoch": 0.21465437008207125, "grad_norm": 1.6015625, "learning_rate": 0.0008921818252621751, "loss": 1.1893, "step": 2537 }, { "epoch": 0.21473897960910399, "grad_norm": 1.7265625, "learning_rate": 0.0008920991958711403, "loss": 1.6657, "step": 2538 }, { "epoch": 0.21482358913613672, "grad_norm": 1.8984375, "learning_rate": 0.00089201653865906, "loss": 1.5966, "step": 2539 }, { "epoch": 0.2149081986631695, "grad_norm": 1.9140625, "learning_rate": 0.0008919338536317988, "loss": 1.5345, "step": 2540 }, { "epoch": 0.21499280819020222, "grad_norm": 1.859375, "learning_rate": 0.0008918511407952239, "loss": 1.5964, "step": 2541 }, { "epoch": 0.21507741771723496, "grad_norm": 1.8203125, "learning_rate": 0.0008917684001552038, "loss": 1.861, "step": 2542 }, { "epoch": 0.2151620272442677, "grad_norm": 1.8203125, "learning_rate": 0.0008916856317176097, "loss": 1.567, "step": 2543 }, { "epoch": 0.21524663677130046, "grad_norm": 1.953125, "learning_rate": 0.0008916028354883138, "loss": 1.3746, "step": 2544 }, { "epoch": 0.2153312462983332, "grad_norm": 1.6328125, "learning_rate": 0.0008915200114731911, "loss": 1.8498, "step": 2545 }, { "epoch": 0.21541585582536593, "grad_norm": 1.546875, "learning_rate": 0.0008914371596781186, "loss": 1.2825, "step": 2546 }, { "epoch": 0.21550046535239867, "grad_norm": 1.5625, "learning_rate": 0.0008913542801089747, "loss": 1.2897, "step": 2547 }, { "epoch": 0.21558507487943143, "grad_norm": 1.359375, "learning_rate": 0.00089127137277164, "loss": 1.2085, "step": 2548 }, { "epoch": 0.21566968440646417, "grad_norm": 1.9453125, "learning_rate": 0.0008911884376719972, "loss": 1.5559, "step": 2549 }, { "epoch": 0.2157542939334969, "grad_norm": 1.5234375, "learning_rate": 0.0008911054748159307, "loss": 1.7889, "step": 2550 }, { "epoch": 0.21583890346052967, "grad_norm": 1.4921875, "learning_rate": 0.0008910224842093274, "loss": 1.2688, "step": 2551 }, { "epoch": 0.2159235129875624, "grad_norm": 1.46875, "learning_rate": 0.0008909394658580756, "loss": 1.33, "step": 2552 }, { "epoch": 0.21600812251459514, "grad_norm": 1.546875, "learning_rate": 0.0008908564197680658, "loss": 1.4977, "step": 2553 }, { "epoch": 0.21609273204162788, "grad_norm": 1.5703125, "learning_rate": 0.0008907733459451905, "loss": 1.427, "step": 2554 }, { "epoch": 0.21617734156866064, "grad_norm": 1.59375, "learning_rate": 0.0008906902443953442, "loss": 1.4393, "step": 2555 }, { "epoch": 0.21626195109569338, "grad_norm": 1.6328125, "learning_rate": 0.0008906071151244231, "loss": 1.4365, "step": 2556 }, { "epoch": 0.21634656062272611, "grad_norm": 1.6796875, "learning_rate": 0.0008905239581383257, "loss": 1.0837, "step": 2557 }, { "epoch": 0.21643117014975885, "grad_norm": 1.515625, "learning_rate": 0.0008904407734429523, "loss": 1.9066, "step": 2558 }, { "epoch": 0.21651577967679161, "grad_norm": 1.9453125, "learning_rate": 0.0008903575610442052, "loss": 1.5318, "step": 2559 }, { "epoch": 0.21660038920382435, "grad_norm": 1.4765625, "learning_rate": 0.0008902743209479887, "loss": 1.1657, "step": 2560 }, { "epoch": 0.2166849987308571, "grad_norm": 1.7421875, "learning_rate": 0.000890191053160209, "loss": 1.3481, "step": 2561 }, { "epoch": 0.21676960825788985, "grad_norm": 1.2734375, "learning_rate": 0.0008901077576867742, "loss": 0.9893, "step": 2562 }, { "epoch": 0.2168542177849226, "grad_norm": 1.2109375, "learning_rate": 0.0008900244345335947, "loss": 1.0684, "step": 2563 }, { "epoch": 0.21693882731195532, "grad_norm": 1.734375, "learning_rate": 0.0008899410837065824, "loss": 1.3021, "step": 2564 }, { "epoch": 0.21702343683898806, "grad_norm": 1.71875, "learning_rate": 0.0008898577052116513, "loss": 1.4724, "step": 2565 }, { "epoch": 0.21710804636602082, "grad_norm": 1.6953125, "learning_rate": 0.0008897742990547177, "loss": 1.3151, "step": 2566 }, { "epoch": 0.21719265589305356, "grad_norm": 2.140625, "learning_rate": 0.0008896908652416995, "loss": 1.6081, "step": 2567 }, { "epoch": 0.2172772654200863, "grad_norm": 2.0, "learning_rate": 0.0008896074037785166, "loss": 1.5427, "step": 2568 }, { "epoch": 0.21736187494711903, "grad_norm": 1.578125, "learning_rate": 0.0008895239146710911, "loss": 1.1243, "step": 2569 }, { "epoch": 0.2174464844741518, "grad_norm": 1.765625, "learning_rate": 0.0008894403979253467, "loss": 1.4176, "step": 2570 }, { "epoch": 0.21753109400118453, "grad_norm": 1.484375, "learning_rate": 0.0008893568535472094, "loss": 1.3226, "step": 2571 }, { "epoch": 0.21761570352821727, "grad_norm": 1.6015625, "learning_rate": 0.0008892732815426069, "loss": 1.2629, "step": 2572 }, { "epoch": 0.21770031305525003, "grad_norm": 1.7421875, "learning_rate": 0.0008891896819174691, "loss": 1.5304, "step": 2573 }, { "epoch": 0.21778492258228277, "grad_norm": 1.5, "learning_rate": 0.0008891060546777277, "loss": 1.1629, "step": 2574 }, { "epoch": 0.2178695321093155, "grad_norm": 1.546875, "learning_rate": 0.0008890223998293163, "loss": 1.3848, "step": 2575 }, { "epoch": 0.21795414163634824, "grad_norm": 1.515625, "learning_rate": 0.0008889387173781707, "loss": 1.3257, "step": 2576 }, { "epoch": 0.218038751163381, "grad_norm": 1.7890625, "learning_rate": 0.0008888550073302282, "loss": 2.0771, "step": 2577 }, { "epoch": 0.21812336069041374, "grad_norm": 35.75, "learning_rate": 0.0008887712696914289, "loss": 1.0835, "step": 2578 }, { "epoch": 0.21820797021744648, "grad_norm": 1.5, "learning_rate": 0.000888687504467714, "loss": 1.581, "step": 2579 }, { "epoch": 0.21829257974447921, "grad_norm": 1.5234375, "learning_rate": 0.000888603711665027, "loss": 1.2986, "step": 2580 }, { "epoch": 0.21837718927151198, "grad_norm": 2.15625, "learning_rate": 0.0008885198912893133, "loss": 1.7609, "step": 2581 }, { "epoch": 0.21846179879854472, "grad_norm": 1.7421875, "learning_rate": 0.0008884360433465204, "loss": 1.3864, "step": 2582 }, { "epoch": 0.21854640832557745, "grad_norm": 1.5546875, "learning_rate": 0.0008883521678425977, "loss": 1.3401, "step": 2583 }, { "epoch": 0.21863101785261022, "grad_norm": 1.5703125, "learning_rate": 0.0008882682647834963, "loss": 1.2742, "step": 2584 }, { "epoch": 0.21871562737964295, "grad_norm": 2.15625, "learning_rate": 0.0008881843341751696, "loss": 1.0519, "step": 2585 }, { "epoch": 0.2188002369066757, "grad_norm": 1.921875, "learning_rate": 0.0008881003760235728, "loss": 1.3839, "step": 2586 }, { "epoch": 0.21888484643370842, "grad_norm": 1.6875, "learning_rate": 0.0008880163903346633, "loss": 1.4174, "step": 2587 }, { "epoch": 0.2189694559607412, "grad_norm": 19.75, "learning_rate": 0.0008879323771143998, "loss": 1.3053, "step": 2588 }, { "epoch": 0.21905406548777392, "grad_norm": 2.046875, "learning_rate": 0.0008878483363687437, "loss": 1.4272, "step": 2589 }, { "epoch": 0.21913867501480666, "grad_norm": 2.21875, "learning_rate": 0.0008877642681036581, "loss": 1.0243, "step": 2590 }, { "epoch": 0.21922328454183942, "grad_norm": 3.03125, "learning_rate": 0.0008876801723251076, "loss": 1.1029, "step": 2591 }, { "epoch": 0.21930789406887216, "grad_norm": 2.375, "learning_rate": 0.0008875960490390595, "loss": 2.2641, "step": 2592 }, { "epoch": 0.2193925035959049, "grad_norm": 68.5, "learning_rate": 0.0008875118982514825, "loss": 1.3603, "step": 2593 }, { "epoch": 0.21947711312293763, "grad_norm": 1512.0, "learning_rate": 0.0008874277199683476, "loss": 1.2561, "step": 2594 }, { "epoch": 0.2195617226499704, "grad_norm": 1408.0, "learning_rate": 0.0008873435141956274, "loss": 1.4461, "step": 2595 }, { "epoch": 0.21964633217700313, "grad_norm": 25.375, "learning_rate": 0.0008872592809392967, "loss": 2.0389, "step": 2596 }, { "epoch": 0.21973094170403587, "grad_norm": 473088.0, "learning_rate": 0.0008871750202053323, "loss": 1.8174, "step": 2597 }, { "epoch": 0.2198155512310686, "grad_norm": 1466368.0, "learning_rate": 0.0008870907319997128, "loss": 1.3063, "step": 2598 }, { "epoch": 0.21990016075810137, "grad_norm": 1458176.0, "learning_rate": 0.0008870064163284189, "loss": 1.4864, "step": 2599 }, { "epoch": 0.2199847702851341, "grad_norm": 1122304.0, "learning_rate": 0.0008869220731974328, "loss": 1.4685, "step": 2600 }, { "epoch": 0.22006937981216684, "grad_norm": 217088.0, "learning_rate": 0.0008868377026127393, "loss": 1.2127, "step": 2601 }, { "epoch": 0.2201539893391996, "grad_norm": 505856.0, "learning_rate": 0.0008867533045803249, "loss": 1.2835, "step": 2602 }, { "epoch": 0.22023859886623234, "grad_norm": 219136.0, "learning_rate": 0.0008866688791061777, "loss": 1.5861, "step": 2603 }, { "epoch": 0.22032320839326508, "grad_norm": 41728.0, "learning_rate": 0.000886584426196288, "loss": 1.6669, "step": 2604 }, { "epoch": 0.22040781792029782, "grad_norm": 22144.0, "learning_rate": 0.0008864999458566485, "loss": 1.6118, "step": 2605 }, { "epoch": 0.22049242744733058, "grad_norm": 5472.0, "learning_rate": 0.0008864154380932531, "loss": 1.6802, "step": 2606 }, { "epoch": 0.22057703697436332, "grad_norm": 516.0, "learning_rate": 0.000886330902912098, "loss": 1.8683, "step": 2607 }, { "epoch": 0.22066164650139605, "grad_norm": 76.0, "learning_rate": 0.0008862463403191814, "loss": 1.391, "step": 2608 }, { "epoch": 0.2207462560284288, "grad_norm": 3.0, "learning_rate": 0.0008861617503205034, "loss": 1.4949, "step": 2609 }, { "epoch": 0.22083086555546155, "grad_norm": 2.40625, "learning_rate": 0.0008860771329220658, "loss": 1.6207, "step": 2610 }, { "epoch": 0.2209154750824943, "grad_norm": 2.859375, "learning_rate": 0.0008859924881298728, "loss": 1.9426, "step": 2611 }, { "epoch": 0.22100008460952703, "grad_norm": 3.0, "learning_rate": 0.0008859078159499302, "loss": 1.257, "step": 2612 }, { "epoch": 0.2210846941365598, "grad_norm": 2.046875, "learning_rate": 0.0008858231163882457, "loss": 1.9186, "step": 2613 }, { "epoch": 0.22116930366359253, "grad_norm": 2.3125, "learning_rate": 0.0008857383894508294, "loss": 1.2892, "step": 2614 }, { "epoch": 0.22125391319062526, "grad_norm": 2.265625, "learning_rate": 0.0008856536351436927, "loss": 1.3198, "step": 2615 }, { "epoch": 0.221338522717658, "grad_norm": 2.15625, "learning_rate": 0.0008855688534728495, "loss": 1.3635, "step": 2616 }, { "epoch": 0.22142313224469076, "grad_norm": 1.8125, "learning_rate": 0.0008854840444443151, "loss": 1.2477, "step": 2617 }, { "epoch": 0.2215077417717235, "grad_norm": 2.78125, "learning_rate": 0.0008853992080641073, "loss": 1.5673, "step": 2618 }, { "epoch": 0.22159235129875623, "grad_norm": 2.609375, "learning_rate": 0.0008853143443382456, "loss": 1.6482, "step": 2619 }, { "epoch": 0.22167696082578897, "grad_norm": 1.828125, "learning_rate": 0.0008852294532727514, "loss": 1.3679, "step": 2620 }, { "epoch": 0.22176157035282174, "grad_norm": 1.7890625, "learning_rate": 0.000885144534873648, "loss": 1.2207, "step": 2621 }, { "epoch": 0.22184617987985447, "grad_norm": 1.9765625, "learning_rate": 0.0008850595891469608, "loss": 1.3827, "step": 2622 }, { "epoch": 0.2219307894068872, "grad_norm": 1.78125, "learning_rate": 0.0008849746160987169, "loss": 1.5024, "step": 2623 }, { "epoch": 0.22201539893391997, "grad_norm": 5.96875, "learning_rate": 0.0008848896157349455, "loss": 1.1484, "step": 2624 }, { "epoch": 0.2221000084609527, "grad_norm": 2.046875, "learning_rate": 0.000884804588061678, "loss": 1.9569, "step": 2625 }, { "epoch": 0.22218461798798544, "grad_norm": 2.109375, "learning_rate": 0.0008847195330849471, "loss": 1.5809, "step": 2626 }, { "epoch": 0.22226922751501818, "grad_norm": 2.078125, "learning_rate": 0.0008846344508107881, "loss": 1.129, "step": 2627 }, { "epoch": 0.22235383704205094, "grad_norm": 2.09375, "learning_rate": 0.0008845493412452378, "loss": 1.3943, "step": 2628 }, { "epoch": 0.22243844656908368, "grad_norm": 1.609375, "learning_rate": 0.000884464204394335, "loss": 1.2059, "step": 2629 }, { "epoch": 0.22252305609611642, "grad_norm": 1.8125, "learning_rate": 0.0008843790402641205, "loss": 1.4547, "step": 2630 }, { "epoch": 0.22260766562314915, "grad_norm": 1.875, "learning_rate": 0.0008842938488606374, "loss": 1.3521, "step": 2631 }, { "epoch": 0.22269227515018192, "grad_norm": 1.546875, "learning_rate": 0.0008842086301899298, "loss": 1.2176, "step": 2632 }, { "epoch": 0.22277688467721465, "grad_norm": 2.171875, "learning_rate": 0.0008841233842580448, "loss": 1.5455, "step": 2633 }, { "epoch": 0.2228614942042474, "grad_norm": 1.84375, "learning_rate": 0.0008840381110710307, "loss": 1.4032, "step": 2634 }, { "epoch": 0.22294610373128015, "grad_norm": 1.7578125, "learning_rate": 0.000883952810634938, "loss": 1.2338, "step": 2635 }, { "epoch": 0.2230307132583129, "grad_norm": 1.7265625, "learning_rate": 0.0008838674829558193, "loss": 1.3313, "step": 2636 }, { "epoch": 0.22311532278534563, "grad_norm": 1.40625, "learning_rate": 0.0008837821280397287, "loss": 1.2865, "step": 2637 }, { "epoch": 0.22319993231237836, "grad_norm": 1.625, "learning_rate": 0.0008836967458927226, "loss": 1.1735, "step": 2638 }, { "epoch": 0.22328454183941113, "grad_norm": 1.828125, "learning_rate": 0.0008836113365208593, "loss": 1.1633, "step": 2639 }, { "epoch": 0.22336915136644386, "grad_norm": 2.71875, "learning_rate": 0.0008835258999301987, "loss": 1.5114, "step": 2640 }, { "epoch": 0.2234537608934766, "grad_norm": 1.875, "learning_rate": 0.0008834404361268032, "loss": 1.3958, "step": 2641 }, { "epoch": 0.22353837042050934, "grad_norm": 2.0, "learning_rate": 0.0008833549451167366, "loss": 1.554, "step": 2642 }, { "epoch": 0.2236229799475421, "grad_norm": 1.6953125, "learning_rate": 0.0008832694269060649, "loss": 1.2976, "step": 2643 }, { "epoch": 0.22370758947457484, "grad_norm": 1.90625, "learning_rate": 0.0008831838815008558, "loss": 1.8131, "step": 2644 }, { "epoch": 0.22379219900160757, "grad_norm": 1.5, "learning_rate": 0.0008830983089071794, "loss": 1.1211, "step": 2645 }, { "epoch": 0.22387680852864034, "grad_norm": 1.53125, "learning_rate": 0.0008830127091311071, "loss": 1.3068, "step": 2646 }, { "epoch": 0.22396141805567307, "grad_norm": 1.8828125, "learning_rate": 0.0008829270821787128, "loss": 1.6667, "step": 2647 }, { "epoch": 0.2240460275827058, "grad_norm": 1.8359375, "learning_rate": 0.000882841428056072, "loss": 1.3173, "step": 2648 }, { "epoch": 0.22413063710973855, "grad_norm": 1.90625, "learning_rate": 0.0008827557467692621, "loss": 1.332, "step": 2649 }, { "epoch": 0.2242152466367713, "grad_norm": 1.5859375, "learning_rate": 0.000882670038324363, "loss": 1.9832, "step": 2650 }, { "epoch": 0.22429985616380405, "grad_norm": 1.6171875, "learning_rate": 0.0008825843027274554, "loss": 1.3166, "step": 2651 }, { "epoch": 0.22438446569083678, "grad_norm": 1.9609375, "learning_rate": 0.000882498539984623, "loss": 1.9933, "step": 2652 }, { "epoch": 0.22446907521786955, "grad_norm": 1.5390625, "learning_rate": 0.000882412750101951, "loss": 1.336, "step": 2653 }, { "epoch": 0.22455368474490228, "grad_norm": 1.9375, "learning_rate": 0.0008823269330855264, "loss": 1.7634, "step": 2654 }, { "epoch": 0.22463829427193502, "grad_norm": 1.3125, "learning_rate": 0.0008822410889414384, "loss": 1.0534, "step": 2655 }, { "epoch": 0.22472290379896775, "grad_norm": 1.8515625, "learning_rate": 0.0008821552176757779, "loss": 1.6947, "step": 2656 }, { "epoch": 0.22480751332600052, "grad_norm": 2.09375, "learning_rate": 0.0008820693192946379, "loss": 2.2214, "step": 2657 }, { "epoch": 0.22489212285303326, "grad_norm": 1.375, "learning_rate": 0.000881983393804113, "loss": 1.3105, "step": 2658 }, { "epoch": 0.224976732380066, "grad_norm": 1.484375, "learning_rate": 0.0008818974412103005, "loss": 1.2491, "step": 2659 }, { "epoch": 0.22506134190709873, "grad_norm": 1.640625, "learning_rate": 0.0008818114615192985, "loss": 1.1721, "step": 2660 }, { "epoch": 0.2251459514341315, "grad_norm": 1.3046875, "learning_rate": 0.000881725454737208, "loss": 1.1451, "step": 2661 }, { "epoch": 0.22523056096116423, "grad_norm": 1.640625, "learning_rate": 0.0008816394208701313, "loss": 1.4411, "step": 2662 }, { "epoch": 0.22531517048819696, "grad_norm": 1.53125, "learning_rate": 0.0008815533599241729, "loss": 1.4658, "step": 2663 }, { "epoch": 0.22539978001522973, "grad_norm": 1.5390625, "learning_rate": 0.0008814672719054394, "loss": 1.4821, "step": 2664 }, { "epoch": 0.22548438954226246, "grad_norm": 1.59375, "learning_rate": 0.0008813811568200388, "loss": 1.6654, "step": 2665 }, { "epoch": 0.2255689990692952, "grad_norm": 1.5859375, "learning_rate": 0.0008812950146740815, "loss": 1.2943, "step": 2666 }, { "epoch": 0.22565360859632794, "grad_norm": 1.53125, "learning_rate": 0.0008812088454736798, "loss": 1.6562, "step": 2667 }, { "epoch": 0.2257382181233607, "grad_norm": 1.453125, "learning_rate": 0.0008811226492249473, "loss": 1.1574, "step": 2668 }, { "epoch": 0.22582282765039344, "grad_norm": 1.796875, "learning_rate": 0.0008810364259340002, "loss": 1.7134, "step": 2669 }, { "epoch": 0.22590743717742617, "grad_norm": 1.6640625, "learning_rate": 0.0008809501756069564, "loss": 2.0919, "step": 2670 }, { "epoch": 0.2259920467044589, "grad_norm": 1.9765625, "learning_rate": 0.0008808638982499359, "loss": 2.0838, "step": 2671 }, { "epoch": 0.22607665623149167, "grad_norm": 1.7109375, "learning_rate": 0.0008807775938690601, "loss": 1.1516, "step": 2672 }, { "epoch": 0.2261612657585244, "grad_norm": 1.1640625, "learning_rate": 0.0008806912624704529, "loss": 0.9851, "step": 2673 }, { "epoch": 0.22624587528555715, "grad_norm": 1.609375, "learning_rate": 0.0008806049040602398, "loss": 1.3182, "step": 2674 }, { "epoch": 0.2263304848125899, "grad_norm": 2.015625, "learning_rate": 0.0008805185186445482, "loss": 1.714, "step": 2675 }, { "epoch": 0.22641509433962265, "grad_norm": 1.34375, "learning_rate": 0.0008804321062295075, "loss": 1.1657, "step": 2676 }, { "epoch": 0.22649970386665538, "grad_norm": 1.328125, "learning_rate": 0.0008803456668212492, "loss": 1.0647, "step": 2677 }, { "epoch": 0.22658431339368812, "grad_norm": 1.390625, "learning_rate": 0.0008802592004259063, "loss": 1.2013, "step": 2678 }, { "epoch": 0.22666892292072088, "grad_norm": 1.953125, "learning_rate": 0.0008801727070496142, "loss": 1.9727, "step": 2679 }, { "epoch": 0.22675353244775362, "grad_norm": 1.515625, "learning_rate": 0.0008800861866985097, "loss": 1.1203, "step": 2680 }, { "epoch": 0.22683814197478636, "grad_norm": 1.7578125, "learning_rate": 0.0008799996393787319, "loss": 1.5046, "step": 2681 }, { "epoch": 0.2269227515018191, "grad_norm": 2.4375, "learning_rate": 0.0008799130650964218, "loss": 1.6036, "step": 2682 }, { "epoch": 0.22700736102885186, "grad_norm": 1.40625, "learning_rate": 0.0008798264638577218, "loss": 1.0213, "step": 2683 }, { "epoch": 0.2270919705558846, "grad_norm": 1.625, "learning_rate": 0.0008797398356687772, "loss": 1.9584, "step": 2684 }, { "epoch": 0.22717658008291733, "grad_norm": 1.6953125, "learning_rate": 0.0008796531805357341, "loss": 1.103, "step": 2685 }, { "epoch": 0.2272611896099501, "grad_norm": 1.5703125, "learning_rate": 0.0008795664984647413, "loss": 1.4827, "step": 2686 }, { "epoch": 0.22734579913698283, "grad_norm": 1.3359375, "learning_rate": 0.0008794797894619493, "loss": 1.2598, "step": 2687 }, { "epoch": 0.22743040866401557, "grad_norm": 3.5, "learning_rate": 0.0008793930535335102, "loss": 1.6202, "step": 2688 }, { "epoch": 0.2275150181910483, "grad_norm": 1.8671875, "learning_rate": 0.0008793062906855785, "loss": 2.1636, "step": 2689 }, { "epoch": 0.22759962771808107, "grad_norm": 1.6171875, "learning_rate": 0.0008792195009243104, "loss": 1.1157, "step": 2690 }, { "epoch": 0.2276842372451138, "grad_norm": 1.3046875, "learning_rate": 0.0008791326842558638, "loss": 1.1193, "step": 2691 }, { "epoch": 0.22776884677214654, "grad_norm": 1.4453125, "learning_rate": 0.0008790458406863988, "loss": 1.2044, "step": 2692 }, { "epoch": 0.22785345629917927, "grad_norm": 5.28125, "learning_rate": 0.0008789589702220773, "loss": 1.6874, "step": 2693 }, { "epoch": 0.22793806582621204, "grad_norm": 1.6953125, "learning_rate": 0.0008788720728690632, "loss": 1.2665, "step": 2694 }, { "epoch": 0.22802267535324477, "grad_norm": 1.7265625, "learning_rate": 0.0008787851486335219, "loss": 1.4513, "step": 2695 }, { "epoch": 0.2281072848802775, "grad_norm": 1.578125, "learning_rate": 0.0008786981975216215, "loss": 1.4304, "step": 2696 }, { "epoch": 0.22819189440731028, "grad_norm": 1.2890625, "learning_rate": 0.0008786112195395311, "loss": 1.0832, "step": 2697 }, { "epoch": 0.228276503934343, "grad_norm": 2.109375, "learning_rate": 0.0008785242146934225, "loss": 2.1895, "step": 2698 }, { "epoch": 0.22836111346137575, "grad_norm": 1.21875, "learning_rate": 0.0008784371829894691, "loss": 1.5515, "step": 2699 }, { "epoch": 0.22844572298840848, "grad_norm": 1.609375, "learning_rate": 0.0008783501244338457, "loss": 1.4098, "step": 2700 }, { "epoch": 0.22853033251544125, "grad_norm": 1.59375, "learning_rate": 0.0008782630390327297, "loss": 1.4978, "step": 2701 }, { "epoch": 0.22861494204247398, "grad_norm": 1.5859375, "learning_rate": 0.0008781759267923003, "loss": 1.1874, "step": 2702 }, { "epoch": 0.22869955156950672, "grad_norm": 1.5546875, "learning_rate": 0.0008780887877187384, "loss": 1.1372, "step": 2703 }, { "epoch": 0.22878416109653946, "grad_norm": 1.1015625, "learning_rate": 0.0008780016218182268, "loss": 0.9058, "step": 2704 }, { "epoch": 0.22886877062357222, "grad_norm": 1.8359375, "learning_rate": 0.0008779144290969501, "loss": 1.6349, "step": 2705 }, { "epoch": 0.22895338015060496, "grad_norm": 1.5234375, "learning_rate": 0.0008778272095610956, "loss": 1.5519, "step": 2706 }, { "epoch": 0.2290379896776377, "grad_norm": 1.7734375, "learning_rate": 0.000877739963216851, "loss": 1.1712, "step": 2707 }, { "epoch": 0.22912259920467046, "grad_norm": 1.671875, "learning_rate": 0.0008776526900704076, "loss": 1.4494, "step": 2708 }, { "epoch": 0.2292072087317032, "grad_norm": 2.40625, "learning_rate": 0.0008775653901279574, "loss": 1.3978, "step": 2709 }, { "epoch": 0.22929181825873593, "grad_norm": 1.4296875, "learning_rate": 0.0008774780633956947, "loss": 1.0627, "step": 2710 }, { "epoch": 0.22937642778576867, "grad_norm": 1.421875, "learning_rate": 0.0008773907098798158, "loss": 1.0296, "step": 2711 }, { "epoch": 0.22946103731280143, "grad_norm": 1.4296875, "learning_rate": 0.0008773033295865185, "loss": 1.0328, "step": 2712 }, { "epoch": 0.22954564683983417, "grad_norm": 1.890625, "learning_rate": 0.0008772159225220031, "loss": 1.6422, "step": 2713 }, { "epoch": 0.2296302563668669, "grad_norm": 1.53125, "learning_rate": 0.0008771284886924715, "loss": 1.1939, "step": 2714 }, { "epoch": 0.22971486589389967, "grad_norm": 1.2578125, "learning_rate": 0.0008770410281041274, "loss": 1.027, "step": 2715 }, { "epoch": 0.2297994754209324, "grad_norm": 2.046875, "learning_rate": 0.0008769535407631764, "loss": 1.4467, "step": 2716 }, { "epoch": 0.22988408494796514, "grad_norm": 1.3515625, "learning_rate": 0.0008768660266758262, "loss": 1.5729, "step": 2717 }, { "epoch": 0.22996869447499788, "grad_norm": 1.359375, "learning_rate": 0.0008767784858482861, "loss": 1.4839, "step": 2718 }, { "epoch": 0.23005330400203064, "grad_norm": 1.7578125, "learning_rate": 0.0008766909182867678, "loss": 1.4801, "step": 2719 }, { "epoch": 0.23013791352906338, "grad_norm": 1.3984375, "learning_rate": 0.0008766033239974843, "loss": 1.2452, "step": 2720 }, { "epoch": 0.2302225230560961, "grad_norm": 1.8359375, "learning_rate": 0.000876515702986651, "loss": 1.8348, "step": 2721 }, { "epoch": 0.23030713258312885, "grad_norm": 2.390625, "learning_rate": 0.0008764280552604847, "loss": 1.2546, "step": 2722 }, { "epoch": 0.2303917421101616, "grad_norm": 1.375, "learning_rate": 0.0008763403808252046, "loss": 1.1628, "step": 2723 }, { "epoch": 0.23047635163719435, "grad_norm": 2.078125, "learning_rate": 0.0008762526796870314, "loss": 1.7299, "step": 2724 }, { "epoch": 0.23056096116422709, "grad_norm": 1.8984375, "learning_rate": 0.0008761649518521879, "loss": 1.8137, "step": 2725 }, { "epoch": 0.23064557069125985, "grad_norm": 1.921875, "learning_rate": 0.0008760771973268989, "loss": 2.2917, "step": 2726 }, { "epoch": 0.23073018021829259, "grad_norm": 1.71875, "learning_rate": 0.000875989416117391, "loss": 1.5087, "step": 2727 }, { "epoch": 0.23081478974532532, "grad_norm": 1.359375, "learning_rate": 0.000875901608229892, "loss": 1.05, "step": 2728 }, { "epoch": 0.23089939927235806, "grad_norm": 1.2734375, "learning_rate": 0.0008758137736706329, "loss": 1.0356, "step": 2729 }, { "epoch": 0.23098400879939082, "grad_norm": 1.6484375, "learning_rate": 0.0008757259124458459, "loss": 1.5113, "step": 2730 }, { "epoch": 0.23106861832642356, "grad_norm": 1.734375, "learning_rate": 0.0008756380245617646, "loss": 1.3844, "step": 2731 }, { "epoch": 0.2311532278534563, "grad_norm": 1.734375, "learning_rate": 0.0008755501100246255, "loss": 1.5566, "step": 2732 }, { "epoch": 0.23123783738048903, "grad_norm": 1.359375, "learning_rate": 0.0008754621688406663, "loss": 1.2449, "step": 2733 }, { "epoch": 0.2313224469075218, "grad_norm": 2.03125, "learning_rate": 0.0008753742010161268, "loss": 1.1149, "step": 2734 }, { "epoch": 0.23140705643455453, "grad_norm": 2.078125, "learning_rate": 0.0008752862065572487, "loss": 1.7033, "step": 2735 }, { "epoch": 0.23149166596158727, "grad_norm": 1.84375, "learning_rate": 0.0008751981854702755, "loss": 1.3811, "step": 2736 }, { "epoch": 0.23157627548862003, "grad_norm": 2.1875, "learning_rate": 0.0008751101377614528, "loss": 2.261, "step": 2737 }, { "epoch": 0.23166088501565277, "grad_norm": 1.3671875, "learning_rate": 0.0008750220634370278, "loss": 1.1927, "step": 2738 }, { "epoch": 0.2317454945426855, "grad_norm": 1.6015625, "learning_rate": 0.0008749339625032499, "loss": 1.1146, "step": 2739 }, { "epoch": 0.23183010406971824, "grad_norm": 1.90625, "learning_rate": 0.00087484583496637, "loss": 1.8976, "step": 2740 }, { "epoch": 0.231914713596751, "grad_norm": 1.6328125, "learning_rate": 0.0008747576808326413, "loss": 1.2385, "step": 2741 }, { "epoch": 0.23199932312378374, "grad_norm": 1.6953125, "learning_rate": 0.0008746695001083187, "loss": 1.79, "step": 2742 }, { "epoch": 0.23208393265081648, "grad_norm": 1.8671875, "learning_rate": 0.0008745812927996587, "loss": 1.6459, "step": 2743 }, { "epoch": 0.2321685421778492, "grad_norm": 1.6015625, "learning_rate": 0.0008744930589129203, "loss": 1.6154, "step": 2744 }, { "epoch": 0.23225315170488198, "grad_norm": 1.8984375, "learning_rate": 0.0008744047984543639, "loss": 1.3885, "step": 2745 }, { "epoch": 0.2323377612319147, "grad_norm": 1.140625, "learning_rate": 0.0008743165114302521, "loss": 0.985, "step": 2746 }, { "epoch": 0.23242237075894745, "grad_norm": 1.7578125, "learning_rate": 0.0008742281978468489, "loss": 1.4935, "step": 2747 }, { "epoch": 0.2325069802859802, "grad_norm": 1.6875, "learning_rate": 0.0008741398577104208, "loss": 2.1737, "step": 2748 }, { "epoch": 0.23259158981301295, "grad_norm": 1.84375, "learning_rate": 0.0008740514910272359, "loss": 1.4344, "step": 2749 }, { "epoch": 0.2326761993400457, "grad_norm": 1.4375, "learning_rate": 0.000873963097803564, "loss": 1.2181, "step": 2750 }, { "epoch": 0.23276080886707842, "grad_norm": 1.6640625, "learning_rate": 0.000873874678045677, "loss": 1.4556, "step": 2751 }, { "epoch": 0.2328454183941112, "grad_norm": 1.671875, "learning_rate": 0.0008737862317598488, "loss": 1.181, "step": 2752 }, { "epoch": 0.23293002792114392, "grad_norm": 1.3828125, "learning_rate": 0.0008736977589523548, "loss": 1.2506, "step": 2753 }, { "epoch": 0.23301463744817666, "grad_norm": 2.0625, "learning_rate": 0.0008736092596294727, "loss": 1.764, "step": 2754 }, { "epoch": 0.2330992469752094, "grad_norm": 1.5546875, "learning_rate": 0.0008735207337974819, "loss": 1.3002, "step": 2755 }, { "epoch": 0.23318385650224216, "grad_norm": 1.5, "learning_rate": 0.0008734321814626634, "loss": 1.2612, "step": 2756 }, { "epoch": 0.2332684660292749, "grad_norm": 2.78125, "learning_rate": 0.0008733436026313006, "loss": 1.4061, "step": 2757 }, { "epoch": 0.23335307555630763, "grad_norm": 1.703125, "learning_rate": 0.0008732549973096786, "loss": 1.2828, "step": 2758 }, { "epoch": 0.2334376850833404, "grad_norm": 1.453125, "learning_rate": 0.0008731663655040841, "loss": 1.2475, "step": 2759 }, { "epoch": 0.23352229461037313, "grad_norm": 1.3359375, "learning_rate": 0.000873077707220806, "loss": 1.0972, "step": 2760 }, { "epoch": 0.23360690413740587, "grad_norm": 1.7578125, "learning_rate": 0.000872989022466135, "loss": 1.1219, "step": 2761 }, { "epoch": 0.2336915136644386, "grad_norm": 1.171875, "learning_rate": 0.0008729003112463633, "loss": 1.0636, "step": 2762 }, { "epoch": 0.23377612319147137, "grad_norm": 1.4609375, "learning_rate": 0.000872811573567786, "loss": 1.4327, "step": 2763 }, { "epoch": 0.2338607327185041, "grad_norm": 1.2265625, "learning_rate": 0.0008727228094366988, "loss": 0.8678, "step": 2764 }, { "epoch": 0.23394534224553684, "grad_norm": 2.203125, "learning_rate": 0.0008726340188594001, "loss": 2.2619, "step": 2765 }, { "epoch": 0.23402995177256958, "grad_norm": 1.4375, "learning_rate": 0.00087254520184219, "loss": 1.2703, "step": 2766 }, { "epoch": 0.23411456129960234, "grad_norm": 2.0625, "learning_rate": 0.0008724563583913703, "loss": 1.5079, "step": 2767 }, { "epoch": 0.23419917082663508, "grad_norm": 2.109375, "learning_rate": 0.0008723674885132449, "loss": 2.199, "step": 2768 }, { "epoch": 0.23428378035366781, "grad_norm": 1.9609375, "learning_rate": 0.0008722785922141196, "loss": 1.8848, "step": 2769 }, { "epoch": 0.23436838988070058, "grad_norm": 1.4765625, "learning_rate": 0.0008721896695003018, "loss": 1.2944, "step": 2770 }, { "epoch": 0.23445299940773331, "grad_norm": 1.78125, "learning_rate": 0.0008721007203781008, "loss": 1.918, "step": 2771 }, { "epoch": 0.23453760893476605, "grad_norm": 1.6171875, "learning_rate": 0.000872011744853828, "loss": 1.1977, "step": 2772 }, { "epoch": 0.2346222184617988, "grad_norm": 2.0, "learning_rate": 0.0008719227429337971, "loss": 1.0736, "step": 2773 }, { "epoch": 0.23470682798883155, "grad_norm": 1.4296875, "learning_rate": 0.0008718337146243223, "loss": 1.146, "step": 2774 }, { "epoch": 0.2347914375158643, "grad_norm": 1.9296875, "learning_rate": 0.000871744659931721, "loss": 1.4926, "step": 2775 }, { "epoch": 0.23487604704289702, "grad_norm": 1.2890625, "learning_rate": 0.0008716555788623119, "loss": 1.2434, "step": 2776 }, { "epoch": 0.2349606565699298, "grad_norm": 1.7734375, "learning_rate": 0.0008715664714224156, "loss": 1.4593, "step": 2777 }, { "epoch": 0.23504526609696252, "grad_norm": 2.1875, "learning_rate": 0.0008714773376183549, "loss": 1.3533, "step": 2778 }, { "epoch": 0.23512987562399526, "grad_norm": 1.8046875, "learning_rate": 0.0008713881774564539, "loss": 1.4273, "step": 2779 }, { "epoch": 0.235214485151028, "grad_norm": 1.9609375, "learning_rate": 0.0008712989909430391, "loss": 2.0826, "step": 2780 }, { "epoch": 0.23529909467806076, "grad_norm": 1.7109375, "learning_rate": 0.0008712097780844384, "loss": 1.7682, "step": 2781 }, { "epoch": 0.2353837042050935, "grad_norm": 1.71875, "learning_rate": 0.0008711205388869821, "loss": 1.2716, "step": 2782 }, { "epoch": 0.23546831373212623, "grad_norm": 1.4921875, "learning_rate": 0.0008710312733570019, "loss": 1.029, "step": 2783 }, { "epoch": 0.23555292325915897, "grad_norm": 1.9453125, "learning_rate": 0.0008709419815008317, "loss": 1.4865, "step": 2784 }, { "epoch": 0.23563753278619173, "grad_norm": 1.890625, "learning_rate": 0.0008708526633248071, "loss": 1.2381, "step": 2785 }, { "epoch": 0.23572214231322447, "grad_norm": 1.59375, "learning_rate": 0.0008707633188352652, "loss": 1.3446, "step": 2786 }, { "epoch": 0.2358067518402572, "grad_norm": 1.46875, "learning_rate": 0.000870673948038546, "loss": 1.4228, "step": 2787 }, { "epoch": 0.23589136136728997, "grad_norm": 1.578125, "learning_rate": 0.0008705845509409903, "loss": 1.1726, "step": 2788 }, { "epoch": 0.2359759708943227, "grad_norm": 1.1796875, "learning_rate": 0.0008704951275489412, "loss": 1.1583, "step": 2789 }, { "epoch": 0.23606058042135544, "grad_norm": 1.265625, "learning_rate": 0.0008704056778687439, "loss": 1.0434, "step": 2790 }, { "epoch": 0.23614518994838818, "grad_norm": 1.4453125, "learning_rate": 0.0008703162019067451, "loss": 1.6138, "step": 2791 }, { "epoch": 0.23622979947542094, "grad_norm": 1.2265625, "learning_rate": 0.0008702266996692933, "loss": 0.9944, "step": 2792 }, { "epoch": 0.23631440900245368, "grad_norm": 2.015625, "learning_rate": 0.0008701371711627395, "loss": 1.5954, "step": 2793 }, { "epoch": 0.23639901852948642, "grad_norm": 2.234375, "learning_rate": 0.0008700476163934356, "loss": 1.5631, "step": 2794 }, { "epoch": 0.23648362805651915, "grad_norm": 1.78125, "learning_rate": 0.0008699580353677361, "loss": 1.5404, "step": 2795 }, { "epoch": 0.23656823758355192, "grad_norm": 1.53125, "learning_rate": 0.0008698684280919973, "loss": 1.1929, "step": 2796 }, { "epoch": 0.23665284711058465, "grad_norm": 1.3671875, "learning_rate": 0.0008697787945725769, "loss": 1.1729, "step": 2797 }, { "epoch": 0.2367374566376174, "grad_norm": 1.3671875, "learning_rate": 0.000869689134815835, "loss": 1.0673, "step": 2798 }, { "epoch": 0.23682206616465015, "grad_norm": 1.875, "learning_rate": 0.0008695994488281332, "loss": 1.5636, "step": 2799 }, { "epoch": 0.2369066756916829, "grad_norm": 1.7109375, "learning_rate": 0.0008695097366158351, "loss": 1.4929, "step": 2800 }, { "epoch": 0.23699128521871562, "grad_norm": 2.171875, "learning_rate": 0.0008694199981853064, "loss": 1.5947, "step": 2801 }, { "epoch": 0.23707589474574836, "grad_norm": 1.7265625, "learning_rate": 0.000869330233542914, "loss": 1.409, "step": 2802 }, { "epoch": 0.23716050427278113, "grad_norm": 1.515625, "learning_rate": 0.0008692404426950273, "loss": 1.252, "step": 2803 }, { "epoch": 0.23724511379981386, "grad_norm": 1.96875, "learning_rate": 0.0008691506256480173, "loss": 1.2529, "step": 2804 }, { "epoch": 0.2373297233268466, "grad_norm": 1.6796875, "learning_rate": 0.0008690607824082568, "loss": 1.206, "step": 2805 }, { "epoch": 0.23741433285387933, "grad_norm": 1.3125, "learning_rate": 0.0008689709129821207, "loss": 1.1905, "step": 2806 }, { "epoch": 0.2374989423809121, "grad_norm": 1.8203125, "learning_rate": 0.0008688810173759856, "loss": 1.8923, "step": 2807 }, { "epoch": 0.23758355190794483, "grad_norm": 1.9296875, "learning_rate": 0.0008687910955962298, "loss": 1.438, "step": 2808 }, { "epoch": 0.23766816143497757, "grad_norm": 1.453125, "learning_rate": 0.0008687011476492338, "loss": 1.2032, "step": 2809 }, { "epoch": 0.23775277096201033, "grad_norm": 1.46875, "learning_rate": 0.0008686111735413795, "loss": 1.2185, "step": 2810 }, { "epoch": 0.23783738048904307, "grad_norm": 1.2734375, "learning_rate": 0.0008685211732790512, "loss": 1.0866, "step": 2811 }, { "epoch": 0.2379219900160758, "grad_norm": 1.453125, "learning_rate": 0.0008684311468686348, "loss": 1.3376, "step": 2812 }, { "epoch": 0.23800659954310854, "grad_norm": 1.9609375, "learning_rate": 0.0008683410943165179, "loss": 1.7454, "step": 2813 }, { "epoch": 0.2380912090701413, "grad_norm": 1.6640625, "learning_rate": 0.0008682510156290901, "loss": 1.6194, "step": 2814 }, { "epoch": 0.23817581859717404, "grad_norm": 1.6015625, "learning_rate": 0.000868160910812743, "loss": 1.1913, "step": 2815 }, { "epoch": 0.23826042812420678, "grad_norm": 1.78125, "learning_rate": 0.0008680707798738699, "loss": 1.4462, "step": 2816 }, { "epoch": 0.23834503765123952, "grad_norm": 1.9921875, "learning_rate": 0.0008679806228188656, "loss": 1.4198, "step": 2817 }, { "epoch": 0.23842964717827228, "grad_norm": 1.2890625, "learning_rate": 0.0008678904396541274, "loss": 1.0139, "step": 2818 }, { "epoch": 0.23851425670530502, "grad_norm": 2.046875, "learning_rate": 0.0008678002303860544, "loss": 1.2707, "step": 2819 }, { "epoch": 0.23859886623233775, "grad_norm": 1.6171875, "learning_rate": 0.0008677099950210467, "loss": 1.7213, "step": 2820 }, { "epoch": 0.23868347575937052, "grad_norm": 1.5, "learning_rate": 0.0008676197335655075, "loss": 1.2266, "step": 2821 }, { "epoch": 0.23876808528640325, "grad_norm": 1.7109375, "learning_rate": 0.000867529446025841, "loss": 1.3053, "step": 2822 }, { "epoch": 0.238852694813436, "grad_norm": 1.1484375, "learning_rate": 0.0008674391324084531, "loss": 1.071, "step": 2823 }, { "epoch": 0.23893730434046873, "grad_norm": 1.5078125, "learning_rate": 0.0008673487927197526, "loss": 1.7749, "step": 2824 }, { "epoch": 0.2390219138675015, "grad_norm": 1.6875, "learning_rate": 0.0008672584269661488, "loss": 1.362, "step": 2825 }, { "epoch": 0.23910652339453423, "grad_norm": 1.4765625, "learning_rate": 0.0008671680351540541, "loss": 1.6291, "step": 2826 }, { "epoch": 0.23919113292156696, "grad_norm": 1.546875, "learning_rate": 0.0008670776172898818, "loss": 1.5056, "step": 2827 }, { "epoch": 0.2392757424485997, "grad_norm": 1.421875, "learning_rate": 0.0008669871733800474, "loss": 1.117, "step": 2828 }, { "epoch": 0.23936035197563246, "grad_norm": 1.1875, "learning_rate": 0.0008668967034309685, "loss": 0.9995, "step": 2829 }, { "epoch": 0.2394449615026652, "grad_norm": 1.59375, "learning_rate": 0.0008668062074490641, "loss": 1.6379, "step": 2830 }, { "epoch": 0.23952957102969794, "grad_norm": 1.84375, "learning_rate": 0.0008667156854407555, "loss": 1.4909, "step": 2831 }, { "epoch": 0.2396141805567307, "grad_norm": 1.6796875, "learning_rate": 0.0008666251374124654, "loss": 1.3236, "step": 2832 }, { "epoch": 0.23969879008376344, "grad_norm": 1.8125, "learning_rate": 0.0008665345633706188, "loss": 1.8935, "step": 2833 }, { "epoch": 0.23978339961079617, "grad_norm": 1.546875, "learning_rate": 0.0008664439633216419, "loss": 1.0412, "step": 2834 }, { "epoch": 0.2398680091378289, "grad_norm": 1.203125, "learning_rate": 0.0008663533372719634, "loss": 1.0011, "step": 2835 }, { "epoch": 0.23995261866486167, "grad_norm": 1.390625, "learning_rate": 0.0008662626852280137, "loss": 1.0947, "step": 2836 }, { "epoch": 0.2400372281918944, "grad_norm": 1.8046875, "learning_rate": 0.0008661720071962246, "loss": 1.4713, "step": 2837 }, { "epoch": 0.24012183771892714, "grad_norm": 1.515625, "learning_rate": 0.0008660813031830303, "loss": 1.3567, "step": 2838 }, { "epoch": 0.2402064472459599, "grad_norm": 1.484375, "learning_rate": 0.0008659905731948668, "loss": 1.5275, "step": 2839 }, { "epoch": 0.24029105677299264, "grad_norm": 1.7109375, "learning_rate": 0.0008658998172381714, "loss": 1.6217, "step": 2840 }, { "epoch": 0.24037566630002538, "grad_norm": 1.6171875, "learning_rate": 0.0008658090353193837, "loss": 1.4286, "step": 2841 }, { "epoch": 0.24046027582705812, "grad_norm": 2.203125, "learning_rate": 0.0008657182274449453, "loss": 1.1282, "step": 2842 }, { "epoch": 0.24054488535409088, "grad_norm": 1.765625, "learning_rate": 0.0008656273936212991, "loss": 1.2086, "step": 2843 }, { "epoch": 0.24062949488112362, "grad_norm": 1.796875, "learning_rate": 0.0008655365338548902, "loss": 1.2861, "step": 2844 }, { "epoch": 0.24071410440815635, "grad_norm": 1.984375, "learning_rate": 0.0008654456481521656, "loss": 1.4803, "step": 2845 }, { "epoch": 0.2407987139351891, "grad_norm": 1.0, "learning_rate": 0.0008653547365195739, "loss": 0.9144, "step": 2846 }, { "epoch": 0.24088332346222185, "grad_norm": 1.4375, "learning_rate": 0.0008652637989635657, "loss": 1.1321, "step": 2847 }, { "epoch": 0.2409679329892546, "grad_norm": 1.0859375, "learning_rate": 0.0008651728354905933, "loss": 0.876, "step": 2848 }, { "epoch": 0.24105254251628733, "grad_norm": 1.953125, "learning_rate": 0.000865081846107111, "loss": 1.8349, "step": 2849 }, { "epoch": 0.2411371520433201, "grad_norm": 1.578125, "learning_rate": 0.0008649908308195749, "loss": 1.6385, "step": 2850 }, { "epoch": 0.24122176157035283, "grad_norm": 1.5, "learning_rate": 0.0008648997896344428, "loss": 0.9535, "step": 2851 }, { "epoch": 0.24130637109738556, "grad_norm": 1.7578125, "learning_rate": 0.0008648087225581745, "loss": 1.2518, "step": 2852 }, { "epoch": 0.2413909806244183, "grad_norm": 1.9140625, "learning_rate": 0.0008647176295972318, "loss": 1.618, "step": 2853 }, { "epoch": 0.24147559015145106, "grad_norm": 1.7265625, "learning_rate": 0.0008646265107580777, "loss": 1.1742, "step": 2854 }, { "epoch": 0.2415601996784838, "grad_norm": 1.359375, "learning_rate": 0.000864535366047178, "loss": 1.1411, "step": 2855 }, { "epoch": 0.24164480920551654, "grad_norm": 1.7890625, "learning_rate": 0.0008644441954709992, "loss": 1.6697, "step": 2856 }, { "epoch": 0.24172941873254927, "grad_norm": 1.7734375, "learning_rate": 0.0008643529990360106, "loss": 1.9578, "step": 2857 }, { "epoch": 0.24181402825958204, "grad_norm": 1.875, "learning_rate": 0.0008642617767486828, "loss": 1.317, "step": 2858 }, { "epoch": 0.24189863778661477, "grad_norm": 1.6953125, "learning_rate": 0.0008641705286154884, "loss": 1.8751, "step": 2859 }, { "epoch": 0.2419832473136475, "grad_norm": 1.796875, "learning_rate": 0.0008640792546429019, "loss": 1.374, "step": 2860 }, { "epoch": 0.24206785684068027, "grad_norm": 2.15625, "learning_rate": 0.0008639879548373997, "loss": 1.8236, "step": 2861 }, { "epoch": 0.242152466367713, "grad_norm": 1.5390625, "learning_rate": 0.0008638966292054597, "loss": 1.5013, "step": 2862 }, { "epoch": 0.24223707589474575, "grad_norm": 1.5703125, "learning_rate": 0.0008638052777535617, "loss": 1.0258, "step": 2863 }, { "epoch": 0.24232168542177848, "grad_norm": 1.671875, "learning_rate": 0.0008637139004881878, "loss": 1.6959, "step": 2864 }, { "epoch": 0.24240629494881125, "grad_norm": 1.6171875, "learning_rate": 0.0008636224974158214, "loss": 1.1078, "step": 2865 }, { "epoch": 0.24249090447584398, "grad_norm": 1.640625, "learning_rate": 0.000863531068542948, "loss": 1.4919, "step": 2866 }, { "epoch": 0.24257551400287672, "grad_norm": 1.2890625, "learning_rate": 0.0008634396138760546, "loss": 1.1175, "step": 2867 }, { "epoch": 0.24266012352990945, "grad_norm": 1.84375, "learning_rate": 0.0008633481334216306, "loss": 1.5178, "step": 2868 }, { "epoch": 0.24274473305694222, "grad_norm": 1.5625, "learning_rate": 0.0008632566271861668, "loss": 1.7322, "step": 2869 }, { "epoch": 0.24282934258397496, "grad_norm": 2.125, "learning_rate": 0.000863165095176156, "loss": 1.3605, "step": 2870 }, { "epoch": 0.2429139521110077, "grad_norm": 1.890625, "learning_rate": 0.0008630735373980925, "loss": 1.3433, "step": 2871 }, { "epoch": 0.24299856163804046, "grad_norm": 1.5703125, "learning_rate": 0.0008629819538584731, "loss": 1.3504, "step": 2872 }, { "epoch": 0.2430831711650732, "grad_norm": 2.203125, "learning_rate": 0.0008628903445637956, "loss": 1.2478, "step": 2873 }, { "epoch": 0.24316778069210593, "grad_norm": 1.8125, "learning_rate": 0.0008627987095205606, "loss": 1.2672, "step": 2874 }, { "epoch": 0.24325239021913866, "grad_norm": 1.6171875, "learning_rate": 0.0008627070487352695, "loss": 1.2092, "step": 2875 }, { "epoch": 0.24333699974617143, "grad_norm": 1.6640625, "learning_rate": 0.0008626153622144263, "loss": 1.2462, "step": 2876 }, { "epoch": 0.24342160927320416, "grad_norm": 1.5703125, "learning_rate": 0.0008625236499645362, "loss": 1.2356, "step": 2877 }, { "epoch": 0.2435062188002369, "grad_norm": 1.3046875, "learning_rate": 0.0008624319119921069, "loss": 1.0251, "step": 2878 }, { "epoch": 0.24359082832726964, "grad_norm": 1.890625, "learning_rate": 0.0008623401483036475, "loss": 1.3778, "step": 2879 }, { "epoch": 0.2436754378543024, "grad_norm": 1.59375, "learning_rate": 0.0008622483589056689, "loss": 1.1657, "step": 2880 }, { "epoch": 0.24376004738133514, "grad_norm": 1.7734375, "learning_rate": 0.0008621565438046841, "loss": 1.4953, "step": 2881 }, { "epoch": 0.24384465690836787, "grad_norm": 1.546875, "learning_rate": 0.0008620647030072075, "loss": 1.3632, "step": 2882 }, { "epoch": 0.24392926643540064, "grad_norm": 1.5546875, "learning_rate": 0.0008619728365197559, "loss": 1.2188, "step": 2883 }, { "epoch": 0.24401387596243337, "grad_norm": 1.5546875, "learning_rate": 0.0008618809443488474, "loss": 1.4056, "step": 2884 }, { "epoch": 0.2440984854894661, "grad_norm": 1.515625, "learning_rate": 0.0008617890265010022, "loss": 1.2619, "step": 2885 }, { "epoch": 0.24418309501649885, "grad_norm": 2.09375, "learning_rate": 0.0008616970829827422, "loss": 2.0393, "step": 2886 }, { "epoch": 0.2442677045435316, "grad_norm": 1.8125, "learning_rate": 0.0008616051138005911, "loss": 1.6146, "step": 2887 }, { "epoch": 0.24435231407056435, "grad_norm": 1.28125, "learning_rate": 0.0008615131189610747, "loss": 1.0625, "step": 2888 }, { "epoch": 0.24443692359759708, "grad_norm": 1.6484375, "learning_rate": 0.0008614210984707202, "loss": 1.14, "step": 2889 }, { "epoch": 0.24452153312462982, "grad_norm": 1.6953125, "learning_rate": 0.0008613290523360571, "loss": 1.6951, "step": 2890 }, { "epoch": 0.24460614265166258, "grad_norm": 1.2890625, "learning_rate": 0.000861236980563616, "loss": 1.0132, "step": 2891 }, { "epoch": 0.24469075217869532, "grad_norm": 1.5234375, "learning_rate": 0.0008611448831599303, "loss": 1.1349, "step": 2892 }, { "epoch": 0.24477536170572806, "grad_norm": 1.8359375, "learning_rate": 0.0008610527601315343, "loss": 1.5404, "step": 2893 }, { "epoch": 0.24485997123276082, "grad_norm": 1.84375, "learning_rate": 0.0008609606114849646, "loss": 1.683, "step": 2894 }, { "epoch": 0.24494458075979356, "grad_norm": 1.4140625, "learning_rate": 0.0008608684372267597, "loss": 1.4211, "step": 2895 }, { "epoch": 0.2450291902868263, "grad_norm": 1.6796875, "learning_rate": 0.0008607762373634596, "loss": 1.4385, "step": 2896 }, { "epoch": 0.24511379981385903, "grad_norm": 1.6875, "learning_rate": 0.0008606840119016061, "loss": 1.6186, "step": 2897 }, { "epoch": 0.2451984093408918, "grad_norm": 1.890625, "learning_rate": 0.0008605917608477433, "loss": 1.9468, "step": 2898 }, { "epoch": 0.24528301886792453, "grad_norm": 1.984375, "learning_rate": 0.0008604994842084166, "loss": 1.2648, "step": 2899 }, { "epoch": 0.24536762839495727, "grad_norm": 1.4375, "learning_rate": 0.0008604071819901734, "loss": 1.0198, "step": 2900 }, { "epoch": 0.24545223792199003, "grad_norm": 1.7109375, "learning_rate": 0.0008603148541995629, "loss": 1.302, "step": 2901 }, { "epoch": 0.24553684744902277, "grad_norm": 1.546875, "learning_rate": 0.0008602225008431363, "loss": 1.9302, "step": 2902 }, { "epoch": 0.2456214569760555, "grad_norm": 1.8515625, "learning_rate": 0.0008601301219274462, "loss": 1.9246, "step": 2903 }, { "epoch": 0.24570606650308824, "grad_norm": 1.609375, "learning_rate": 0.0008600377174590475, "loss": 2.0845, "step": 2904 }, { "epoch": 0.245790676030121, "grad_norm": 1.546875, "learning_rate": 0.0008599452874444967, "loss": 1.1711, "step": 2905 }, { "epoch": 0.24587528555715374, "grad_norm": 2.484375, "learning_rate": 0.0008598528318903517, "loss": 1.8542, "step": 2906 }, { "epoch": 0.24595989508418648, "grad_norm": 1.40625, "learning_rate": 0.0008597603508031731, "loss": 1.1227, "step": 2907 }, { "epoch": 0.2460445046112192, "grad_norm": 1.5703125, "learning_rate": 0.0008596678441895224, "loss": 1.5422, "step": 2908 }, { "epoch": 0.24612911413825198, "grad_norm": 1.6015625, "learning_rate": 0.0008595753120559635, "loss": 1.4239, "step": 2909 }, { "epoch": 0.2462137236652847, "grad_norm": 1.5546875, "learning_rate": 0.0008594827544090621, "loss": 1.1886, "step": 2910 }, { "epoch": 0.24629833319231745, "grad_norm": 1.3515625, "learning_rate": 0.0008593901712553853, "loss": 1.0686, "step": 2911 }, { "epoch": 0.2463829427193502, "grad_norm": 1.75, "learning_rate": 0.0008592975626015024, "loss": 1.0961, "step": 2912 }, { "epoch": 0.24646755224638295, "grad_norm": 1.8828125, "learning_rate": 0.0008592049284539843, "loss": 1.3186, "step": 2913 }, { "epoch": 0.24655216177341568, "grad_norm": 1.6328125, "learning_rate": 0.0008591122688194037, "loss": 1.0713, "step": 2914 }, { "epoch": 0.24663677130044842, "grad_norm": 2.21875, "learning_rate": 0.0008590195837043352, "loss": 1.5938, "step": 2915 }, { "epoch": 0.24672138082748118, "grad_norm": 8.25, "learning_rate": 0.0008589268731153554, "loss": 1.6384, "step": 2916 }, { "epoch": 0.24680599035451392, "grad_norm": 1.8046875, "learning_rate": 0.0008588341370590422, "loss": 1.1262, "step": 2917 }, { "epoch": 0.24689059988154666, "grad_norm": 2.125, "learning_rate": 0.0008587413755419758, "loss": 1.8849, "step": 2918 }, { "epoch": 0.2469752094085794, "grad_norm": 1.484375, "learning_rate": 0.0008586485885707379, "loss": 0.9423, "step": 2919 }, { "epoch": 0.24705981893561216, "grad_norm": 2.109375, "learning_rate": 0.0008585557761519123, "loss": 1.833, "step": 2920 }, { "epoch": 0.2471444284626449, "grad_norm": 1.609375, "learning_rate": 0.0008584629382920843, "loss": 1.2429, "step": 2921 }, { "epoch": 0.24722903798967763, "grad_norm": 1.453125, "learning_rate": 0.000858370074997841, "loss": 1.1259, "step": 2922 }, { "epoch": 0.2473136475167104, "grad_norm": 1.421875, "learning_rate": 0.0008582771862757715, "loss": 1.0183, "step": 2923 }, { "epoch": 0.24739825704374313, "grad_norm": 1.4765625, "learning_rate": 0.0008581842721324667, "loss": 1.1199, "step": 2924 }, { "epoch": 0.24748286657077587, "grad_norm": 1.6796875, "learning_rate": 0.0008580913325745194, "loss": 1.288, "step": 2925 }, { "epoch": 0.2475674760978086, "grad_norm": 1.5, "learning_rate": 0.0008579983676085237, "loss": 1.3318, "step": 2926 }, { "epoch": 0.24765208562484137, "grad_norm": 1.453125, "learning_rate": 0.0008579053772410761, "loss": 1.3879, "step": 2927 }, { "epoch": 0.2477366951518741, "grad_norm": 1.4921875, "learning_rate": 0.0008578123614787745, "loss": 1.2309, "step": 2928 }, { "epoch": 0.24782130467890684, "grad_norm": 1.859375, "learning_rate": 0.0008577193203282189, "loss": 1.748, "step": 2929 }, { "epoch": 0.24790591420593958, "grad_norm": 1.2890625, "learning_rate": 0.0008576262537960107, "loss": 1.4697, "step": 2930 }, { "epoch": 0.24799052373297234, "grad_norm": 1.4375, "learning_rate": 0.0008575331618887537, "loss": 1.1713, "step": 2931 }, { "epoch": 0.24807513326000508, "grad_norm": 1.4765625, "learning_rate": 0.0008574400446130528, "loss": 1.3693, "step": 2932 }, { "epoch": 0.2481597427870378, "grad_norm": 1.40625, "learning_rate": 0.0008573469019755154, "loss": 1.2322, "step": 2933 }, { "epoch": 0.24824435231407058, "grad_norm": 1.828125, "learning_rate": 0.00085725373398275, "loss": 1.2347, "step": 2934 }, { "epoch": 0.2483289618411033, "grad_norm": 1.65625, "learning_rate": 0.0008571605406413674, "loss": 1.4775, "step": 2935 }, { "epoch": 0.24841357136813605, "grad_norm": 1.359375, "learning_rate": 0.0008570673219579802, "loss": 1.3701, "step": 2936 }, { "epoch": 0.24849818089516879, "grad_norm": 1.5859375, "learning_rate": 0.0008569740779392024, "loss": 1.9535, "step": 2937 }, { "epoch": 0.24858279042220155, "grad_norm": 1.46875, "learning_rate": 0.0008568808085916503, "loss": 1.3873, "step": 2938 }, { "epoch": 0.24866739994923429, "grad_norm": 1.6953125, "learning_rate": 0.0008567875139219416, "loss": 1.775, "step": 2939 }, { "epoch": 0.24875200947626702, "grad_norm": 1.578125, "learning_rate": 0.0008566941939366959, "loss": 1.3409, "step": 2940 }, { "epoch": 0.24883661900329976, "grad_norm": 1.28125, "learning_rate": 0.0008566008486425345, "loss": 1.1273, "step": 2941 }, { "epoch": 0.24892122853033252, "grad_norm": 1.203125, "learning_rate": 0.0008565074780460811, "loss": 1.1493, "step": 2942 }, { "epoch": 0.24900583805736526, "grad_norm": 2.078125, "learning_rate": 0.0008564140821539603, "loss": 1.844, "step": 2943 }, { "epoch": 0.249090447584398, "grad_norm": 1.7578125, "learning_rate": 0.0008563206609727991, "loss": 1.4655, "step": 2944 }, { "epoch": 0.24917505711143076, "grad_norm": 2.78125, "learning_rate": 0.000856227214509226, "loss": 1.8247, "step": 2945 }, { "epoch": 0.2492596666384635, "grad_norm": 1.375, "learning_rate": 0.0008561337427698717, "loss": 1.1893, "step": 2946 }, { "epoch": 0.24934427616549623, "grad_norm": 1.375, "learning_rate": 0.0008560402457613678, "loss": 1.6554, "step": 2947 }, { "epoch": 0.24942888569252897, "grad_norm": 1.8828125, "learning_rate": 0.0008559467234903491, "loss": 1.199, "step": 2948 }, { "epoch": 0.24951349521956173, "grad_norm": 1.3984375, "learning_rate": 0.0008558531759634507, "loss": 1.1253, "step": 2949 }, { "epoch": 0.24959810474659447, "grad_norm": 1.484375, "learning_rate": 0.0008557596031873106, "loss": 1.1262, "step": 2950 }, { "epoch": 0.2496827142736272, "grad_norm": 1.875, "learning_rate": 0.0008556660051685679, "loss": 1.6585, "step": 2951 }, { "epoch": 0.24976732380065994, "grad_norm": 1.5234375, "learning_rate": 0.0008555723819138641, "loss": 1.2208, "step": 2952 }, { "epoch": 0.2498519333276927, "grad_norm": 1.7109375, "learning_rate": 0.0008554787334298417, "loss": 1.184, "step": 2953 }, { "epoch": 0.24993654285472544, "grad_norm": 1.5390625, "learning_rate": 0.0008553850597231459, "loss": 1.4987, "step": 2954 }, { "epoch": 0.2500211523817582, "grad_norm": 1.8515625, "learning_rate": 0.0008552913608004228, "loss": 1.5925, "step": 2955 }, { "epoch": 0.2500211523817582, "eval_loss": 1.3909703493118286, "eval_runtime": 387.925, "eval_samples_per_second": 48.749, "eval_steps_per_second": 6.094, "step": 2955 } ], "logging_steps": 1, "max_steps": 11819, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2955, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.881990609215488e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }